Beispiel #1
0
def _scrape_media(url,
                  autoplay=False,
                  maxwidth=600,
                  force=False,
                  use_cache=False,
                  max_cache_age=None):
    """Return the Media for `url`, scraping it unless a cached copy is usable.

    `autoplay` and `maxwidth` are coerced to bool and int and are passed
    along with every MediaByURL call. When `use_cache` is true and `force`
    is false, MediaByURL.get (bounded by `max_cache_age`) is tried first; a
    falsy result falls through to a fresh scrape.

    Returns None when the scraper raises HTTPError or URLError (the error is
    also recorded through MediaByURL.add_error if `use_cache` is set);
    otherwise returns the cached or newly built Media.
    """
    autoplay = bool(autoplay)
    maxwidth = int(maxwidth)
    # Keyword arguments shared by every MediaByURL call below.
    variant = dict(autoplay=autoplay, maxwidth=maxwidth)

    # Try the cache first, unless the caller forces a fresh scrape.
    media = None
    if use_cache and not force:
        cached_entry = MediaByURL.get(url,
                                      max_cache_age=max_cache_age,
                                      **variant)
        if cached_entry:
            media = cached_entry.media

    # Nothing usable in the cache: scrape the url.
    if not media:
        scraper = Scraper.for_url(url, autoplay=autoplay)
        try:
            thumbnail_image, media_object, secure_media_object = (
                scraper.scrape())
        except (HTTPError, URLError) as e:
            if use_cache:
                MediaByURL.add_error(url, str(e), **variant)
            return None

        # A media object the scraper cannot turn into an embed is dropped
        # here so that downstream code never sees it.
        if media_object:
            if not scraper.media_embed(media_object):
                print("%s made a bad media obj for url %s" % (scraper, url))
                media_object = None

        if secure_media_object:
            if not scraper.media_embed(secure_media_object):
                print("%s made a bad secure media obj for url %s" % (scraper, url))
                secure_media_object = None

        thumbnail_url = thumbnail_size = None
        if thumbnail_image:
            thumbnail_size = thumbnail_image.size
            thumbnail_url = upload_media(thumbnail_image)

        media = Media(media_object, secure_media_object, thumbnail_url,
                      thumbnail_size)

    # Write (or re-write) the cache entry, possibly extending its ttl;
    # the ERROR_MEDIA sentinel is never stored this way.
    if use_cache and media is not ERROR_MEDIA:
        MediaByURL.add(url, media, **variant)

    return media
Beispiel #2
0
def _scrape_media(url, autoplay=False, maxwidth=600, force=False,
                  save_thumbnail=True, use_cache=False, max_cache_age=None):
    """Return the Media for `url`, scraping it unless a cached copy is usable.

    `autoplay` and `maxwidth` are coerced to bool and int and are passed
    along with every MediaByURL call. When `use_cache` is true and `force`
    is false, MediaByURL.get (bounded by `max_cache_age`) is tried first; a
    falsy result falls through to a fresh scrape.

    With `save_thumbnail` false the scraped thumbnail is not uploaded and
    the result is not written to the cache.

    Returns None when the scraper raises HTTPError or URLError (the error is
    also recorded through MediaByURL.add_error if `use_cache` is set);
    otherwise returns the cached or newly built Media.
    """
    autoplay = bool(autoplay)
    maxwidth = int(maxwidth)
    # Keyword arguments shared by every MediaByURL call below.
    variant = dict(autoplay=autoplay, maxwidth=maxwidth)

    # Try the cache first, unless the caller forces a fresh scrape.
    media = None
    if use_cache and not force:
        cached_entry = MediaByURL.get(url,
                                      max_cache_age=max_cache_age,
                                      **variant)
        if cached_entry:
            media = cached_entry.media

    # Nothing usable in the cache: scrape the url.
    if not media:
        scraper = Scraper.for_url(url, autoplay=autoplay)
        try:
            thumbnail_image, media_object, secure_media_object = (
                scraper.scrape())
        except (HTTPError, URLError) as e:
            if use_cache:
                MediaByURL.add_error(url, str(e), **variant)
            return None

        # A media object the scraper cannot turn into an embed is dropped
        # here so that downstream code never sees it.
        if media_object:
            if not scraper.media_embed(media_object):
                print("%s made a bad media obj for url %s" % (scraper, url))
                media_object = None

        if secure_media_object:
            if not scraper.media_embed(secure_media_object):
                print("%s made a bad secure media obj for url %s" % (scraper, url))
                secure_media_object = None

        thumbnail_url = thumbnail_size = None
        if save_thumbnail and thumbnail_image:
            thumbnail_size = thumbnail_image.size
            thumbnail_url = upload_media(thumbnail_image)

        media = Media(media_object, secure_media_object,
                      thumbnail_url, thumbnail_size)

    # Write (or re-write) the cache entry, possibly extending its ttl.
    # A scrape without save_thumbnail is partial and is not cached, and
    # the ERROR_MEDIA sentinel is never stored this way.
    if use_cache and save_thumbnail and media is not ERROR_MEDIA:
        MediaByURL.add(url, media, **variant)

    return media
Beispiel #3
0
def _scrape_media(url, autoplay=False, maxwidth=600, force=False,
                  save_thumbnail=True, use_cache=False, max_cache_age=None,
                  use_youtube_scraper=False):
    """Return the Media for `url`, scraping when the cache cannot serve it.

    `autoplay` and `maxwidth` are coerced to bool and int and are passed
    along with every MediaByURL call. When `use_cache` is true and `force`
    is false, MediaByURL.get (bounded by `max_cache_age`) is tried first.
    A cached entry is only used as-is if it has a thumbnail_url; otherwise
    the url is scraped (again) with the scraper chosen by Scraper.for_url,
    which also receives `use_youtube_scraper`.

    If that scrape yields no thumbnail, a second attempt is made with
    _ThumbnailOnlyScraper. The result is written to the cache only when a
    thumbnail was found and uploaded (which requires `save_thumbnail`).

    Returns None when the first scrape raises HTTPError or URLError (the
    error is also recorded through MediaByURL.add_error if `use_cache` is
    set); otherwise returns the cached or newly built Media.
    """
    autoplay = bool(autoplay)
    maxwidth = int(maxwidth)
    # Keyword arguments shared by every MediaByURL call below.
    variant = dict(autoplay=autoplay, maxwidth=maxwidth)

    # Try the cache first, unless the caller forces a fresh scrape.
    media = None
    if use_cache and not force:
        cached_entry = MediaByURL.get(url,
                                      max_cache_age=max_cache_age,
                                      **variant)
        if cached_entry:
            media = cached_entry.media

    # Scrape when there is no cached media or it lacks a thumbnail.
    if not (media and media.thumbnail_url):
        scraper = Scraper.for_url(url, autoplay=autoplay,
                                  use_youtube_scraper=use_youtube_scraper)
        try:
            (thumbnail_image, preview_object, media_object,
             secure_media_object) = scraper.scrape()
        except (HTTPError, URLError) as e:
            if use_cache:
                MediaByURL.add_error(url, str(e), **variant)
            return None

        # A media object the scraper cannot turn into an embed is dropped
        # here so that downstream code never sees it.
        if media_object:
            if not scraper.media_embed(media_object):
                print("%s made a bad media obj for url %s" % (scraper, url))
                media_object = None

        if secure_media_object:
            if not scraper.media_embed(secure_media_object):
                print("%s made a bad secure media obj for url %s" % (scraper, url))
                secure_media_object = None

        # No thumbnail yet: retry once with _ThumbnailOnlyScraper. Per the
        # original author's note this works around embed.ly caching links
        # before their thumbnail is available.
        if not (thumbnail_image or
                isinstance(scraper, _ThumbnailOnlyScraper)):
            scraper = _ThumbnailOnlyScraper(url)
            try:
                (thumbnail_image, preview_object,
                 _ignored_media, _ignored_secure) = scraper.scrape()
            except (HTTPError, URLError):
                # Keep what the first scrape produced, but don't cache it.
                use_cache = False

        thumbnail_url = thumbnail_size = None
        if thumbnail_image and save_thumbnail:
            thumbnail_size = thumbnail_image.size
            thumbnail_url = upload_media(thumbnail_image)
        else:
            # Without an uploaded thumbnail the result is not cached.
            use_cache = False

        media = Media(media_object, secure_media_object, preview_object,
                      thumbnail_url, thumbnail_size)

    # Write (or re-write) the cache entry, possibly extending its ttl;
    # the ERROR_MEDIA sentinel is never stored this way.
    if use_cache and save_thumbnail and media is not ERROR_MEDIA:
        MediaByURL.add(url, media, **variant)

    return media
Beispiel #4
0
def _scrape_media(url, autoplay=False, maxwidth=600, force=False,
                  save_thumbnail=True, use_cache=False, max_cache_age=None):
    """Return the Media for `url`, scraping it unless a cached copy is usable.

    `autoplay` and `maxwidth` are coerced to bool and int and are passed
    along with every MediaByURL call. When `use_cache` is true and `force`
    is false, MediaByURL.get (bounded by `max_cache_age`) is tried first; a
    falsy result falls through to a fresh scrape.

    If the scrape yields no thumbnail, a second attempt is made with
    _ThumbnailOnlyScraper. A freshly scraped result is written to the cache
    only when a thumbnail was found and uploaded (which requires
    `save_thumbnail`).

    Returns None when the first scrape raises HTTPError or URLError (the
    error is also recorded through MediaByURL.add_error if `use_cache` is
    set); otherwise returns the cached or newly built Media.
    """
    autoplay = bool(autoplay)
    maxwidth = int(maxwidth)
    # Keyword arguments shared by every MediaByURL call below.
    variant = dict(autoplay=autoplay, maxwidth=maxwidth)

    # Try the cache first, unless the caller forces a fresh scrape.
    media = None
    if use_cache and not force:
        cached_entry = MediaByURL.get(url,
                                      max_cache_age=max_cache_age,
                                      **variant)
        if cached_entry:
            media = cached_entry.media

    # Nothing usable in the cache: scrape the url.
    if not media:
        scraper = Scraper.for_url(url, autoplay=autoplay)
        try:
            (thumbnail_image, preview_object, media_object,
             secure_media_object) = scraper.scrape()
        except (HTTPError, URLError) as e:
            if use_cache:
                MediaByURL.add_error(url, str(e), **variant)
            return None

        # A media object the scraper cannot turn into an embed is dropped
        # here so that downstream code never sees it.
        if media_object:
            if not scraper.media_embed(media_object):
                print("%s made a bad media obj for url %s" % (scraper, url))
                media_object = None

        if secure_media_object:
            if not scraper.media_embed(secure_media_object):
                print("%s made a bad secure media obj for url %s" % (scraper, url))
                secure_media_object = None

        # No thumbnail yet: retry once with _ThumbnailOnlyScraper. Per the
        # original author's note this works around embed.ly caching links
        # before their thumbnail is available.
        if not (thumbnail_image or
                isinstance(scraper, _ThumbnailOnlyScraper)):
            scraper = _ThumbnailOnlyScraper(url)
            try:
                (thumbnail_image, preview_object,
                 _ignored_media, _ignored_secure) = scraper.scrape()
            except (HTTPError, URLError):
                # Keep what the first scrape produced, but don't cache it.
                use_cache = False

        thumbnail_url = thumbnail_size = None
        if thumbnail_image and save_thumbnail:
            thumbnail_size = thumbnail_image.size
            thumbnail_url = upload_media(thumbnail_image)
        else:
            # Without an uploaded thumbnail the result is not cached.
            use_cache = False

        media = Media(media_object, secure_media_object, preview_object,
                      thumbnail_url, thumbnail_size)

    # Write (or re-write) the cache entry, possibly extending its ttl;
    # the ERROR_MEDIA sentinel is never stored this way.
    if use_cache and save_thumbnail and media is not ERROR_MEDIA:
        MediaByURL.add(url, media, **variant)

    return media