Ejemplo n.º 1
0
    def get_thumbnail(self, image_data=None):
        """
        Return an open file for this object's thumbnail, generating it if needed.

        If ``image_data`` is given it is used as the source image. Otherwise the
        source is replayed from the successful screenshot capture or, failing
        that, from the first capture whose content type starts with
        ``application/pdf``. The source is scaled with
        ``transform(resize='600')``, composited onto a 600x600 canvas and stored
        as ``thumbnail.png`` under ``settings.THUMBNAIL_STORAGE_PATH``.

        ``thumbnail_status`` is saved as 'generating' before a capture is
        replayed, 'generated' on success, and 'failed' when no source image is
        available or an exception is raised.

        :param image_data: optional raw image bytes, passed to ``Image(blob=...)``.
        :return: the result of ``default_storage.open`` for the thumbnail, or
            None when the status is already 'failed'/'generating' or when
            generation fails.
        """
        # A previous attempt failed, or one is in flight: do not start another.
        if self.thumbnail_status == 'failed' or self.thumbnail_status == 'generating':
            return None

        thumbnail_path = os.path.join(settings.THUMBNAIL_STORAGE_PATH, self.guid_as_path(), 'thumbnail.png')

        if self.thumbnail_status == 'generated' and default_storage.exists(thumbnail_path):
            return default_storage.open(thumbnail_path)

        # Tracked outside the try block so the finally clause can always close
        # (and thereby delete) the temporary copy of the replayed capture.
        temp_file = None
        try:

            warc_url = None
            image = None

            if image_data:
                image = Image(blob=image_data)
            else:

                if self.screenshot_capture and self.screenshot_capture.status == 'success':
                    warc_url = self.screenshot_capture.url
                else:
                    pdf_capture = self.captures.filter(content_type__startswith='application/pdf').first()
                    if pdf_capture:
                        warc_url = pdf_capture.url

                if warc_url:
                    self.thumbnail_status = 'generating'
                    self.save(update_fields=['thumbnail_status'])

                    headers, data = self.replay_url(warc_url)
                    # The temp file gets the URL's trailing ".ext" as its suffix.
                    temp_file = tempfile.NamedTemporaryFile(suffix='.' + warc_url.rsplit('.', 1)[-1])
                    for chunk in data:
                        temp_file.write(chunk)
                    temp_file.flush()
                    image = Image(filename=temp_file.name + "[0]")  # [0] limits ImageMagick to first page of PDF

            if image:
                with imagemagick_temp_dir():
                    with image as opened_img:
                        opened_img.transform(resize='600')
                        with Image(width=600, height=600) as dst_image:
                            dst_image.composite(opened_img, 0, 0)
                            dst_image.compression_quality = 60
                            default_storage.store_data_to_file(dst_image.make_blob('png'), thumbnail_path, overwrite=True)

                self.thumbnail_status = 'generated'
                self.save(update_fields=['thumbnail_status'])

                return default_storage.open(thumbnail_path)

        except Exception as e:
            # Best effort: report the problem and fall through to mark failure.
            print("Thumbnail generation failed for %s: %s" % (self.guid, e))
        finally:
            if temp_file is not None:
                temp_file.close()

        # Reached when no source image was found or generation raised.
        self.thumbnail_status = 'failed'
        self.save(update_fields=['thumbnail_status'])
        return None
Ejemplo n.º 2
0
def get_pdf(self, link_guid, target_url, base_storage_path, user_agent):
    """
    Download a PDF from the network, store it, and save its first page as a PNG.

    This function is executed via an asynchronous Celery call.

    :param link_guid: matched against ``Asset.link_id`` to find the asset to update.
    :param target_url: URL the PDF is downloaded from.
    :param base_storage_path: storage directory that receives ``cap.pdf`` and ``cap.png``.
    :param user_agent: value sent as the ``User-Agent`` request header.
    :return: None. Outcomes are recorded on the asset through ``save_fields``:
        ``pdf_capture``/``image_capture`` hold the stored file names, or
        ``Asset.CAPTURE_STATUS_FAILED`` when the download is too large or the
        image cannot be created.
    """

    # basic setup
    asset = Asset.objects.get(link_id=link_guid)
    pdf_name = 'cap.pdf'
    pdf_path = os.path.join(base_storage_path, pdf_name)

    # Get the PDF from the network
    # NOTE(review): verify=False turns off TLS certificate verification --
    # presumably deliberate so sites with bad certificates can be captured; confirm.
    pdf_request = requests.get(target_url,
                               stream=True,
                               verify=False,
                               headers={'User-Agent': user_agent})

    try:
        # The context manager closes (and deletes) the temp file on every exit
        # path, including the early return for oversized downloads.
        with tempfile.NamedTemporaryFile() as temp:
            # write PDF out to a temp file
            for chunk in pdf_request.iter_content(chunk_size=1024):

                if chunk:  # filter out keep-alive new chunks
                    temp.write(chunk)

                # Limit our filesize
                if temp.tell() > settings.MAX_ARCHIVE_FILE_SIZE:
                    logger.info("PDF capture too big, %s" % target_url)
                    save_fields(asset,
                                pdf_capture=Asset.CAPTURE_STATUS_FAILED,
                                image_capture=Asset.CAPTURE_STATUS_FAILED)
                    return

            # Flush once so the bytes are on disk before ImageMagick opens the
            # file by name below.
            temp.flush()

            # store temp file
            temp.seek(0)
            pdf_name = default_storage.store_file(temp, pdf_path, overwrite=True)
            save_fields(asset, pdf_capture=pdf_name)

            # Get first page of the PDF and create an image from it
            # Save it to disk as our image capture (likely a temporary measure)
            # The [0] in the filename gets passed through to ImageMagick and limits PDFs to the first page.
            try:
                with imagemagick_temp_dir():
                    with Image(filename=temp.name + "[0]") as img:
                        image_name = 'cap.png'
                        image_path = os.path.join(base_storage_path, image_name)
                        default_storage.store_data_to_file(img.make_blob('png'),
                                                           image_path,
                                                           overwrite=True)
                        save_fields(asset, image_capture=image_name)
            except Exception as e:
                # errors with the thumbnail aren't dealbreakers -- just log here
                logger.warning("Error creating PDF thumbnail of %s: %s" % (target_url, e))
                save_fields(asset, image_capture=Asset.CAPTURE_STATUS_FAILED)
    finally:
        # A streamed response holds its connection until it is closed.
        pdf_request.close()
Ejemplo n.º 3
0
def get_pdf(self, link_guid, target_url, base_storage_path, user_agent):
    """
    Download a PDF from the network, store it, and save its first page as a PNG.

    This function is executed via an asynchronous Celery call.

    :param link_guid: matched against ``Asset.link_id`` to find the asset to update.
    :param target_url: URL the PDF is downloaded from.
    :param base_storage_path: storage directory that receives ``cap.pdf`` and ``cap.png``.
    :param user_agent: value sent as the ``User-Agent`` request header.
    :return: None. Outcomes are recorded on the asset through ``save_fields``:
        ``pdf_capture``/``image_capture`` hold the stored file names, or
        ``Asset.CAPTURE_STATUS_FAILED`` when the download is too large or the
        image cannot be created.
    """

    # basic setup
    asset = Asset.objects.get(link_id=link_guid)
    pdf_name = 'cap.pdf'
    pdf_path = os.path.join(base_storage_path, pdf_name)

    # Get the PDF from the network
    # NOTE(review): verify=False turns off TLS certificate verification --
    # presumably deliberate so sites with bad certificates can be captured; confirm.
    pdf_request = requests.get(target_url, stream=True, verify=False,
        headers={'User-Agent': user_agent})

    try:
        # The context manager closes (and deletes) the temp file on every exit
        # path, including the early return for oversized downloads.
        with tempfile.NamedTemporaryFile() as temp:
            # write PDF out to a temp file
            for chunk in pdf_request.iter_content(chunk_size=1024):

                if chunk: # filter out keep-alive new chunks
                    temp.write(chunk)

                # Limit our filesize
                if temp.tell() > settings.MAX_ARCHIVE_FILE_SIZE:
                    logger.info("PDF capture too big, %s" % target_url)
                    save_fields(asset,
                                pdf_capture=Asset.CAPTURE_STATUS_FAILED,
                                image_capture=Asset.CAPTURE_STATUS_FAILED)
                    return

            # Flush once so the bytes are on disk before ImageMagick opens the
            # file by name below.
            temp.flush()

            # store temp file
            temp.seek(0)
            pdf_name = default_storage.store_file(temp, pdf_path, overwrite=True)
            save_fields(asset, pdf_capture=pdf_name)

            # Get first page of the PDF and create an image from it
            # Save it to disk as our image capture (likely a temporary measure)
            # The [0] in the filename gets passed through to ImageMagick and limits PDFs to the first page.
            try:
                with imagemagick_temp_dir():
                    with Image(filename=temp.name+"[0]") as img:
                        image_name = 'cap.png'
                        image_path = os.path.join(base_storage_path, image_name)
                        default_storage.store_data_to_file(img.make_blob('png'), image_path, overwrite=True)
                        save_fields(asset, image_capture=image_name)
            except Exception as e:
                # errors with the thumbnail aren't dealbreakers -- just log here
                logger.warning("Error creating PDF thumbnail of %s: %s" % (target_url, e))
                save_fields(asset, image_capture=Asset.CAPTURE_STATUS_FAILED)
    finally:
        # A streamed response holds its connection until it is closed.
        pdf_request.close()
Ejemplo n.º 4
0
 def favicon_thread():
     """
     Fetch the favicon and store it alongside the capture.

     Uses ``favicon_url``, ``base_storage_path`` and ``asset``, which are
     defined outside this function. The file is stored under the last path
     segment of ``favicon_url`` and that name is saved on the asset's
     ``favicon`` field. Returns None without storing anything when the request
     fails or the response is not OK.
     """
     try:
         favicon_response = proxied_get_request(favicon_url)
     except (requests.ConnectionError, requests.Timeout, AssertionError):
         # Couldn't get favicon. AssertionError stays in the tuple so that an
         # assertion raised inside proxied_get_request (if any) is still
         # treated as "no favicon", as before.
         return
     # Explicit check rather than `assert`: asserts are stripped under -O,
     # which would let an error response be stored as the favicon.
     if not favicon_response.ok:
         return
     favicon_file = favicon_url.rsplit('/', 1)[-1]
     default_storage.store_data_to_file(favicon_response.content,
                                        os.path.join(base_storage_path, favicon_file),
                                        overwrite=True)
     save_fields(asset, favicon=favicon_file)
     print("Saved favicon as %s" % favicon_file)
Ejemplo n.º 5
0
 def favicon_thread():
     """
     Fetch the favicon and store it alongside the capture.

     Uses ``favicon_url``, ``base_storage_path`` and ``asset``, which are
     defined outside this function. The file is stored under the last path
     segment of ``favicon_url`` and that name is saved on the asset's
     ``favicon`` field. Returns None without storing anything when the request
     fails or the response is not OK.
     """
     try:
         favicon_response = proxied_get_request(favicon_url)
     except (requests.ConnectionError, requests.Timeout,
             AssertionError):
         # Couldn't get favicon. AssertionError stays in the tuple so that an
         # assertion raised inside proxied_get_request (if any) is still
         # treated as "no favicon", as before.
         return
     # Explicit check rather than `assert`: asserts are stripped under -O,
     # which would let an error response be stored as the favicon.
     if not favicon_response.ok:
         return
     favicon_file = favicon_url.rsplit('/', 1)[-1]
     default_storage.store_data_to_file(favicon_response.content,
                                        os.path.join(
                                            base_storage_path,
                                            favicon_file),
                                        overwrite=True)
     save_fields(asset, favicon=favicon_file)
     print("Saved favicon as %s" % favicon_file)
Ejemplo n.º 6
0
def get_pdf(self, link_guid, target_url, base_storage_path, user_agent):
    """
    Download a PDF from the network, store it, and save its first page as a PNG.

    This function is executed via an asynchronous Celery call.

    :param link_guid: passed to ``get_asset_query`` to select the asset to update.
    :param target_url: URL the PDF is downloaded from.
    :param base_storage_path: storage directory that receives ``cap.pdf`` and ``cap.png``.
    :param user_agent: value sent as the ``User-Agent`` request header.
    :return: None. Outcomes are recorded through ``asset_query.update``:
        ``pdf_capture``/``image_capture`` hold the stored file names, or
        'failed' when the download is too large or the image cannot be created.
    """

    # basic setup
    asset_query = get_asset_query(link_guid)
    pdf_name = 'cap.pdf'
    pdf_path = os.path.join(base_storage_path, pdf_name)

    # Get the PDF from the network
    # NOTE(review): verify=False turns off TLS certificate verification --
    # presumably deliberate so sites with bad certificates can be captured; confirm.
    pdf_request = requests.get(target_url, stream=True, verify=False,
        headers={'User-Agent': user_agent})

    try:
        # The context manager closes (and deletes) the temp file on every exit
        # path, including the early return for oversized downloads.
        with tempfile.NamedTemporaryFile() as temp:
            # write PDF out to a temp file
            for chunk in pdf_request.iter_content(chunk_size=1024):

                if chunk: # filter out keep-alive new chunks
                    temp.write(chunk)

                # Limit our filesize
                if temp.tell() > settings.MAX_ARCHIVE_FILE_SIZE:
                    logger.info("PDF capture too big, %s" % target_url)
                    asset_query.update(pdf_capture='failed', image_capture='failed')
                    return

            # Flush once so the bytes are on disk before ImageMagick opens the
            # file by name below.
            temp.flush()

            # store temp file
            temp.seek(0)
            pdf_name = default_storage.store_file(temp, pdf_path)
            asset_query.update(pdf_capture=pdf_name)

            # Get first page of the PDF and create an image from it
            # Save it to disk as our image capture (likely a temporary measure)
            # The [0] in the filename gets passed through to ImageMagick and limits PDFs to the first page.
            try:
                with Image(filename=temp.name + "[0]") as img:
                    image_name = 'cap.png'
                    image_path = os.path.join(base_storage_path, image_name)
                    default_storage.store_data_to_file(img.make_blob('png'), image_path, overwrite=True)
                    asset_query.update(image_capture=image_name)
            except Exception as e:
                # The PDF itself is already stored; an image failure is only
                # logged and recorded instead of failing the whole task.
                logger.warning("Error creating PDF thumbnail of %s: %s" % (target_url, e))
                asset_query.update(image_capture='failed')
    finally:
        # A streamed response holds its connection until it is closed.
        pdf_request.close()
Ejemplo n.º 7
0
    def get_thumbnail(self, image_data=None):
        """
        Return an open file for this object's thumbnail, generating it if needed.

        If ``image_data`` is given it is used as the source image. Otherwise the
        source is replayed from the successful screenshot capture or, failing
        that, from the first capture whose content type starts with
        ``application/pdf``. The source is scaled with
        ``transform(resize='600')``, composited onto a 600x600 canvas and stored
        as ``thumbnail.png`` under ``settings.THUMBNAIL_STORAGE_PATH``.

        ``thumbnail_status`` is saved as 'generating' before a capture is
        replayed, 'generated' on success, and 'failed' when no source image is
        available or an exception is raised.

        :param image_data: optional raw image bytes, passed to ``Image(blob=...)``.
        :return: the result of ``default_storage.open`` for the thumbnail, or
            None when the status is already 'failed'/'generating' or when
            generation fails.
        """
        # A previous attempt failed, or one is in flight: do not start another.
        if self.thumbnail_status == 'failed' or self.thumbnail_status == 'generating':
            return None

        thumbnail_path = os.path.join(settings.THUMBNAIL_STORAGE_PATH,
                                      self.guid_as_path(), 'thumbnail.png')

        if self.thumbnail_status == 'generated' and default_storage.exists(
                thumbnail_path):
            return default_storage.open(thumbnail_path)

        # Tracked outside the try block so the finally clause can always close
        # (and thereby delete) the temporary copy of the replayed capture.
        temp_file = None
        try:

            warc_url = None
            image = None

            if image_data:
                image = Image(blob=image_data)
            else:

                if self.screenshot_capture and self.screenshot_capture.status == 'success':
                    warc_url = self.screenshot_capture.url
                else:
                    pdf_capture = self.captures.filter(
                        content_type__startswith='application/pdf').first()
                    if pdf_capture:
                        warc_url = pdf_capture.url

                if warc_url:
                    self.thumbnail_status = 'generating'
                    self.save(update_fields=['thumbnail_status'])

                    headers, data = self.replay_url(warc_url)
                    # The temp file gets the URL's trailing ".ext" as its suffix.
                    temp_file = tempfile.NamedTemporaryFile(
                        suffix='.' + warc_url.rsplit('.', 1)[-1])
                    for chunk in data:
                        temp_file.write(chunk)
                    temp_file.flush()
                    image = Image(
                        filename=temp_file.name +
                        "[0]")  # [0] limits ImageMagick to first page of PDF

            if image:
                with imagemagick_temp_dir():
                    with image as opened_img:
                        opened_img.transform(resize='600')
                        with Image(width=600, height=600) as dst_image:
                            dst_image.composite(opened_img, 0, 0)
                            dst_image.compression_quality = 60
                            default_storage.store_data_to_file(
                                dst_image.make_blob('png'),
                                thumbnail_path,
                                overwrite=True)

                self.thumbnail_status = 'generated'
                self.save(update_fields=['thumbnail_status'])

                return default_storage.open(thumbnail_path)

        except Exception as e:
            # Best effort: report the problem and fall through to mark failure.
            print("Thumbnail generation failed for %s: %s" % (self.guid, e))
        finally:
            if temp_file is not None:
                temp_file.close()

        # Reached when no source image was found or generation raised.
        self.thumbnail_status = 'failed'
        self.save(update_fields=['thumbnail_status'])
        return None
0
def save_screenshot(driver, image_path):
    """
    Capture a PNG screenshot from a selenium webdriver and store it.

    The PNG bytes from ``driver.get_screenshot_as_png()`` are written to
    ``image_path`` through ``default_storage.store_data_to_file`` with
    ``overwrite=True``; that call's return value is handed back unchanged.
    """
    return default_storage.store_data_to_file(
        driver.get_screenshot_as_png(),
        image_path,
        overwrite=True)