def get_thumbnail(self, image_data=None):
    """
    Return an open file for this object's thumbnail, generating it if needed.

    If ``image_data`` is given it is used as the source image blob. Otherwise
    the source is replayed from the screenshot capture (when its status is
    'success') or else from the first capture whose content type starts with
    'application/pdf'.

    Returns the result of ``default_storage.open()`` on the thumbnail path, or
    None when ``thumbnail_status`` is 'failed' or 'generating', when no source
    image could be found, or when generation raised (in which case the status
    is set to 'failed').
    """
    # Don't retry a known failure or race a generation that is in progress.
    if self.thumbnail_status in ('failed', 'generating'):
        return None

    thumbnail_path = os.path.join(settings.THUMBNAIL_STORAGE_PATH, self.guid_as_path(), 'thumbnail.png')

    if self.thumbnail_status == 'generated' and default_storage.exists(thumbnail_path):
        return default_storage.open(thumbnail_path)

    # Tracked outside the try so the finally clause can always release it.
    temp_file = None
    try:
        warc_url = None
        image = None

        if image_data:
            image = Image(blob=image_data)
        else:
            if self.screenshot_capture and self.screenshot_capture.status == 'success':
                warc_url = self.screenshot_capture.url
            else:
                pdf_capture = self.captures.filter(content_type__startswith='application/pdf').first()
                if pdf_capture:
                    warc_url = pdf_capture.url

            if warc_url:
                self.thumbnail_status = 'generating'
                self.save(update_fields=['thumbnail_status'])

                headers, data = self.replay_url(warc_url)
                # The temp file reuses the text after the URL's last '.' as its
                # suffix -- presumably so ImageMagick can infer the format.
                temp_file = tempfile.NamedTemporaryFile(suffix='.' + warc_url.rsplit('.', 1)[-1])
                for chunk in data:
                    temp_file.write(chunk)
                temp_file.flush()
                image = Image(filename=temp_file.name + "[0]")  # [0] limits ImageMagick to first page of PDF

        if image:
            with imagemagick_temp_dir():
                with image as opened_img:
                    opened_img.transform(resize='600')
                    # Paste the resized source onto a fixed 600x600 canvas.
                    with Image(width=600, height=600) as dst_image:
                        dst_image.composite(opened_img, 0, 0)
                        dst_image.compression_quality = 60
                        default_storage.store_data_to_file(dst_image.make_blob('png'), thumbnail_path, overwrite=True)

            self.thumbnail_status = 'generated'
            self.save(update_fields=['thumbnail_status'])

            return default_storage.open(thumbnail_path)

    except Exception as e:
        # Thumbnails are best-effort: record the failure instead of raising.
        print("Thumbnail generation failed for %s: %s" % (self.guid, e))
        self.thumbnail_status = 'failed'
        self.save(update_fields=['thumbnail_status'])

    finally:
        # Close (and thereby delete) the temp file deterministically rather
        # than waiting for garbage collection.
        if temp_file is not None:
            temp_file.close()

    return None
def get_pdf(self, link_guid, target_url, base_storage_path, user_agent):
    """
    Download a PDF from the network

    This function is executed via an asynchronous Celery call

    The PDF at ``target_url`` is streamed to a temp file and stored as
    'cap.pdf' under ``base_storage_path``; its first page is then rendered to
    'cap.png'. The Asset whose ``link_id`` is ``link_guid`` is updated with the
    stored names, or with Asset.CAPTURE_STATUS_FAILED when the download
    exceeds settings.MAX_ARCHIVE_FILE_SIZE or the image cannot be created.

    Returns None. ``self`` is not used here (presumably the bound Celery task).
    """
    # basic setup
    asset = Asset.objects.get(link_id=link_guid)
    pdf_name = 'cap.pdf'
    pdf_path = os.path.join(base_storage_path, pdf_name)

    # Get the PDF from the network
    # NOTE(review): verify=False disables TLS certificate checks; kept as in
    # the original -- confirm this is intentional for archiving.
    pdf_request = requests.get(target_url, stream=True, verify=False, headers={'User-Agent': user_agent})

    try:
        # write PDF out to a temp file; the context manager guarantees the
        # file is closed (and removed) on every exit path
        with tempfile.NamedTemporaryFile() as temp:
            for chunk in pdf_request.iter_content(chunk_size=1024):
                if not chunk:  # filter out keep-alive new chunks
                    continue
                temp.write(chunk)

                # Limit our filesize (tell() includes buffered bytes, so no
                # flush is needed per chunk)
                if temp.tell() > settings.MAX_ARCHIVE_FILE_SIZE:
                    logger.info("PDF capture too big, %s" % target_url)
                    save_fields(asset, pdf_capture=Asset.CAPTURE_STATUS_FAILED, image_capture=Asset.CAPTURE_STATUS_FAILED)
                    return

            # make sure everything is on disk before the file is read back,
            # both by the storage layer and by ImageMagick via temp.name
            temp.flush()

            # store temp file
            temp.seek(0)
            pdf_name = default_storage.store_file(temp, pdf_path, overwrite=True)
            save_fields(asset, pdf_capture=pdf_name)

            # Get first page of the PDF and create an image from it
            # Save it to disk as our image capture (likely a temporary measure)
            # The [0] in the filename gets passed through to ImageMagick and limits PDFs to the first page.
            try:
                with imagemagick_temp_dir():
                    with Image(filename=temp.name + "[0]") as img:
                        image_name = 'cap.png'
                        image_path = os.path.join(base_storage_path, image_name)
                        default_storage.store_data_to_file(img.make_blob('png'), image_path, overwrite=True)
                        save_fields(asset, image_capture=image_name)
            except Exception as e:
                # errors with the thumbnail aren't dealbreakers -- just log here
                print("Error creating PDF thumbnail of %s: %s" % (target_url, e))
                save_fields(asset, image_capture=Asset.CAPTURE_STATUS_FAILED)
    finally:
        # stream=True keeps the connection open until the response is closed
        pdf_request.close()
def get_pdf(self, link_guid, target_url, base_storage_path, user_agent):
    """
    Download a PDF from the network

    This function is executed via an asynchronous Celery call

    Streams ``target_url`` into a temp file, stores it as 'cap.pdf' under
    ``base_storage_path`` and renders the first page to 'cap.png'. Progress is
    recorded on the Asset looked up by ``link_id=link_guid``: the stored file
    names on success, Asset.CAPTURE_STATUS_FAILED when the download grows past
    settings.MAX_ARCHIVE_FILE_SIZE or when the page image cannot be produced.

    Returns None. ``self`` is unused in this body (presumably the Celery task).
    """
    # basic setup
    asset = Asset.objects.get(link_id=link_guid)
    pdf_name = 'cap.pdf'
    pdf_path = os.path.join(base_storage_path, pdf_name)

    # Get the PDF from the network
    # NOTE(review): certificate verification is off (verify=False), unchanged
    # from the original -- confirm that is deliberate.
    pdf_request = requests.get(target_url, stream=True, verify=False, headers={'User-Agent': user_agent})

    try:
        # The temp file is context-managed so it is closed and deleted even
        # when we bail out early or an exception escapes.
        with tempfile.NamedTemporaryFile() as temp:
            # write PDF out to the temp file
            for chunk in pdf_request.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    temp.write(chunk)

                    # Limit our filesize; tell() counts buffered data too, so
                    # flushing on every 1 KB chunk is unnecessary
                    if temp.tell() > settings.MAX_ARCHIVE_FILE_SIZE:
                        logger.info("PDF capture too big, %s" % target_url)
                        save_fields(asset, pdf_capture=Asset.CAPTURE_STATUS_FAILED, image_capture=Asset.CAPTURE_STATUS_FAILED)
                        return

            # One flush once the download is complete: ImageMagick reads the
            # file by name below, so the bytes must be on disk.
            temp.flush()

            # store temp file
            temp.seek(0)
            pdf_name = default_storage.store_file(temp, pdf_path, overwrite=True)
            save_fields(asset, pdf_capture=pdf_name)

            # Get first page of the PDF and create an image from it
            # Save it to disk as our image capture (likely a temporary measure)
            # The [0] in the filename gets passed through to ImageMagick and limits PDFs to the first page.
            try:
                with imagemagick_temp_dir():
                    with Image(filename=temp.name + "[0]") as img:
                        image_name = 'cap.png'
                        image_path = os.path.join(base_storage_path, image_name)
                        default_storage.store_data_to_file(img.make_blob('png'), image_path, overwrite=True)
                        save_fields(asset, image_capture=image_name)
            except Exception as e:
                # errors with the thumbnail aren't dealbreakers -- just log here
                print("Error creating PDF thumbnail of %s: %s" % (target_url, e))
                save_fields(asset, image_capture=Asset.CAPTURE_STATUS_FAILED)
    finally:
        # Release the streamed connection on every exit path.
        pdf_request.close()
def favicon_thread():
    """
    Fetch the favicon and store it next to the other captured files.

    Uses ``favicon_url``, ``base_storage_path`` and ``asset``, which are free
    variables defined outside this function (presumably locals of the
    enclosing task). The stored file name is the text after the last '/' of
    ``favicon_url`` and is recorded on ``asset`` via ``save_fields``.

    Returns None. Connection errors, timeouts and non-OK responses end the
    function silently without storing anything.
    """
    try:
        favicon_response = proxied_get_request(favicon_url)
    except (requests.ConnectionError, requests.Timeout, AssertionError):
        # AssertionError stays in this list so that an assertion raised inside
        # proxied_get_request is still treated as "couldn't get favicon".
        return

    # Explicit check instead of `assert`, which is stripped under `python -O`
    # and would let error responses be saved as the favicon.
    if not favicon_response.ok:
        return

    favicon_file = favicon_url.rsplit('/', 1)[-1]
    default_storage.store_data_to_file(favicon_response.content,
                                       os.path.join(base_storage_path, favicon_file),
                                       overwrite=True)
    save_fields(asset, favicon=favicon_file)
    print("Saved favicon as %s" % favicon_file)
def favicon_thread():
    """
    Download the favicon and save it with the rest of the capture.

    ``favicon_url``, ``base_storage_path`` and ``asset`` are not parameters;
    they are resolved from the surrounding scope (presumably the enclosing
    task function). The file is stored under the last path segment of
    ``favicon_url`` and that name is written to ``asset`` with ``save_fields``.

    Returns None. Nothing is stored when the request fails with a connection
    error or timeout, or when the response is not OK.
    """
    try:
        favicon_response = proxied_get_request(favicon_url)
    except (requests.ConnectionError, requests.Timeout, AssertionError):
        # Keep swallowing AssertionError here in case proxied_get_request
        # itself asserts; the original treated that as a failed fetch too.
        return

    # `assert response.ok` vanished under `python -O`; test the status
    # explicitly so error pages are never stored as the favicon.
    if not favicon_response.ok:
        return

    favicon_file = favicon_url.rsplit('/', 1)[-1]
    favicon_path = os.path.join(base_storage_path, favicon_file)
    default_storage.store_data_to_file(favicon_response.content, favicon_path, overwrite=True)
    save_fields(asset, favicon=favicon_file)
    print("Saved favicon as %s" % favicon_file)
def get_pdf(self, link_guid, target_url, base_storage_path, user_agent):
    """
    Download a PDF from the network

    This function is executed via an asynchronous Celery call

    Streams ``target_url`` into a temp file, stores it as 'cap.pdf' under
    ``base_storage_path`` and renders the first page to 'cap.png'. Results are
    written through the query returned by ``get_asset_query(link_guid)``: the
    stored file names on success, or 'failed' for both captures when the
    download grows past settings.MAX_ARCHIVE_FILE_SIZE.

    Returns None. Errors while rendering the page image are not caught and
    propagate to the caller. ``self`` is unused in this body (presumably the
    Celery task).
    """
    # basic setup
    asset_query = get_asset_query(link_guid)
    pdf_name = 'cap.pdf'
    pdf_path = os.path.join(base_storage_path, pdf_name)

    # Get the PDF from the network
    # NOTE(review): verify=False turns off TLS certificate checks; unchanged
    # from the original -- confirm that is deliberate.
    pdf_request = requests.get(target_url, stream=True, verify=False, headers={'User-Agent': user_agent})

    try:
        # Context-managed so the temp file is closed and removed on every
        # exit path, including the early return and any exception.
        with tempfile.NamedTemporaryFile() as temp:
            # write PDF out to the temp file
            for chunk in pdf_request.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    temp.write(chunk)

                    # Limit our filesize; tell() already accounts for
                    # buffered bytes, so no per-chunk flush is required
                    if temp.tell() > settings.MAX_ARCHIVE_FILE_SIZE:
                        logger.info("PDF capture too big, %s" % target_url)
                        asset_query.update(pdf_capture='failed', image_capture='failed')
                        return

            # ImageMagick opens the file by name below, so push everything
            # to disk once the download has finished.
            temp.flush()

            # store temp file
            temp.seek(0)
            pdf_name = default_storage.store_file(temp, pdf_path)
            asset_query.update(pdf_capture=pdf_name)

            # Get first page of the PDF and create an image from it
            # Save it to disk as our image capture (likely a temporary measure)
            # The [0] suffix is passed through to ImageMagick and restricts
            # reading to the first page instead of rasterizing the whole PDF.
            with Image(filename=temp.name + "[0]") as img:
                image_name = 'cap.png'
                image_path = os.path.join(base_storage_path, image_name)
                default_storage.store_data_to_file(img.make_blob('png'), image_path, overwrite=True)
                asset_query.update(image_capture=image_name)
    finally:
        # stream=True holds the connection until the response is closed
        pdf_request.close()
def get_thumbnail(self, image_data=None):
    """
    Fetch this object's thumbnail as an open file, creating it on demand.

    The source image is ``image_data`` when supplied. Otherwise it is replayed
    from the screenshot capture if that has status 'success', and failing that
    from the first capture whose content type starts with 'application/pdf'.
    The source is resized, composited onto a 600x600 canvas and stored as PNG
    at '<THUMBNAIL_STORAGE_PATH>/<guid path>/thumbnail.png'.

    Returns what ``default_storage.open()`` gives for that path, or None if
    ``thumbnail_status`` is 'failed' or 'generating', if there is nothing to
    build a thumbnail from, or if generation raised (the status then becomes
    'failed').
    """
    # A previous failure is not retried, and a running generation is not raced.
    if self.thumbnail_status == 'failed' or self.thumbnail_status == 'generating':
        return None

    thumbnail_path = os.path.join(settings.THUMBNAIL_STORAGE_PATH, self.guid_as_path(), 'thumbnail.png')

    if self.thumbnail_status == 'generated' and default_storage.exists(thumbnail_path):
        return default_storage.open(thumbnail_path)

    # Declared before the try so the finally clause can see it on every path.
    temp_file = None
    try:
        warc_url = None
        image = None

        if image_data:
            image = Image(blob=image_data)
        else:
            if self.screenshot_capture and self.screenshot_capture.status == 'success':
                warc_url = self.screenshot_capture.url
            else:
                pdf_capture = self.captures.filter(
                    content_type__startswith='application/pdf').first()
                if pdf_capture:
                    warc_url = pdf_capture.url

            if warc_url:
                self.thumbnail_status = 'generating'
                self.save(update_fields=['thumbnail_status'])

                headers, data = self.replay_url(warc_url)
                # Suffix is whatever follows the last '.' in the URL --
                # presumably to let ImageMagick recognise the file type.
                temp_file = tempfile.NamedTemporaryFile(
                    suffix='.' + warc_url.rsplit('.', 1)[-1])
                for chunk in data:
                    temp_file.write(chunk)
                temp_file.flush()
                # [0] limits ImageMagick to first page of PDF
                image = Image(filename=temp_file.name + "[0]")

        if image:
            with imagemagick_temp_dir():
                with image as opened_img:
                    opened_img.transform(resize='600')
                    # Place the resized source on a fixed-size canvas.
                    with Image(width=600, height=600) as dst_image:
                        dst_image.composite(opened_img, 0, 0)
                        dst_image.compression_quality = 60
                        default_storage.store_data_to_file(
                            dst_image.make_blob('png'), thumbnail_path, overwrite=True)

            self.thumbnail_status = 'generated'
            self.save(update_fields=['thumbnail_status'])

            return default_storage.open(thumbnail_path)

    except Exception as e:
        # Best-effort feature: remember the failure and carry on.
        print("Thumbnail generation failed for %s: %s" % (self.guid, e))
        self.thumbnail_status = 'failed'
        self.save(update_fields=['thumbnail_status'])

    finally:
        # Previously the temp file was left for the garbage collector; close
        # it here so the descriptor and the file on disk are released now.
        if temp_file is not None:
            temp_file.close()

    return None
def save_screenshot(driver, image_path):
    """
    Grab a PNG screenshot from a selenium webdriver and write it to
    ``image_path`` through Django's default_storage, replacing any existing
    file. Returns whatever ``default_storage.store_data_to_file`` returns.
    """
    screenshot_bytes = driver.get_screenshot_as_png()
    stored_name = default_storage.store_data_to_file(
        screenshot_bytes,
        image_path,
        overwrite=True,
    )
    return stored_name