def close_warc_after_writing(self, out):
    out.flush()
    out.seek(0)
    default_storage.store_file(out, self.warc_storage_file(), overwrite=True)
    out.close()
def compress_link_assets(*args, **kwargs):
    """
    This task creates a zipfile containing the assets of a given Perma link.
    The zip file does *not* contain mutable status data about the link
    (e.g. whether it's vested or not), only immutable asset metadata.

    This is a Celery task so that it can be run after the tasks that generate
    the assets are finished, which we arrange for by means of a chord. Thus,
    the first positional arguments of this function will be the return values
    of those tasks. We therefore don't rely on positional arguments and
    retrieve all of our arguments via kwargs.
    """
    # fetch link and asset
    try:
        guid = kwargs['guid']
    except KeyError:
        raise TypeError("compress_link_assets() requires a guid keyword argument")
    target_link = get_object_or_404(Link, guid=guid)
    target_asset = get_object_or_404(Asset, link=target_link)

    # build metadata
    metadata = {
        "guid": guid,
        "submitted_url": target_link.submitted_url,
        "creation_timestamp": serialize_datetime(target_link.creation_timestamp),
        "submitted_title": target_link.submitted_title,
    }

    # Here we open a temporary file to hold our zip data.
    # Because of @run_in_tempdir we have already chdir'd to a temp dir.
    # We then use default_storage to open each file under target_asset.base_storage_path
    # (which may come from the local disk or a remote location like S3)
    # and write its contents straight into the open zip file with writestr(),
    # so nothing needs to be staged on the local disk first.
    temp_file = tempfile.TemporaryFile()
    base_storage_path_without_guid = os.path.dirname(target_asset.base_storage_path)
    with zipfile.ZipFile(temp_file, "w") as zipfh:
        for root, dirs, files in default_storage.walk(target_asset.base_storage_path):
            for file in files:
                source_file_path = os.path.join(root, file)  # e.g. 2014/6/10/18/37/1234-ABCD/cap.png
                dest_file_path = source_file_path.replace(base_storage_path_without_guid + "/", '', 1)  # e.g. 1234-ABCD/cap.png
                with default_storage.open(source_file_path, 'rb') as source_file:
                    zipfh.writestr(dest_file_path, source_file.read())

        # write metadata to 1234-ABCD/metadata.json
        zipfh.writestr(os.path.join(guid, "metadata.json"), json.dumps(metadata))

    # now our zip file has been written, we can store it to default_storage
    temp_file.seek(0)
    zipfile_storage_path = os.path.join(settings.MEDIA_ARCHIVES_ROOT, target_asset.base_storage_path + ".zip")
    default_storage.store_file(temp_file, zipfile_storage_path, overwrite=True)
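# The docstring above says compress_link_assets runs as the callback of a Celery chord.
# A minimal sketch of what that wiring might look like follows; queue_asset_compression
# and the particular capture tasks in the header are illustrative assumptions, not part
# of this codebase -- the only requirement shown above is the guid kwarg.
from celery import chord

def queue_asset_compression(link, base_storage_path, user_agent=''):
    # header: capture tasks whose results the chord waits for (hypothetical selection)
    capture_tasks = [
        get_pdf.s(link.guid, link.submitted_url, base_storage_path, user_agent),
    ]
    # callback: receives the header results positionally, but only reads its kwargs
    chord(capture_tasks)(compress_link_assets.s(guid=link.guid))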
@contextmanager
def preserve_perma_warc(guid, timestamp, destination):
    """
    Context manager for opening a perma warc, ready to receive warc records.
    Safely closes and saves the file to storage when context is exited.
    """
    out = tempfile.TemporaryFile()
    write_perma_warc_header(out, guid, timestamp)
    try:
        yield out
    finally:
        out.flush()
        out.seek(0)
        default_storage.store_file(out, destination, overwrite=True)
        out.close()
@contextmanager
def preserve_perma_warc(guid, timestamp, destination):
    """
    Context manager for opening a perma warc, ready to receive warc records.
    Safely closes and saves the file to storage when context is exited.
    """
    # mode set to 'ab+' as a workaround for https://bugs.python.org/issue25341
    out = tempfile.TemporaryFile('ab+')
    write_perma_warc_header(out, guid, timestamp)
    try:
        yield out
    finally:
        out.flush()
        out.seek(0)
        default_storage.store_file(out, destination, overwrite=True)
        out.close()
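# A minimal usage sketch of the preserve_perma_warc context manager defined above.
# example_capture is hypothetical, and the records are assumed to be pre-serialized
# WARC record bytes; how they are produced is outside this section.
def example_capture(guid, timestamp, destination, records):
    with preserve_perma_warc(guid, timestamp, destination) as warc_file:
        for record in records:
            warc_file.write(record)
    # on exit, the finally block flushes the temp file, rewinds it, and stores it
    # to `destination` via default_storage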
def get_pdf(link_guid, target_url, base_storage_path, user_agent):
    """
    Download a PDF from the network

    This function is executed via an asynchronous Celery call
    """
    # basic setup
    asset_query = get_asset_query(link_guid)
    pdf_name = 'cap.pdf'
    pdf_path = os.path.join(base_storage_path, pdf_name)

    # Get the PDF from the network
    pdf_request = requests.get(target_url, stream=True, verify=False,
                               headers={'User-Agent': user_agent})

    # write PDF out to a temp file
    temp = tempfile.TemporaryFile()
    for chunk in pdf_request.iter_content(chunk_size=1024):
        if chunk:  # filter out keep-alive new chunks
            temp.write(chunk)
            temp.flush()

        # Limit our filesize
        if temp.tell() > settings.MAX_ARCHIVE_FILE_SIZE:
            logger.info("PDF capture too big, %s" % target_url)
            asset_query.update(pdf_capture='failed')
            return

    # store temp file
    temp.seek(0)
    pdf_name = default_storage.store_file(temp, pdf_path)
    asset_query.update(pdf_capture=pdf_name)
def obj_create(self, bundle, **kwargs):
    # We've received a request to archive a URL. That process is managed here.
    # We create a new entry in our datastore and pass the work off to our indexing
    # workers. They do their thing, updating the model as they go. When we get some minimum
    # set of results we can present the user (a guid for the link), we respond back.
    if settings.READ_ONLY_MODE:
        raise ImmediateHttpResponse(response=self.error_response(bundle.request, {
            'archives': {'__all__': "Perma has paused archive creation for scheduled maintenance. Please try again shortly."},
            'reason': "Perma has paused archive creation for scheduled maintenance. Please try again shortly.",
        }))

    # Runs validation (exception thrown if invalid), sets properties and saves the object
    bundle = super(LinkResource, self).obj_create(bundle, created_by=bundle.request.user)
    asset = Asset(link=bundle.obj)

    uploaded_file = bundle.data.get('file')
    if uploaded_file:
        # normalize file name to upload.jpg, upload.png, upload.gif, or upload.pdf
        mime_type = get_mime_type(uploaded_file.name)
        file_name = 'upload.%s' % mime_type_lookup[mime_type]['new_extension']
        file_path = os.path.join(asset.base_storage_path, file_name)

        uploaded_file.file.seek(0)
        file_name = default_storage.store_file(uploaded_file, file_path)

        if mime_type == 'application/pdf':
            asset.pdf_capture = file_name
        else:
            asset.image_capture = file_name
        asset.user_upload = True
        asset.user_upload_file_name = uploaded_file.name
        asset.save()
    else:
        asset.image_capture = Asset.CAPTURE_STATUS_PENDING
        # If it appears as if we're trying to archive a PDF, only run our PDF retrieval tool
        if asset.link.media_type == 'pdf':
            asset.pdf_capture = Asset.CAPTURE_STATUS_PENDING
            task = get_pdf
        else:  # else, it's not a PDF. Let's try our best to retrieve what we can
            asset.warc_capture = Asset.CAPTURE_STATUS_PENDING
            task = proxy_capture
        asset.save()

        run_task(task.s(asset.link.guid,
                        asset.link.submitted_url,
                        asset.base_storage_path,
                        bundle.request.META.get('HTTP_USER_AGENT', '')))

    return bundle
def get_pdf(self, link_guid, target_url, base_storage_path, user_agent):
    """
    Download a PDF from the network

    This function is executed via an asynchronous Celery call
    """
    # basic setup
    asset = Asset.objects.get(link_id=link_guid)
    pdf_name = 'cap.pdf'
    pdf_path = os.path.join(base_storage_path, pdf_name)

    # Get the PDF from the network
    pdf_request = requests.get(target_url, stream=True, verify=False,
                               headers={'User-Agent': user_agent})

    # write PDF out to a temp file
    temp = tempfile.NamedTemporaryFile()
    for chunk in pdf_request.iter_content(chunk_size=1024):
        if chunk:  # filter out keep-alive new chunks
            temp.write(chunk)
            temp.flush()

        # Limit our filesize
        if temp.tell() > settings.MAX_ARCHIVE_FILE_SIZE:
            logger.info("PDF capture too big, %s" % target_url)
            save_fields(asset, pdf_capture=Asset.CAPTURE_STATUS_FAILED, image_capture=Asset.CAPTURE_STATUS_FAILED)
            return

    # store temp file
    temp.seek(0)
    pdf_name = default_storage.store_file(temp, pdf_path, overwrite=True)
    save_fields(asset, pdf_capture=pdf_name)

    # Get first page of the PDF and create an image from it.
    # Save it to disk as our image capture (likely a temporary measure).
    # The [0] in the filename gets passed through to ImageMagick and limits PDFs to the first page.
    try:
        with imagemagick_temp_dir():
            with Image(filename=temp.name + "[0]") as img:
                image_name = 'cap.png'
                image_path = os.path.join(base_storage_path, image_name)
                default_storage.store_data_to_file(img.make_blob('png'), image_path, overwrite=True)
                save_fields(asset, image_capture=image_name)
    except Exception as e:
        # errors with the thumbnail aren't dealbreakers -- just log here
        print "Error creating PDF thumbnail of %s: %s" % (target_url, e)
        save_fields(asset, image_capture=Asset.CAPTURE_STATUS_FAILED)
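# Standalone sketch of the "[0]" page-selection trick used above: ImageMagick treats
# "file.pdf[0]" as "only the first page of file.pdf", so Wand renders a single page
# instead of the whole document. Paths are illustrative, and this assumes ImageMagick
# has Ghostscript available for PDF input.
from wand.image import Image

def pdf_first_page_thumbnail(pdf_path, png_path):
    with Image(filename=pdf_path + "[0]") as img:  # open page 1 only
        img.format = 'png'
        img.save(filename=png_path)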
def get_pdf(self, link_guid, target_url, base_storage_path, user_agent):
    """
    Download a PDF from the network

    This function is executed via an asynchronous Celery call
    """
    # basic setup
    asset_query = get_asset_query(link_guid)
    pdf_name = 'cap.pdf'
    pdf_path = os.path.join(base_storage_path, pdf_name)

    # Get the PDF from the network
    pdf_request = requests.get(target_url, stream=True, verify=False,
                               headers={'User-Agent': user_agent})

    # write PDF out to a temp file
    temp = tempfile.NamedTemporaryFile()
    for chunk in pdf_request.iter_content(chunk_size=1024):
        if chunk:  # filter out keep-alive new chunks
            temp.write(chunk)
            temp.flush()

        # Limit our filesize
        if temp.tell() > settings.MAX_ARCHIVE_FILE_SIZE:
            logger.info("PDF capture too big, %s" % target_url)
            asset_query.update(pdf_capture='failed', image_capture='failed')
            return

    # store temp file
    temp.seek(0)
    pdf_name = default_storage.store_file(temp, pdf_path)
    asset_query.update(pdf_capture=pdf_name)

    # Get first page of the PDF and create an image from it.
    # Save it to disk as our image capture (likely a temporary measure)
    with Image(filename=temp.name) as img:
        image_name = 'cap.png'
        image_path = os.path.join(base_storage_path, image_name)
        default_storage.store_data_to_file(img.make_blob('png'), image_path, overwrite=True)
        asset_query.update(image_capture=image_name)
def upload_file(request):
    if request.method == 'POST':
        form = UploadFileForm(request.POST, request.FILES)
        if form.is_valid():
            mime = MimeTypes()
            uploaded_file = request.FILES['file']
            mime_type = mime.guess_type(uploaded_file.name)

            # Get mime type string from tuple
            if mime_type[0]:
                mime_type = mime_type[0]
            else:
                return HttpResponseBadRequest(json.dumps({'status': 'failed', 'reason': 'Invalid file.'}), 'application/json')

            if validate_upload_file(uploaded_file, mime_type) and uploaded_file.size <= settings.MAX_ARCHIVE_FILE_SIZE:
                link = Link(submitted_url=form.cleaned_data['url'],
                            submitted_title=form.cleaned_data['title'],
                            created_by=request.user)
                link.save()

                asset = Asset(link=link)
                file_name = 'cap' + mime.guess_extension(mime_type)
                file_path = os.path.join(asset.base_storage_path, file_name)

                uploaded_file.file.seek(0)
                file_name = default_storage.store_file(uploaded_file, file_path)

                if mime_type == 'application/pdf':
                    asset.pdf_capture = file_name
                else:
                    asset.image_capture = file_name
                asset.save()

                response_object = {'status': 'success', 'linky_id': link.guid, 'linky_hash': link.guid}

                return HttpResponse(json.dumps(response_object), 'application/json', 201)  # '201 Created' status
            else:
                return HttpResponseBadRequest(json.dumps({'status': 'failed', 'reason': 'Invalid file.'}), 'application/json')
        else:
            return HttpResponseBadRequest(json.dumps({'status': 'failed', 'reason': 'Missing file.'}), 'application/json')

    return HttpResponseBadRequest(json.dumps({'status': 'failed', 'reason': 'No file submitted.'}), 'application/json')
def proxy_capture(self, link_guid, target_url, base_storage_path, user_agent=''):
    """
    start warcprox process. Warcprox is a MITM proxy server and needs to be running
    before, during and after phantomjs gets a screenshot.

    Create an image from the supplied URL, write it to disk and update our asset model with the path.
    The heavy lifting is done by PhantomJS, our headless browser.

    This whole function runs with the local dir set to a temp dir by run_in_tempdir().
    So we can use local paths for temp files, and they'll just disappear when the function exits.

    TODO: This function is probably inefficient in saving to the database after each change to asset/link.
    """
    # basic setup
    asset = Asset.objects.get(link_id=link_guid)
    link = asset.link
    image_name = 'cap.png'
    warc_name = 'archive.warc.gz'
    image_path = os.path.join(base_storage_path, image_name)
    warc_path = os.path.join(base_storage_path, warc_name)

    print "%s: Fetching %s" % (link_guid, target_url)

    # suppress verbose warcprox logs
    logging.disable(logging.INFO)

    # Set up an exception we can trigger to halt capture and release all the resources involved.
    class HaltCaptureException(Exception):
        pass

    meta_thread = browser = robots_txt_thread = warcprox_controller = warcprox_thread = favicon_thread = None
    have_warc = False

    try:
        # create a request handler class that counts unique requests and responses
        unique_requests = set()
        unique_responses = set()
        count_lock = threading.Lock()

        class CountingRequestHandler(warcprox.WarcProxyHandler):
            def _proxy_request(self):
                with count_lock:
                    unique_requests.add(self.url)
                warcprox.WarcProxyHandler._proxy_request(self)
                with count_lock:
                    unique_responses.add(self.url)

        # connect warcprox to an open port
        warcprox_port = 27500
        recorded_url_queue = warcprox.queue.Queue()
        fake_cert_authority = warcprox.CertificateAuthority()
        for i in xrange(500):
            try:
                proxy = warcprox.WarcProxy(
                    server_address=("127.0.0.1", warcprox_port),
                    ca=fake_cert_authority,
                    recorded_url_q=recorded_url_queue,
                    req_handler_class=CountingRequestHandler
                )
                break
            except socket_error as e:
                if e.errno != errno.EADDRINUSE:
                    raise
                warcprox_port += 1
        else:
            raise self.retry(exc=Exception("WarcProx couldn't find an open port."))
        proxy_address = "127.0.0.1:%s" % warcprox_port

        # set up requests getter for one-off requests outside of selenium
        parsed_target_url = urlparse.urlparse(target_url)
        target_url_base = parsed_target_url.scheme + '://' + parsed_target_url.netloc + '/'

        def proxied_get_request(url):
            return requests.get(url,
                                headers={'User-Agent': user_agent},
                                proxies={parsed_target_url.scheme: 'http://' + proxy_address},
                                cert=fake_cert_authority.ca_file)

        # start warcprox in the background
        warc_writer = warcprox.WarcWriterThread(recorded_url_q=recorded_url_queue, gzip=True, port=warcprox_port)
        warcprox_controller = warcprox.WarcproxController(proxy, warc_writer)
        warcprox_thread = threading.Thread(target=warcprox_controller.run_until_shutdown, name="warcprox", args=())
        warcprox_thread.start()

        # print "WarcProx opened."

        # fetch robots.txt in the background
        def robots_txt_thread():
            # print "Fetching robots.txt ..."
            robots_txt_location = target_url_base + 'robots.txt'
            try:
                robots_txt_response = proxied_get_request(robots_txt_location)
                assert robots_txt_response.ok
            except (requests.ConnectionError, requests.Timeout, AssertionError):
                # print "Couldn't reach robots.txt"
                return

            # We only want to respect robots.txt if Perma is specifically asked not to archive (we're not a crawler)
            if 'Perma' in robots_txt_response.content:
                # We found Perma specifically mentioned
                rp = robotparser.RobotFileParser()
                rp.parse([line.strip() for line in robots_txt_response.content.split('\n')])
                if not rp.can_fetch('Perma', target_url):
                    save_fields(link, dark_archived_robots_txt_blocked=True)

            # print "Robots.txt fetched."

        robots_txt_thread = threading.Thread(target=robots_txt_thread, name="robots")
        robots_txt_thread.start()

        # fetch page in the background
        # print "Fetching url."
        browser = get_browser(user_agent, proxy_address, fake_cert_authority.ca_file)
        browser.set_window_size(1024, 800)

        page_load_thread = threading.Thread(target=browser.get, args=(target_url,))  # returns after onload
        page_load_thread.start()
        page_load_thread.join(PAGE_LOAD_TIMEOUT)
        if page_load_thread.is_alive():
            # print "Waited 60 seconds for onLoad event -- giving up."
            if not unique_responses:
                # if nothing at all has loaded yet, give up on the capture
                save_fields(asset, warc_capture='failed', image_capture='failed')
                raise HaltCaptureException
        # print "Finished fetching url."

        # get favicon
        favicons = browser.find_elements_by_xpath('//link[@rel="icon" or @rel="shortcut icon"]')
        favicons = [i for i in favicons if i.get_attribute('href')]
        if favicons:
            favicon_url = urlparse.urljoin(browser.current_url, favicons[0].get_attribute('href'))
            favicon_extension = favicon_url.rsplit('.', 1)[-1]
            if not favicon_extension in ['ico', 'gif', 'jpg', 'jpeg', 'png']:
                favicon_url = None
        else:
            favicon_url = urlparse.urljoin(browser.current_url, '/favicon.ico')
        if favicon_url:
            # try to fetch favicon in background
            def favicon_thread():
                # print "Fetching favicon from %s ..." % favicon_url
                try:
                    favicon_response = proxied_get_request(favicon_url)
                    assert favicon_response.ok
                except (requests.ConnectionError, requests.Timeout, AssertionError):
                    # print "Couldn't get favicon"
                    return
                favicon_file = favicon_url.rsplit('/', 1)[-1]
                default_storage.store_data_to_file(favicon_response.content,
                                                   os.path.join(base_storage_path, favicon_file),
                                                   overwrite=True)
                save_fields(asset, favicon=favicon_file)
                print "Saved favicon as %s" % favicon_file

            favicon_thread = threading.Thread(target=favicon_thread, name="favicon")
            favicon_thread.start()

        # get page title
        # print "Getting title."
        if browser.title:
            save_fields(link, submitted_title=browser.title)

        # check meta tags
        # (run this in a thread and give it long enough to find the tags, but then let other stuff proceed)
        # print "Checking meta tags."
        def meta_thread():
            # get all meta tags
            meta_tags = browser.find_elements_by_tag_name('meta')
            # first look for <meta name='perma'>
            meta_tag = next((tag for tag in meta_tags if tag.get_attribute('name').lower() == 'perma'), None)
            # else look for <meta name='robots'>
            if not meta_tag:
                meta_tag = next((tag for tag in meta_tags if tag.get_attribute('name').lower() == 'robots'), None)
            # if we found a relevant meta tag, check for noarchive
            if meta_tag and 'noarchive' in meta_tag.get_attribute("content").lower():
                save_fields(link, dark_archived_robots_txt_blocked=True)
                # print "Meta found, darchiving"

        meta_thread = threading.Thread(target=meta_thread)
        meta_thread.start()
        meta_thread.join(ELEMENT_DISCOVERY_TIMEOUT * 2)

        # scroll to bottom of page and back up, in case that prompts anything else to load
        try:
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            browser.execute_script("window.scrollTo(0, 0);")
        except WebDriverException:
            pass

        # get page size to decide whether to take a screenshot
        capture_screenshot = False
        try:
            root_element = browser.find_element_by_tag_name('body')
        except NoSuchElementException:
            try:
                root_element = browser.find_element_by_tag_name('frameset')
            except NoSuchElementException:
                root_element = None
        if root_element:
            page_size = root_element.size
            pixel_count = page_size['width'] * page_size['height']
            capture_screenshot = pixel_count < settings.MAX_IMAGE_SIZE
        if not capture_screenshot:
            # print "Not saving screenshots! Page size is %s pixels." % pixel_count
            save_fields(asset, image_capture='failed')

        # save preliminary screenshot immediately, and an updated version later
        # (we want to return results quickly, but also give javascript time to render final results)
        if capture_screenshot:
            # print "Saving first screenshot."
            save_screenshot(browser, image_path)
            save_fields(asset, image_capture=image_name)

        # make sure all requests are finished
        # print "Waiting for post-load requests."
        start_time = time.time()
        time.sleep(min(AFTER_LOAD_TIMEOUT, 5))
        while len(unique_responses) < len(unique_requests):
            # print "%s/%s finished" % (len(unique_responses), len(unique_requests))
            if time.time() - start_time > AFTER_LOAD_TIMEOUT:
                # print "Waited %s seconds to finish post-load requests -- giving up." % AFTER_LOAD_TIMEOUT
                break
            time.sleep(.5)

        # take second screenshot after all requests done
        if capture_screenshot:
            # print "Taking second screenshot."
            save_screenshot(browser, image_path)

        have_warc = True

    except HaltCaptureException:
        pass

    finally:
        # teardown (have to do this before save to make sure WARC is done writing):
        # print "Shutting down browser and proxies."

        if browser:
            browser.quit()  # shut down phantomjs
            # This can be removed when this bugfix ships in selenium:
            # https://code.google.com/p/selenium/issues/detail?id=8498
            browser.service.process.stdin.close()
        if meta_thread:
            meta_thread.join()  # wait until meta thread is done
        if robots_txt_thread:
            robots_txt_thread.join()  # wait until robots thread is done
        if favicon_thread:
            favicon_thread.join()  # wait until favicon thread is done
        if warcprox_controller:
            warcprox_controller.stop.set()  # send signal to shut down warc thread
        if warcprox_thread:
            warcprox_thread.join()  # wait until warcprox thread is done writing out warc

        # un-suppress logging
        logging.disable(logging.NOTSET)

    # save generated warc file
    if have_warc:
        # print "Saving WARC."
        try:
            temp_warc_path = os.path.join(warc_writer.directory, warc_writer._f_finalname)
            with open(temp_warc_path, 'rb') as warc_file:
                warc_name = default_storage.store_file(warc_file, warc_path)
            save_fields(asset, warc_capture=warc_name)

            # print "Writing CDX lines to the DB"
            CDXLine.objects.create_all_from_asset(asset)

        except Exception as e:
            logger.info("Web Archive File creation failed for %s: %s" % (target_url, e))
            save_fields(asset, warc_capture='failed')
def safe_delete_warc(self):
    old_name = self.warc_storage_file()
    if default_storage.exists(old_name):
        new_name = old_name.replace('.warc.gz', '_replaced.warc.gz')
        default_storage.store_file(default_storage.open(old_name), new_name)
        default_storage.delete(old_name)
def update_perma(link_guid):
    """
    Update the vested/darchived status of a perma link, and download the assets if necessary
    """
    # N.B. This function has two instances of downloading stuff from
    # the root server using a scheme that looks something like
    #     settings.SERVER + reverse("url_pattern")
    # This is nice because it means we don't have to repeat our URL
    # patterns from urls.py, but it hardcodes the fact that the root
    # server is another Perma instance. It's unclear to me which is a
    # better fact to abstract, but this is easier for now.

    ## First, let's get the metadata for this link. The metadata
    ## contains information about where we should place the assets (if
    ## we decide that we need them). This is also a fast check to make
    ## sure the link GUID is actually real.
    metadata_server = settings.UPSTREAM_SERVER['address']
    metadata_url = metadata_server + reverse("service_link_status", args=(link_guid,))
    metadata = requests.get(
        metadata_url,
        headers=settings.UPSTREAM_SERVER.get('headers', {})
    ).json()

    ## Next, let's see if we need to get the assets. If we have the
    ## Link object for this GUID, we're going to assume we already
    ## have what we need. It would make a little more sense to use the
    ## Asset object here instead, but we're definitely going to need
    ## to do stuff to the Link object so we might as well get that
    ## instead. In practice they should be ~one to one.
    try:
        link = Link.objects.get(guid=link_guid)
    except Link.DoesNotExist:
        ## We need to download the assets. We can download an archive
        ## from the assets server.
        assets_server = settings.UPSTREAM_SERVER['address']
        assets_url = assets_server + reverse("mirroring:link_assets", args=(link_guid,))

        # Temp paths can be relative because we're in run_in_tempdir()
        temp_zip_path = 'temp.zip'

        # Save remote zip file to disk, using streaming to avoid keeping large files in RAM.
        request = requests.get(
            assets_url,
            headers=settings.UPSTREAM_SERVER.get('headers', {}),
            stream=True)
        with open(temp_zip_path, 'wb') as f:
            for chunk in request.iter_content(1024):
                f.write(chunk)

        ## Extract the archive and change into the extracted folder.
        with zipfile.ZipFile(temp_zip_path, "r") as zipfh:
            #assets_path = os.path.dirname(os.path.join(settings.MEDIA_ROOT, metadata["path"]))
            zipfh.extractall()  # creates folder named [guid] in current temp dir

        temp_extracted_path = os.path.basename(metadata['path'])  # e.g. "1234-ABCD"

        # Save all extracted files to default_storage, using the path in metadata.
        for root, dirs, files in os.walk(temp_extracted_path):
            for file in files:
                source_file_path = os.path.join(root, file)  # e.g. "1234-ABCD/cap.png"
                dest_file_path = os.path.join(os.path.dirname(metadata['path']), source_file_path)  # e.g. 2014/6/10/18/37/1234-ABCD/cap.png
                with open(source_file_path, 'rb') as source_file:
                    default_storage.store_file(source_file, dest_file_path)

        ## We can now get some additional metadata that we'll need to
        ## create the Link object.
        with open(os.path.join(temp_extracted_path, "metadata.json"), "r") as fh:
            link_metadata = json.load(fh)

        ## We now have everything we need to initialize the Link object.
        link = Link(guid=link_guid)
        link.submitted_url = link_metadata["submitted_url"]
        link.submitted_title = link_metadata["submitted_title"]
        link.created_by = None  # XXX maybe we should do something with FakeUser here
        link.save(pregenerated_guid=True)  # We need to save this so that we can create an Asset object

        # This is a stupid hack to overcome the fact that the Link has
        # auto_now_add=True, so it's always going to be saved to the
        # current time on first creation.
        link.creation_timestamp = unserialize_datetime(link_metadata["creation_timestamp"])
        link.save()

        ## Lastly, let's create an Asset object for this Link.
        asset = Asset(link=link)
        asset.base_storage_path = metadata["path"]
        asset.image_capture = metadata["image_capture"]
        asset.warc_capture = metadata["source_capture"]
        asset.pdf_capture = metadata["pdf_capture"]
        asset.text_capture = metadata["text_capture"]
        asset.save()

    ## We can now add some of the data we got from the metadata to the Link object
    link.dark_archived = metadata["dark_archived"]
    link.vested = metadata["vested"]
    link.save()

    # If we have sub-mirrors, poke them to get a copy from us.
    if settings.DOWNSTREAM_SERVERS:
        run_task(poke_mirrors, link_guid=link_guid)
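# serialize_datetime / unserialize_datetime are used by compress_link_assets and
# update_perma but are not defined in this section. A minimal sketch, assuming an
# ISO 8601 wire format (the real helpers may differ), just to make the metadata
# round-trip concrete:
from dateutil import parser as date_parser

def serialize_datetime(dt):
    # assumed: datetime -> string for metadata.json, e.g. "2014-06-10T18:37:00+00:00"
    return dt.isoformat()

def unserialize_datetime(value):
    # assumed: string from metadata.json -> datetime
    return date_parser.parse(value)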
def proxy_capture(self, link_guid, target_url, base_storage_path, user_agent=''):
    """
    start warcprox process. Warcprox is a MITM proxy server and needs to be running
    before, during and after phantomjs gets a screenshot.

    Create an image from the supplied URL, write it to disk and update our asset model with the path.
    The heavy lifting is done by PhantomJS, our headless browser.

    This whole function runs with the local dir set to a temp dir by run_in_tempdir().
    So we can use local paths for temp files, and they'll just disappear when the function exits.
    """
    # basic setup
    asset_query = get_asset_query(link_guid)
    link_query = get_link_query(link_guid)
    image_name = 'cap.png'
    warc_name = 'archive.warc.gz'
    image_path = os.path.join(base_storage_path, image_name)
    warc_path = os.path.join(base_storage_path, warc_name)

    print "%s: Fetching %s" % (link_guid, target_url)

    # create a request handler class that counts unique requests and responses
    #global unique_requests, unique_responses
    unique_requests = set()
    unique_responses = set()
    count_lock = threading.Lock()

    class CountingRequestHandler(warcprox.WarcProxyHandler):
        def _proxy_request(self):
            #global unique_requests, unique_responses
            with count_lock:
                unique_requests.add(self.url)
            warcprox.WarcProxyHandler._proxy_request(self)
            with count_lock:
                unique_responses.add(self.url)

    # connect warcprox to an open port
    warcprox_port = 27500
    recorded_url_queue = warcprox.queue.Queue()
    fake_cert_authority = warcprox.CertificateAuthority()
    for i in xrange(500):
        try:
            proxy = warcprox.WarcProxy(
                server_address=("127.0.0.1", warcprox_port),
                ca=fake_cert_authority,
                recorded_url_q=recorded_url_queue,
                req_handler_class=CountingRequestHandler
            )
            break
        except socket_error as e:
            if e.errno != errno.EADDRINUSE:
                raise
            warcprox_port += 1
    else:
        raise self.retry(exc=Exception("WarcProx couldn't find an open port."))
    proxy_address = "127.0.0.1:%s" % warcprox_port

    # start warcprox in the background
    warc_writer = warcprox.WarcWriterThread(recorded_url_q=recorded_url_queue, gzip=True, port=warcprox_port)
    warcprox_controller = warcprox.WarcproxController(proxy, warc_writer)
    warcprox_thread = threading.Thread(target=warcprox_controller.run_until_shutdown, name="warcprox", args=())
    warcprox_thread.start()

    print "WarcProx opened."

    # fetch robots.txt in the background
    def robots_txt_thread():
        print "Fetching robots.txt ..."
        parsed_url = urlparse.urlparse(target_url)
        robots_txt_location = parsed_url.scheme + '://' + parsed_url.netloc + '/robots.txt'
        try:
            robots_txt_response = requests.get(robots_txt_location,
                                               headers={'User-Agent': user_agent},
                                               proxies={parsed_url.scheme: 'http://' + proxy_address},
                                               cert=fake_cert_authority.ca_file)
        except (requests.ConnectionError, requests.Timeout):
            print "Couldn't reach robots.txt"
            return

        if not robots_txt_response.ok:
            print "No robots.txt found"

        # We only want to respect robots.txt if Perma is specifically asked not to archive (we're not a crawler)
        if 'Perma' in robots_txt_response.content:
            # We found Perma specifically mentioned
            rp = robotparser.RobotFileParser()
            rp.parse([line.strip() for line in robots_txt_response.content.split('\n')])
            if not rp.can_fetch('Perma', target_url):
                link_query.update(dark_archived_robots_txt_blocked=True)

        print "Robots.txt fetched."

    robots_txt_thread = threading.Thread(target=robots_txt_thread, name="robots")
    robots_txt_thread.start()

    # fetch page in the background
    # (we'll give
    print "Fetching url."
    browser = get_browser(user_agent, proxy_address, fake_cert_authority.ca_file)
    browser.set_window_size(1024, 800)

    page_load_thread = threading.Thread(target=browser.get, args=(target_url,))  # returns after onload
    page_load_thread.start()
    page_load_thread.join(PAGE_LOAD_TIMEOUT)
    if page_load_thread.is_alive():
        print "Waited 60 seconds for onLoad event -- giving up."
        if not unique_responses:
            # if nothing at all has loaded yet, give up on the capture
            asset_query.update(warc_capture='failed', image_capture='failed')
            browser.quit()  # shut down phantomjs
            robots_txt_thread.join()  # wait until robots thread is done
            warcprox_controller.stop.set()  # send signal to shut down warc thread
            warcprox_thread.join()
            return
    print "Finished fetching url."

    # get page title
    print "Getting title."
    if browser.title:
        link_query.update(submitted_title=browser.title)

    # check meta tags
    # (run this in a thread and give it long enough to find the tags, but then let other stuff proceed)
    print "Checking meta tags."

    def meta_thread():
        # get all meta tags
        meta_tags = browser.find_elements_by_tag_name('meta')
        # first look for <meta name='perma'>
        meta_tag = next((tag for tag in meta_tags if tag.get_attribute('name').lower() == 'perma'), None)
        # else look for <meta name='robots'>
        if not meta_tag:
            meta_tag = next((tag for tag in meta_tags if tag.get_attribute('name').lower() == 'robots'), None)
        # if we found a relevant meta tag, check for noarchive
        if meta_tag and 'noarchive' in meta_tag.get_attribute("content").lower():
            link_query.update(dark_archived_robots_txt_blocked=True)
            print "Meta found, darchiving"
        else:
            print "Meta not found."

    meta_thread = threading.Thread(target=meta_thread)
    meta_thread.start()
    meta_thread.join(ELEMENT_DISCOVERY_TIMEOUT * 2)

    # save preliminary screenshot immediately, and an updated version later
    # (we want to return results quickly, but also give javascript time to render final results)
    print "Saving first screenshot."
    save_screenshot(browser, image_path)
    asset_query.update(image_capture=image_name)

    # make sure all requests are finished
    print "Waiting for post-load requests."
    start_time = time.time()
    time.sleep(min(AFTER_LOAD_TIMEOUT, 5))
    while len(unique_responses) < len(unique_requests):
        print "%s/%s finished" % (len(unique_responses), len(unique_requests))
        if time.time() - start_time > AFTER_LOAD_TIMEOUT:
            print "Waited %s seconds to finish post-load requests -- giving up." % AFTER_LOAD_TIMEOUT
            break
        time.sleep(.5)

    # take second screenshot after all requests done
    print "Taking second screenshot."
    save_screenshot(browser, image_path)

    # teardown:
    print "Shutting down browser and proxies."
    browser.quit()  # shut down phantomjs
    robots_txt_thread.join()  # wait until robots thread is done
    meta_thread.join()  # wait until meta thread is done
    warcprox_controller.stop.set()  # send signal to shut down warc thread
    warcprox_thread.join()  # wait until warcprox thread is done writing out warc

    print "Saving WARC."

    # save generated warc file
    try:
        temp_warc_path = os.path.join(warc_writer.directory, warc_writer._f_finalname)
        with open(temp_warc_path, 'rb') as warc_file:
            warc_name = default_storage.store_file(warc_file, warc_path)
        asset_query.update(warc_capture=warc_name)
    except Exception as e:
        logger.info("Web Archive File creation failed for %s: %s" % (target_url, e))
        asset_query.update(warc_capture='failed')

    print "%s capture done." % link_guid