def post(self, request, format=None):
    """ Create new link. """
    data = request.data
    capture_job = CaptureJob(
        human=request.data.get('human', False),
        submitted_url=request.data.get('url', ''),
        created_by=request.user,
    )
    if settings.ENABLE_BATCH_LINKS:
        # Batch is set directly on the request object by the LinkBatch api,
        # to prevent abuse of this feature by those POSTing directly to this route.
        if getattr(request, 'batch', None):
            capture_job.link_batch = LinkBatch.objects.get(id=request.batch)
    capture_job.save()

    # Set target folder, in order of preference:
    # - 'folder' key in data
    # - parent folder, if posting to /folders/:parent_id/archives
    # - user's personal folder
    try:
        folder = self.get_folder_from_request(request) or request.parent or request.user.root_folder
    except ValidationError as e:
        raise_invalid_capture_job(capture_job, e.detail)

    # Make sure a limited user has links left to create
    if not folder.organization:
        if not request.user.link_creation_allowed():
            if request.user.nonpaying:
                raise_invalid_capture_job(capture_job, "You've already reached your limit.")
            error = "Perma.cc cannot presently make additional Perma Links on your behalf. Visit your subscription settings page for more information."
            raise_invalid_capture_job(capture_job, error)
    else:
        registrar = folder.organization.registrar
        if not registrar.link_creation_allowed():
            error = 'Perma.cc cannot presently make links on behalf of {}. '.format(registrar.name)
            if request.user.registrar:
                contact = 'Visit your settings for subscription information.'
            else:
                registrar_users = [user.email for user in registrar.active_registrar_users()]
                contact = 'For assistance with your subscription, contact: {}.'.format(", ".join(registrar_users))
            raise_invalid_capture_job(capture_job, error + contact)

    serializer = self.serializer_class(data=data, context={'request': request})
    if serializer.is_valid():
        link = serializer.save(created_by=request.user)

        # put link in folder and handle Org settings based on folder
        if folder.organization and folder.organization.default_to_private:
            link.is_private = True
            link.save()
        link.move_to_folder_for_user(folder, request.user)  # also sets link.organization

        # handle uploaded file
        uploaded_file = request.data.get('file')
        if uploaded_file:
            link.write_uploaded_file(uploaded_file)
            link.warc_size = default_storage.size(link.warc_storage_file())
            link.save()

        # handle submitted url
        else:
            # create primary capture placeholder
            Capture(
                link=link,
                role='primary',
                status='pending',
                record_type='response',
                url=link.submitted_url,
            ).save()

            # create screenshot placeholder
            Capture(
                link=link,
                role='screenshot',
                status='pending',
                record_type='resource',
                url="file:///%s/cap.png" % link.guid,
                content_type='image/png',
            ).save()

            # kick off capture tasks -- no need for guid since it'll work through the queue
            capture_job.status = 'pending'
            capture_job.link = link
            capture_job.save(update_fields=['status', 'link'])
            run_task(run_next_capture.s())

        return Response(serializer.data, status=status.HTTP_201_CREATED)

    raise_invalid_capture_job(capture_job, serializer.errors)
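
# Illustrative sketch, not part of the original module: one plausible shape for the
# raise_invalid_capture_job() helper used above. The assumption (hedged) is that it
# records the failure on the CaptureJob and then aborts the request with DRF's
# ValidationError; the 'invalid' status and 'message' field names are guesses.
from rest_framework.exceptions import ValidationError


def raise_invalid_capture_job(capture_job, error):
    """Mark the CaptureJob as invalid, store the reason, and return a 400 to the client."""
    capture_job.status = 'invalid'                          # assumed status value
    capture_job.message = error                             # assumed field for the reason shown to the user
    capture_job.save(update_fields=['status', 'message'])
    raise ValidationError(error)
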
def obj_create(self, bundle, **kwargs):
    # We've received a request to archive a URL. That process is managed here.
    # We create a new entry in our datastore and pass the work off to our indexing
    # workers. They do their thing, updating the model as they go. When we get some minimum
    # set of results we can present the user (a guid for the link), we respond back.
    if settings.READ_ONLY_MODE:
        raise ImmediateHttpResponse(response=self.error_response(bundle.request, {
            'archives': {'__all__': "Perma has paused archive creation for scheduled maintenance. Please try again shortly."},
            'reason': "Perma has paused archive creation for scheduled maintenance. Please try again shortly.",
        }))

    # Runs validation (exception thrown if invalid), sets properties and saves the object
    if not bundle.data.get('replace'):
        bundle = super(LinkResource, self).obj_create(bundle, created_by=bundle.request.user)
    link = bundle.obj
    link.save()

    # put link in folder and handle Org settings based on folder
    folder = bundle.data.get('folder')
    if folder.organization and folder.organization.default_to_private:
        link.is_private = True
        link.save()
    link.move_to_folder_for_user(folder, bundle.request.user)  # also sets link.organization

    uploaded_file = bundle.data.get('file')
    if uploaded_file:
        # normalize file name to upload.jpg, upload.png, upload.gif, or upload.pdf
        mime_type = get_mime_type(uploaded_file.name)
        file_name = 'upload.%s' % mime_type_lookup[mime_type]['new_extension']
        base_warc_url = "file:///%s/%s" % (link.guid, file_name)

        # only append a random number to warc_url if we're replacing a file
        warc_url = base_warc_url if not bundle.data.get('replace') else "%s?version=%s" % (base_warc_url, str(random.random()).replace('.', ''))

        capture = Capture(link=link,
                          role='primary',
                          status='success',
                          record_type='resource',
                          user_upload='True',
                          content_type=mime_type,
                          url=warc_url)
        uploaded_file.file.seek(0)
        capture.write_warc_resource_record(uploaded_file)
        capture.save()
    else:
        # create primary capture placeholder
        Capture(
            link=link,
            role='primary',
            status='pending',
            record_type='response',
            url=link.submitted_url,
        ).save()

        # create screenshot placeholder
        Capture(
            link=link,
            role='screenshot',
            status='pending',
            record_type='resource',
            url="file:///%s/cap.png" % link.guid,
            content_type='image/png',
        ).save()

        # create CaptureJob
        CaptureJob(link=link, human=bundle.data.get('human', False)).save()

        # kick off capture tasks -- no need for guid since it'll work through the queue
        run_task(run_next_capture.s())

    return bundle
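
# Illustrative stand-ins (assumptions, not the project's real helpers) for get_mime_type()
# and mime_type_lookup used by the upload branch above: uploads are normalized to a small
# whitelist of extensions before being written into the WARC as upload.<ext>.
import mimetypes

mime_type_lookup = {
    'image/jpeg':      {'new_extension': 'jpg'},
    'image/png':       {'new_extension': 'png'},
    'image/gif':       {'new_extension': 'gif'},
    'application/pdf': {'new_extension': 'pdf'},
}


def get_mime_type(file_name):
    """Guess a mime type from the uploaded file's name; returns None if unrecognized."""
    return mimetypes.guess_type(file_name)[0]
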
def obj_create(self, bundle, **kwargs):
    # We've received a request to archive a URL. That process is managed here.
    # We create a new entry in our datastore and pass the work off to our indexing
    # workers. They do their thing, updating the model as they go. When we get some minimum
    # set of results we can present the user (a guid for the link), we respond back.
    if settings.READ_ONLY_MODE:
        raise ImmediateHttpResponse(
            response=self.error_response(
                bundle.request,
                {
                    "archives": {
                        "__all__": "Perma has paused archive creation for scheduled maintenance. Please try again shortly."
                    },
                    "reason": "Perma has paused archive creation for scheduled maintenance. Please try again shortly.",
                },
            )
        )

    # Runs validation (exception thrown if invalid), sets properties and saves the object
    bundle = super(LinkResource, self).obj_create(bundle, created_by=bundle.request.user)
    link = bundle.obj

    uploaded_file = bundle.data.get("file")
    if uploaded_file:
        # normalize file name to upload.jpg, upload.png, upload.gif, or upload.pdf
        mime_type = get_mime_type(uploaded_file.name)
        file_name = "upload.%s" % mime_type_lookup[mime_type]["new_extension"]
        warc_url = "file:///%s/%s" % (link.guid, file_name)

        capture = Capture(
            link=link,
            role="primary",
            status="success",
            record_type="resource",
            user_upload="True",
            content_type=mime_type,
            url=warc_url,
        )
        uploaded_file.file.seek(0)
        capture.write_warc_resource_record(uploaded_file)
        capture.save()
    else:
        # create primary capture placeholder
        Capture(link=link, role="primary", status="pending", record_type="response", url=link.submitted_url).save()

        # create screenshot placeholder
        Capture(
            link=link,
            role="screenshot",
            status="pending",
            record_type="resource",
            url="file:///%s/cap.png" % link.guid,
            content_type="image/png",
        ).save()

        # kick off capture task
        run_task(proxy_capture.s(link.guid, bundle.request.META.get("HTTP_USER_AGENT", "")))

    return bundle
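
# Illustrative sketch (assumption): run_task() above is presumably a thin wrapper around a
# Celery signature such as proxy_capture.s(...). Only .apply_async()/.apply() are standard
# Celery API; the RUN_TASKS_ASYNC setting name is hypothetical.
from django.conf import settings


def run_task(task_signature, **kwargs):
    """Dispatch a Celery signature, running it eagerly when async execution is disabled (e.g. in tests)."""
    if getattr(settings, 'RUN_TASKS_ASYNC', True):
        return task_signature.apply_async(**kwargs)
    return task_signature.apply(**kwargs)
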
def obj_create(self, bundle, **kwargs):
    # We've received a request to archive a URL. That process is managed here.
    # We create a new entry in our datastore and pass the work off to our indexing
    # workers. They do their thing, updating the model as they go. When we get some minimum
    # set of results we can present the user (a guid for the link), we respond back.
    if settings.READ_ONLY_MODE:
        raise ImmediateHttpResponse(response=self.error_response(bundle.request, {
            'archives': {'__all__': "Perma has paused archive creation for scheduled maintenance. Please try again shortly."},
            'reason': "Perma has paused archive creation for scheduled maintenance. Please try again shortly.",
        }))

    # Runs validation (exception thrown if invalid), sets properties and saves the object
    if not bundle.data.get('replace'):
        bundle = super(LinkResource, self).obj_create(bundle, created_by=bundle.request.user)
    link = bundle.obj
    link.save()

    # put link in folder and handle Org settings based on folder
    folder = bundle.data.get('folder')
    if folder.organization and folder.organization.default_to_private:
        link.is_private = True
        link.save()
    link.move_to_folder_for_user(folder, bundle.request.user)  # also sets link.organization

    uploaded_file = bundle.data.get('file')
    if uploaded_file:
        # normalize file name to upload.jpg, upload.png, upload.gif, or upload.pdf
        mime_type = get_mime_type(uploaded_file.name)
        file_name = 'upload.%s' % mime_type_lookup[mime_type]['new_extension']
        base_warc_url = "file:///%s/%s" % (link.guid, file_name)

        # only append a random number to warc_url if we're replacing a file
        warc_url = base_warc_url if not bundle.data.get('replace') else "%s?version=%s" % (base_warc_url, str(random.random()).replace('.',''))

        capture = Capture(link=link,
                          role='primary',
                          status='success',
                          record_type='resource',
                          user_upload='True',
                          content_type=mime_type,
                          url=warc_url)
        uploaded_file.file.seek(0)
        capture.write_warc_resource_record(uploaded_file)
        capture.save()
    else:
        # create primary capture placeholder
        Capture(
            link=link,
            role='primary',
            status='pending',
            record_type='response',
            url=link.submitted_url,
        ).save()

        # create screenshot placeholder
        Capture(
            link=link,
            role='screenshot',
            status='pending',
            record_type='resource',
            url="file:///%s/cap.png" % link.guid,
            content_type='image/png',
        ).save()

        # create CaptureJob
        CaptureJob(link=link, human=bundle.data.get('human', False)).save()

        # kick off capture tasks -- no need for guid since it'll work through the queue
        run_task(run_next_capture.s())

    return bundle
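
# Worked example of the replacement-URL convention used above: when a file replaces an
# earlier upload, a random version string is appended so the new resource record gets a
# URL distinct from the original. (The helper name is illustrative; the logic mirrors the code.)
import random


def versioned_warc_url(guid, file_name, replace=False):
    base_warc_url = "file:///%s/%s" % (guid, file_name)
    if not replace:
        return base_warc_url
    return "%s?version=%s" % (base_warc_url, str(random.random()).replace('.', ''))

# versioned_warc_url('ABCD-1234', 'upload.pdf')                -> 'file:///ABCD-1234/upload.pdf'
# versioned_warc_url('ABCD-1234', 'upload.pdf', replace=True)  -> 'file:///ABCD-1234/upload.pdf?version=0351...'
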
def post(self, request, format=None):
    """ Create new link. """
    data = request.data
    capture_job = CaptureJob(
        human=request.data.get('human', False),
        submitted_url=request.data.get('url', ''),
        created_by=request.user
    )
    if settings.ENABLE_BATCH_LINKS:
        # Batch is set directly on the request object by the LinkBatch api,
        # to prevent abuse of this feature by those POSTing directly to this route.
        if getattr(request, 'batch', None):
            capture_job.link_batch = LinkBatch.objects.get(id=request.batch)
    capture_job.save()

    # Set target folder, in order of preference:
    # - 'folder' key in data
    # - parent folder, if posting to /folders/:parent_id/archives
    # - user's personal folder
    try:
        folder = self.get_folder_from_request(request) or request.parent or request.user.root_folder
    except ValidationError as e:
        raise_invalid_capture_job(capture_job, e.detail)

    # Disallow creation of links in top-level sponsored folder
    if folder.is_sponsored_root_folder:
        error = "You can't make links directly in your Sponsored Links folder. Select a folder belonging to a sponsor."
        raise_invalid_capture_job(capture_job, error)

    # Make sure a limited user has links left to create
    if not folder.organization and not folder.sponsored_by:
        if not request.user.link_creation_allowed():
            if request.user.nonpaying:
                raise_invalid_capture_job(capture_job, "You've already reached your limit.")
            error = "Perma.cc cannot presently make additional Perma Links on your behalf. Visit your subscription settings page for more information."
            raise_invalid_capture_job(capture_job, error)
    else:
        registrar = folder.sponsored_by if folder.sponsored_by else folder.organization.registrar
        msg = None
        if folder.read_only:
            registrar_users = [user.email for user in registrar.active_registrar_users()]
            msg = f"Your registrar has made this folder read-only. For assistance, contact: {', '.join(registrar_users)}."
        if not registrar.link_creation_allowed():
            error = 'Perma.cc cannot presently make links on behalf of {}. '.format(registrar.name)
            if request.user.registrar:
                contact = 'Visit your settings for subscription information.'
            else:
                registrar_users = [user.email for user in registrar.active_registrar_users()]
                contact = 'For assistance with your subscription, contact: {}.'.format(", ".join(registrar_users))
            msg = error + contact
        if msg:
            raise_invalid_capture_job(capture_job, msg)

    serializer = self.serializer_class(data=data, context={'request': request})
    if serializer.is_valid():

        with transaction.atomic():
            # Technique from https://github.com/harvard-lil/capstone/blob/0f7fb80f26e753e36e0c7a6a199b8fdccdd318be/capstone/capapi/serializers.py#L121
            #
            # Fetch the current user data here inside a transaction, using select_for_update
            # to lock the row so we don't collide with any simultaneous requests
            user = request.user.__class__.objects.select_for_update().get(pk=request.user.pk)

            # If this is a Personal Link, and if the user only has bonus links left, decrement bonus links
            bonus_link = False
            if not folder.organization and not folder.sponsored_by:
                links_remaining, _, bonus_links = user.get_links_remaining()
                if bonus_links and not links_remaining:
                    # (this works because it's part of the same transaction with the select_for_update --
                    # we don't have to use the same object)
                    request.user.bonus_links = bonus_links - 1
                    request.user.save(update_fields=['bonus_links'])
                    bonus_link = True

            link = serializer.save(created_by=request.user, bonus_link=bonus_link)

        # put link in folder and handle Org settings based on folder
        if folder.organization and folder.organization.default_to_private:
            link.is_private = True
            link.save()
        link.move_to_folder_for_user(folder, request.user)  # also sets link.organization

        # handle uploaded file
        uploaded_file = request.data.get('file')
        if uploaded_file:
            link.write_uploaded_file(uploaded_file)

        # handle submitted url
        else:
            # create primary capture placeholder
            Capture(
                link=link,
                role='primary',
                status='pending',
                record_type='response',
                url=link.submitted_url,
            ).save()

            # create screenshot placeholder
            Capture(
                link=link,
                role='screenshot',
                status='pending',
                record_type='resource',
                url="file:///%s/cap.png" % link.guid,
                content_type='image/png',
            ).save()

            # kick off capture tasks -- no need for guid since it'll work through the queue
            capture_job.status = 'pending'
            capture_job.link = link
            capture_job.save(update_fields=['status', 'link'])
            run_task(run_next_capture.s())

        return Response(serializer.data, status=status.HTTP_201_CREATED)

    raise_invalid_capture_job(capture_job, serializer.errors)
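
# Minimal sketch of the row-locking pattern used above (standard Django API; the function
# and model names here are generic, not Perma's). select_for_update() inside
# transaction.atomic() serializes concurrent POSTs on the user row, so two simultaneous
# requests cannot both spend the same bonus link.
from django.db import transaction


def spend_bonus_link(user_model, user_id):
    """Atomically decrement a user's bonus_links; return True if one was spent."""
    with transaction.atomic():
        user = user_model.objects.select_for_update().get(pk=user_id)
        if user.bonus_links:
            user.bonus_links -= 1
            user.save(update_fields=['bonus_links'])
            return True
        return False
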
def post(self, request, format=None):
    """ Create new link. """
    data = request.data

    # Set target folder, in order of preference:
    # - 'folder' key in data
    # - parent folder, if posting to /folders/:parent_id/archives
    # - user's personal folder
    folder = self.get_folder_from_request(request) or request.parent or request.user.root_folder

    # Make sure a limited user has links left to create
    if not folder.organization:
        links_remaining = request.user.get_links_remaining()
        if links_remaining < 1:
            raise_validation_error("You've already reached your limit.")
    else:
        registrar = folder.organization.registrar
        if not registrar.link_creation_allowed():
            error = 'Perma.cc cannot presently make links on behalf of {}. '.format(registrar.name)
            if request.user.registrar:
                contact = 'Visit your settings for subscription information.'
            else:
                registrar_users = [user.email for user in registrar.active_registrar_users()]
                contact = 'For assistance with your subscription, contact: {}.'.format(", ".join(registrar_users))
            raise_validation_error(error + contact)

    serializer = self.serializer_class(data=data, context={'request': request})
    if serializer.is_valid():
        link = serializer.save(created_by=request.user)

        # put link in folder and handle Org settings based on folder
        if folder.organization and folder.organization.default_to_private:
            link.is_private = True
            link.save()
        link.move_to_folder_for_user(folder, request.user)  # also sets link.organization

        # handle uploaded file
        uploaded_file = request.data.get('file')
        if uploaded_file:
            link.write_uploaded_file(uploaded_file)

        # handle submitted url
        else:
            # create primary capture placeholder
            Capture(
                link=link,
                role='primary',
                status='pending',
                record_type='response',
                url=link.submitted_url,
            ).save()

            # create screenshot placeholder
            Capture(
                link=link,
                role='screenshot',
                status='pending',
                record_type='resource',
                url="file:///%s/cap.png" % link.guid,
                content_type='image/png',
            ).save()

            # create CaptureJob
            CaptureJob(link=link, human=request.data.get('human', False)).save()

            # kick off capture tasks -- no need for guid since it'll work through the queue
            run_task(run_next_capture.s())

        return Response(serializer.data, status=status.HTTP_201_CREATED)

    return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
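
# Client-side usage sketch for this endpoint (illustrative only: the request fields come
# from the view above, but the host, path prefix, and api_key auth are assumptions about
# deployment, not taken from this code). A successful POST returns 201 with the serialized
# link; validation failures return 400 with the serializer errors.
import requests

API_BASE = 'https://api.perma.cc/v1'    # assumed deployment URL
API_KEY = 'your-api-key-here'           # placeholder

response = requests.post(
    '%s/archives/' % API_BASE,
    params={'api_key': API_KEY},
    json={'url': 'https://example.com/', 'human': True},
)
print(response.status_code)
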
def obj_create(self, bundle, **kwargs):
    # We've received a request to archive a URL. That process is managed here.
    # We create a new entry in our datastore and pass the work off to our indexing
    # workers. They do their thing, updating the model as they go. When we get some minimum
    # set of results we can present the user (a guid for the link), we respond back.
    if settings.READ_ONLY_MODE:
        raise ImmediateHttpResponse(response=self.error_response(bundle.request, {
            'archives': {'__all__': "Perma has paused archive creation for scheduled maintenance. Please try again shortly."},
            'reason': "Perma has paused archive creation for scheduled maintenance. Please try again shortly.",
        }))

    # Make sure a limited user has links left to create
    links_remaining = bundle.request.user.get_links_remaining()
    if (bundle.request.user.has_limit() or not bundle.data.get('organization')) and links_remaining < 1:
        raise ImmediateHttpResponse(response=self.error_response(bundle.request, {
            'archives': {'__all__': "You've already reached your limit."},
            'reason': "You've already reached your limit.",
        }))

    # Return the number of links remaining after this one is created
    if bundle.request.user.has_limit() or not bundle.data.get('organization'):
        bundle.data['links_remaining'] = links_remaining - 1
    else:
        bundle.data['links_remaining'] = 'unlimited'

    # Runs validation (exception thrown if invalid), sets properties and saves the object
    bundle = super(LinkResource, self).obj_create(bundle, created_by=bundle.request.user)
    link = bundle.obj

    # put link in folder and handle Org settings based on folder
    folder = bundle.data.get('folder')
    if folder:
        if folder.organization and folder.organization.default_to_private:
            link.is_private = True
            link.save()
        link.move_to_folder_for_user(folder, bundle.request.user)  # also sets link.organization

    uploaded_file = bundle.data.get('file')
    if uploaded_file:
        # normalize file name to upload.jpg, upload.png, upload.gif, or upload.pdf
        mime_type = get_mime_type(uploaded_file.name)
        file_name = 'upload.%s' % mime_type_lookup[mime_type]['new_extension']
        warc_url = "file:///%s/%s" % (link.guid, file_name)

        capture = Capture(link=link,
                          role='primary',
                          status='success',
                          record_type='resource',
                          user_upload='True',
                          content_type=mime_type,
                          url=warc_url)
        uploaded_file.file.seek(0)
        capture.write_warc_resource_record(uploaded_file)
        capture.save()
    else:
        # create primary capture placeholder
        Capture(
            link=link,
            role='primary',
            status='pending',
            record_type='response',
            url=link.submitted_url,
        ).save()

        # create screenshot placeholder
        Capture(
            link=link,
            role='screenshot',
            status='pending',
            record_type='resource',
            url="file:///%s/cap.png" % link.guid,
            content_type='image/png',
        ).save()

        # kick off capture task
        run_task(proxy_capture.s(link.guid, bundle.request.META.get('HTTP_USER_AGENT', '')))

    return bundle
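
# Illustrative refactoring sketch (not in the original): the read-only and limit checks
# above repeat the same ImmediateHttpResponse(self.error_response(...)) pattern. A Tastypie
# resource could factor it out like this; ImmediateHttpResponse and error_response() are
# real Tastypie API, while the helper itself is hypothetical.
from tastypie.exceptions import ImmediateHttpResponse


def bail_with_error(resource, request, message):
    """Abort the current Tastypie request with the project's standard JSON error body."""
    raise ImmediateHttpResponse(response=resource.error_response(request, {
        'archives': {'__all__': message},
        'reason': message,
    }))
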
def proxy_capture(capture_job):
    """
    Start warcprox process. Warcprox is a MITM proxy server and needs to be running
    before, during and after the headless browser.

    Start a headless browser to capture the supplied URL. Also take a screenshot if the URL is an HTML file.

    This whole function runs with the local dir set to a temp dir by run_in_tempdir().
    So we can use local paths for temp files, and they'll just disappear when the function exits.
    """
    # basic setup
    link = capture_job.link
    target_url = link.safe_url

    if link.user_deleted or link.primary_capture.status != "pending":
        capture_job.mark_completed('deleted')
        return

    capture_job.attempt += 1
    capture_job.save()

    # Helper function to update capture_job's progress
    def display_progress(step_count, step_description):
        save_fields(capture_job, step_count=step_count, step_description=step_description)
        print "%s step %s: %s" % (link.guid, step_count, step_description)

    print "%s: Fetching %s" % (link.guid, target_url)
    progress = 0
    display_progress(progress, "Starting capture")

    # suppress verbose warcprox logs
    logging.disable(logging.INFO)

    # Set up an exception we can trigger to halt capture and release all the resources involved.
    class HaltCaptureException(Exception):
        pass

    browser = warcprox_controller = warcprox_thread = display = have_html = None
    have_warc = False
    thread_list = []
    successful_favicon_urls = []

    try:

        # create a request handler class that counts requests and responses
        proxied_requests = []
        proxied_responses = []
        proxied_pairs = []
        count_lock = threading.Lock()

        class CountingRequestHandler(WarcProxyHandler):
            def _proxy_request(self):
                # make sure we don't capture anything in a banned IP range
                if not url_in_allowed_ip_range(self.url):
                    return

                with count_lock:
                    proxied_pair = [self.url, None]
                    proxied_requests.append(proxied_pair[0])
                    proxied_pairs.append(proxied_pair)
                response = WarcProxyHandler._proxy_request(self)
                with count_lock:
                    proxied_responses.append(response)
                    proxied_pair[1] = response

        # connect warcprox to an open port
        warcprox_port = 27500
        recorded_url_queue = queue.Queue()
        for i in xrange(500):
            try:
                proxy = WarcProxy(
                    server_address=("127.0.0.1", warcprox_port),
                    recorded_url_q=recorded_url_queue,
                    req_handler_class=CountingRequestHandler
                )
                break
            except socket_error as e:
                if e.errno != errno.EADDRINUSE:
                    raise
                warcprox_port += 1
        else:
            raise Exception("WarcProx couldn't find an open port.")
        proxy_address = "127.0.0.1:%s" % warcprox_port

        # start warcprox in the background
        warc_writer = WarcWriter(gzip=True, port=warcprox_port)
        warc_writer_thread = WarcWriterThread(recorded_url_q=recorded_url_queue, warc_writer=warc_writer)
        warcprox_controller = WarcproxController(proxy, warc_writer_thread)
        warcprox_thread = threading.Thread(target=warcprox_controller.run_until_shutdown, name="warcprox", args=())
        warcprox_thread.start()

        print "WarcProx opened."

        # Helper function to get a url, via proxied requests.get(), in a way that is interruptable from other threads.
        # This should only be run from sub-threads.
        def get_url(url):
            request_thread = add_thread(thread_list, ProxiedRequestThread(proxy_address, url))
            request_thread.join()
            return request_thread.response, request_thread.response_exception

        # start virtual display
        if settings.CAPTURE_BROWSER != "PhantomJS":
            display = Display(visible=0, size=(1024, 800))
            display.start()

        # fetch page in the background
        progress += 1
        display_progress(progress, "Fetching target URL")
        browser = get_browser(settings.CAPTURE_USER_AGENT, proxy_address, proxy.ca.ca_file)
        browser.set_window_size(1024, 800)

        start_time = time.time()
        page_load_thread = threading.Thread(target=browser.get, args=(target_url,))  # returns after onload
        page_load_thread.start()
        page_load_thread.join(ONLOAD_EVENT_TIMEOUT)

        # wait until warcprox records a response that isn't a forward
        have_response = False
        while not have_response:
            if proxied_responses:
                for request, response in proxied_pairs:
                    if response is None:
                        # Response hasn't finished yet -- we might get here because subsequent
                        # responses have finished, but we have to go in order to find the correct content_type,
                        # so let's wait for this one.
                        break
                    if response.url.endswith('/favicon.ico') and response.url != target_url:
                        continue
                    if not hasattr(response, 'parsed_response'):
                        response.parsed_response = parse_response(response.response_recorder.headers)
                    if response.parsed_response.is_redirect or response.parsed_response.status_code == 206:  # partial content
                        continue

                    content_url = response.url
                    content_type = response.parsed_response.headers.get('content-type')
                    robots_directives = response.parsed_response.headers.get('x-robots-tag')
                    have_html = content_type and content_type.startswith('text/html')
                    have_response = True
                    break

            if have_response:
                have_warc = True  # at this point we have something that's worth showing to the user
                break

            wait_time = time.time() - start_time
            if wait_time > RESOURCE_LOAD_TIMEOUT:
                raise HaltCaptureException

            progress = int(progress) + wait_time / RESOURCE_LOAD_TIMEOUT
            display_progress(progress, "Fetching target URL")
            time.sleep(1)

        print "Finished fetching url."

        # check for x-robots-tag directives
        progress = int(progress) + 1
        display_progress(progress, "Checking x-robots-tag directives.")
        if robots_directives:
            darchive = False
            for directive in robots_directives.split(";"):
                parsed = directive.lower().split(":")
                # respect tags that target all crawlers (no user-agent specified)
                if len(parsed) == 1:
                    if "noarchive" in parsed:
                        darchive = True
                # look for perma user-agent
                elif len(parsed) == 2:
                    if parsed[0] == "perma" and "noarchive" in parsed[1]:
                        darchive = True
                # if the directive is poorly formed, do our best
                else:
                    if "perma" in directive and "noarchive" in directive:
                        darchive = True

            if darchive:
                save_fields(link, is_private=True, private_reason='policy')
                print "x-robots-tag found, darchiving"

        # get favicon urls
        # Here we fetch everything in the page that's marked as a favicon, for archival purposes.
        # But we only record a favicon as our favicon_capture_url if it passes a mimetype whitelist.
        def favicon_thread():
            favicon_urls = []
            if have_html and browser_still_running(browser):
                favicons = repeat_while_exception(
                    lambda: browser.find_elements_by_css_selector('link[rel="shortcut icon"],link[rel="icon"]'),
                    timeout=10
                )
                for candidate_favicon in favicons:
                    if candidate_favicon.get_attribute('href'):
                        candidate_favicon_url = urlparse.urljoin(content_url, candidate_favicon.get_attribute('href'))
                        favicon_urls.append(candidate_favicon_url)
            favicon_urls.append(urlparse.urljoin(content_url, '/favicon.ico'))
            if not favicon_urls:
                return

            for favicon_url in favicon_urls:
                print "Fetching favicon from %s ..." % favicon_url
                favicon_response, e = get_url(favicon_url)
                if e or not favicon_response or not favicon_response.ok:
                    print "Favicon failed:", e, favicon_response
                    continue

                # apply mime type whitelist
                mime_type = favicon_response.headers.get('content-type', '').split(';')[0]
                if mime_type not in VALID_FAVICON_MIME_TYPES:
                    continue

                successful_favicon_urls.append((favicon_url, mime_type))

            if not successful_favicon_urls:
                print "Couldn't get favicon"

        add_thread(thread_list, favicon_thread)

        # fetch robots.txt in the background
        def robots_txt_thread():
            print "Fetching robots.txt ..."
            robots_txt_location = urlparse.urljoin(content_url, '/robots.txt')
            robots_txt_response, e = get_url(robots_txt_location)
            if e or not robots_txt_response or not robots_txt_response.ok:
                print "Couldn't reach robots.txt"
                return

            # We only want to respect robots.txt if Perma is specifically asked not to archive (we're not a crawler)
            if 'Perma' in robots_txt_response.content:
                # We found Perma specifically mentioned
                rp = robotparser.RobotFileParser()
                rp.parse([line.strip() for line in robots_txt_response.content.split('\n')])
                if not rp.can_fetch('Perma', target_url):
                    save_fields(link, is_private=True, private_reason='policy')

            print "Robots.txt fetched."

        add_thread(thread_list, robots_txt_thread)

        if have_html:
            # check meta tags
            print "Checking meta tags."

            def meta_analysis_failed():
                if settings.PRIVATE_LINKS_ON_FAILURE:
                    save_fields(link, is_private=True, private_reason='failure')
                link.tags.add('meta-tag-retrieval-failure')
                print "Meta tag retrieval failure."

            if browser_still_running(browser):

                def meta_thread():
                    def js_get_tags():
                        return browser.execute_script("""
                            var meta_tags = document.getElementsByTagName('meta');
                            var tags = [];
                            for (var i = 0; i < meta_tags.length; i++){
                                tags.push({"name":meta_tags[i].name, "content":meta_tags[i].content});
                            }
                            return tags
                        """)

                    # get all meta tags
                    meta_tags = repeat_while_exception(
                        lambda: browser.find_elements_by_tag_name('meta'),
                        timeout=30
                    )

                    # if that retrieves even one meta tag, we need to succeed at parsing
                    # them before we can confidently make a link public
                    if meta_tags:
                        meta_list = None
                        try:
                            # assemble required attributes for processing.
                            # this sometimes fails because javascript alters the DOM sufficiently
                            # that the elements found above are no longer available by the time
                            # we wish to iterate through them. in that case, a StaleElementReferenceException
                            # is thrown.
                            # http://www.seleniumhq.org/exceptions/stale_element_reference.jsp
                            # but, since this is important to get right, catch all exceptions,
                            # not just StaleElementReferenceException. If an exception is thrown, we want
                            # to try the javascript method regardless.
                            meta_list = [{"name": tag.get_attribute('name'), "content": tag.get_attribute("content")} for tag in meta_tags]
                        except:
                            meta_list = repeat_until_truthy(js_get_tags, sleep_time=1)

                        if not meta_list:
                            # if still no meta tags at all, then this process has failed.
                            meta_analysis_failed()

                        # first look for <meta name='perma'>
                        meta_tag = next((tag for tag in meta_list if tag['name'].lower() == 'perma'), None)
                        # else look for <meta name='robots'>
                        if not meta_tag:
                            meta_tag = next((tag for tag in meta_list if tag['name'].lower() == 'robots'), None)
                        # if we found a relevant meta tag, check for noarchive
                        if meta_tag and 'noarchive' in meta_tag["content"].lower():
                            save_fields(link, is_private=True, private_reason='policy')
                            print "Meta found, darchiving"

                        description_meta_tag = next((tag for tag in meta_list if tag['name'].lower() == 'description'), '')
                        if description_meta_tag and description_meta_tag['content']:
                            save_fields(link, submitted_description=description_meta_tag['content'])

                add_thread(thread_list, meta_thread)
            else:
                meta_analysis_failed()

        # Skip all these things if the browser has died.
        if browser_still_running(browser):

            # get page title
            progress = int(progress) + 1
            display_progress(progress, "Getting page title")

            def get_title():
                if browser.title:
                    save_fields(link, submitted_title=browser.title)
                else:
                    title_element = browser.find_element_by_tag_name("title")
                    save_fields(link, submitted_title=title_element.get_attribute("text"))

            repeat_while_exception(get_title, timeout=10, raise_after_timeout=False)

            # scroll to bottom of page, in case that prompts anything else to load
            # TODO: This doesn't scroll horizontally or scroll frames
            progress += .5
            display_progress(progress, "Checking for scroll-loaded assets")

            def scroll_browser():
                try:
                    scroll_delay = browser.execute_script("""
                        // Scroll down the page in a series of jumps the size of the window height.
                        // The actual scrolling is done in a setTimeout with a 50ms delay so the browser has
                        // time to render at each position.
                        var delay=50,
                            height=document.body.scrollHeight,
                            jump=window.innerHeight,
                            scrollTo=function(scrollY){ window.scrollTo(0, scrollY) },
                            i=1;
                        for(;i*jump<height;i++){
                            setTimeout(scrollTo, i*delay, i*jump);
                        }

                        // Scroll back to top before taking screenshot.
                        setTimeout(scrollTo, i*delay, 0);

                        // Return how long all this scrolling will take.
                        return (i*delay)/1000;
                    """)

                    # In python, wait for javascript background scrolling to finish.
                    time.sleep(min(scroll_delay, 1))
                except (WebDriverException, URLError):
                    # Don't panic if we can't scroll -- we've already captured something useful anyway.
                    # WebDriverException: the page can't execute JS for some reason.
                    # URLError: the headless browser has gone away for some reason.
                    pass

            repeat_while_exception(scroll_browser)

            # load media
            progress = int(progress) + 1
            display_progress(progress, "Fetching media")
            with warn_on_exception("Error fetching media"):

                # running in each frame ...
                def get_media_tags(browser):
                    url_set = []
                    base_url = browser.current_url

                    def make_absolute_urls(urls):
                        '''collect resource urls, converted to absolute urls relative to current browser frame'''
                        return [urlparse.urljoin(base_url, url) for url in urls if url]

                    # get all images in srcsets
                    print("Fetching images in srcsets")
                    for img in browser.find_elements_by_css_selector('img[srcset], source[srcset]'):
                        urls = [src.strip().split(' ')[0] for src in img.get_attribute('srcset').split(',')]
                        url_set.extend(make_absolute_urls(urls))

                    # fetch each audio/video/object/embed element
                    if settings.ENABLE_AV_CAPTURE:
                        print("Fetching audio/video objects")
                        media_tags = sum((browser.find_elements_by_tag_name(tag_name) for tag_name in ('video', 'audio', 'object', 'embed')), [])
                        for tag in media_tags:
                            # for each tag, extract all resource urls
                            if tag.tag_name == 'object':
                                # for <object>, get the data and archive attributes, prepended with codebase attribute if it exists,
                                # as well as any <param name="movie" value="url"> elements
                                codebase_url = tag.get_attribute('codebase') or base_url
                                urls = [
                                    urlparse.urljoin(codebase_url, url)
                                    for url in [tag.get_attribute('data')] + (tag.get_attribute('archive') or '').split()
                                ] + [
                                    param.get_attribute('value')
                                    for param in tag.find_elements_by_css_selector('param[name="movie"]')
                                ]
                            else:
                                # for <audio>, <video>, and <embed>, get src attribute and any <source src="url"> elements
                                urls = [tag.get_attribute('src')] + [
                                    source.get_attribute('src')
                                    for source in tag.find_elements_by_tag_name('source')
                                ]
                            url_set.extend(make_absolute_urls(urls))

                    return url_set

                media_urls = run_in_frames(browser, get_media_tags)

                # grab all media urls that aren't already being grabbed
                for media_url in set(media_urls) - set(proxied_requests):
                    add_thread(thread_list, ProxiedRequestThread(proxy_address, media_url))

        # Wait AFTER_LOAD_TIMEOUT seconds for any requests to finish that are started within the next .5 seconds.
        progress = int(progress) + 1
        display_progress(progress, "Waiting for post-load requests")
        time.sleep(.5)
        unfinished_proxied_pairs = [pair for pair in proxied_pairs if not pair[1]]
        start_time = time.time()
        while unfinished_proxied_pairs and browser_still_running(browser):

            print "Waiting for %s pending requests" % len(unfinished_proxied_pairs)

            # give up after AFTER_LOAD_TIMEOUT seconds
            wait_time = time.time() - start_time
            if wait_time > AFTER_LOAD_TIMEOUT:
                print "Waited %s seconds to finish post-load requests -- giving up." % AFTER_LOAD_TIMEOUT
                break

            # Give up if downloaded size exceeds MAX_ARCHIVE_FILE_SIZE.
            # Amount captured so far is the sum of the bytes recorded by warcprox,
            # and the bytes pending in our background threads.
            capture_size = sum(getattr(thread, 'pending_data', 0) for thread in thread_list) + \
                           sum(proxied_response.response_recorder.len for proxied_response in proxied_responses)
            if capture_size > settings.MAX_ARCHIVE_FILE_SIZE:
                print "Halting for size"
                break

            # Show progress to user
            progress = int(progress) + wait_time / AFTER_LOAD_TIMEOUT
            display_progress(progress, "Waiting for post-load requests")

            # Sleep and update our list
            time.sleep(.5)
            unfinished_proxied_pairs = [pair for pair in unfinished_proxied_pairs if not pair[1]]

        # screenshot capture
        if have_html and browser_still_running(browser):
            # get page size to decide whether to take a screenshot
            capture_screenshot = False
            pixel_count = 0
            try:
                root_element = browser.find_element_by_tag_name('body')
            except (NoSuchElementException, URLError):
                try:
                    root_element = browser.find_element_by_tag_name('frameset')
                except (NoSuchElementException, URLError):
                    # If we can't find the root element, we just won't capture a screenshot.
                    # NoSuchElementException: HTML structure is weird somehow.
                    # URLError: the headless browser has gone away for some reason.
                    root_element = None
            if root_element:
                page_size = root_element.size
                pixel_count = page_size['width'] * page_size['height']
                capture_screenshot = pixel_count < settings.MAX_IMAGE_SIZE

            # take screenshot after all requests done
            if capture_screenshot:
                progress += 1
                display_progress(progress, "Taking screenshot")
                screenshot_data = browser.get_screenshot_as_png()
                link.screenshot_capture.write_warc_resource_record(screenshot_data)
                save_fields(link.screenshot_capture, status='success')
            else:
                print "Not saving screenshots! %s" % ("Page size is %s pixels." % pixel_count if pixel_count else "")
                save_fields(link.screenshot_capture, status='failed')
        else:
            # no screenshot if not HTML
            save_fields(link.screenshot_capture, status='failed')

    except HaltCaptureException:
        pass

    finally:
        # teardown (have to do this before save to make sure WARC is done writing):
        print "Shutting down browser and proxies."

        for thread in thread_list:
            # wait until threads are done (have to do this before closing phantomjs)
            if hasattr(thread, 'stop'):
                thread.stop.set()
            thread.join()
        if browser:
            browser.quit()  # shut down phantomjs
        if display:
            display.stop()  # shut down virtual display
        if warcprox_controller:
            warcprox_controller.stop.set()  # send signal to shut down warc thread
        if warcprox_thread:
            warcprox_thread.join()  # wait until warcprox thread is done writing out warc

        # un-suppress logging
        logging.disable(logging.NOTSET)

    # save generated warc file
    if have_warc:
        progress += 1
        display_progress(progress, "Saving web archive file")

        temp_warc_path = os.path.join(warc_writer.directory, warc_writer._f_finalname)
        with open(temp_warc_path, 'rb') as warc_file:
            link.write_warc_raw_data(warc_file)

        print "Writing CDX lines to the DB"
        CDXLine.objects.create_all_from_link(link)

        save_fields(
            link.primary_capture,
            status='success',
            content_type=content_type,
        )
        save_fields(link, warc_size=default_storage.size(link.warc_storage_file()))
        capture_job.mark_completed()

        # We only save the Capture for the favicon once the warc is stored,
        # since the data for the favicon lives in the warc.
        if successful_favicon_urls:
            Capture(link=link,
                    role='favicon',
                    status='success',
                    record_type='response',
                    url=successful_favicon_urls[0][0],
                    content_type=successful_favicon_urls[0][1]).save()
            print "Saved favicon at %s" % successful_favicon_urls

        print "%s capture succeeded." % link.guid
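
# Illustrative sketches (assumptions, not the original helpers) of two utilities the task
# above leans on: add_thread(), which starts and tracks background work so the teardown in
# the finally block can join it, and repeat_while_exception(), which retries flaky Selenium
# calls until a timeout. Default values here are guesses.
import threading
import time


def add_thread(thread_list, target):
    """Accept a Thread or a plain callable, start it, track it for teardown, and return it."""
    thread = target if isinstance(target, threading.Thread) else threading.Thread(target=target)
    thread.start()
    thread_list.append(thread)
    return thread


def repeat_while_exception(func, timeout=10, sleep_time=0.1, raise_after_timeout=True):
    """Call func() until it stops raising; give up after `timeout` seconds."""
    end_time = time.time() + timeout
    while True:
        try:
            return func()
        except Exception:
            if time.time() > end_time:
                if raise_after_timeout:
                    raise
                return None
            time.sleep(sleep_time)
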
def proxy_capture(self, link_guid, user_agent=''):
    """
    start warcprox process. Warcprox is a MITM proxy server and needs to be running
    before, during and after phantomjs gets a screenshot.

    Create an image from the supplied URL, write it to disk and update our asset model with the path.
    The heavy lifting is done by PhantomJS, our headless browser.

    This whole function runs with the local dir set to a temp dir by run_in_tempdir().
    So we can use local paths for temp files, and they'll just disappear when the function exits.
    """
    # basic setup
    link = Link.objects.get(guid=link_guid)
    target_url = link.safe_url

    # allow pending tasks to be canceled outside celery by updating capture status
    if link.primary_capture.status != "pending":
        return

    # Override user_agent for now, since PhantomJS doesn't like some user agents.
    # This user agent is the Chrome on Linux that's most like PhantomJS 2.1.1.
    user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.37 Safari/537.36"

    print "%s: Fetching %s" % (link_guid, target_url)

    # suppress verbose warcprox logs
    logging.disable(logging.INFO)

    # Set up an exception we can trigger to halt capture and release all the resources involved.
    class HaltCaptureException(Exception):
        pass

    browser = warcprox_controller = warcprox_thread = display = have_html = None
    have_warc = False
    thread_list = []
    successful_favicon_urls = []

    try:

        # create a request handler class that counts requests and responses
        proxied_requests = []
        proxied_responses = []
        count_lock = threading.Lock()

        class CountingRequestHandler(WarcProxyHandler):
            def _proxy_request(self):
                with count_lock:
                    proxied_requests.append(self.url)
                response = WarcProxyHandler._proxy_request(self)
                with count_lock:
                    proxied_responses.append(response)

        # connect warcprox to an open port
        warcprox_port = 27500
        recorded_url_queue = queue.Queue()
        for i in xrange(500):
            try:
                proxy = WarcProxy(
                    server_address=("127.0.0.1", warcprox_port),
                    recorded_url_q=recorded_url_queue,
                    req_handler_class=CountingRequestHandler
                )
                break
            except socket_error as e:
                if e.errno != errno.EADDRINUSE:
                    raise
                warcprox_port += 1
        else:
            raise self.retry(exc=Exception("WarcProx couldn't find an open port."))
        proxy_address = "127.0.0.1:%s" % warcprox_port

        # set up requests getter for one-off requests outside of selenium
        def proxied_get_request(url):
            return requests.get(
                url,
                headers={'User-Agent': user_agent},
                proxies={'http': 'http://' + proxy_address, 'https': 'http://' + proxy_address},
                verify=False
            )

        # start warcprox in the background
        warc_writer = WarcWriter(gzip=True, port=warcprox_port)
        warc_writer_thread = WarcWriterThread(recorded_url_q=recorded_url_queue, warc_writer=warc_writer)
        warcprox_controller = WarcproxController(proxy, warc_writer_thread)
        warcprox_thread = threading.Thread(target=warcprox_controller.run_until_shutdown, name="warcprox", args=())
        warcprox_thread.start()

        print "WarcProx opened."

        # start virtual display
        if settings.CAPTURE_BROWSER != "PhantomJS":
            display = Display(visible=0, size=(1024, 800))
            display.start()

        # fetch page in the background
        print "Fetching url."
        browser = get_browser(user_agent, proxy_address, proxy.ca.ca_file)
        browser.set_window_size(1024, 800)

        start_time = time.time()
        page_load_thread = threading.Thread(target=browser.get, args=(target_url,))  # returns after onload
        page_load_thread.start()
        page_load_thread.join(ONLOAD_EVENT_TIMEOUT)

        # wait until warcprox records a response that isn't a forward
        have_response = False
        while not have_response:
            if proxied_responses:
                for response in proxied_responses:
                    if response.url.endswith('/favicon.ico') and response.url != target_url:
                        continue
                    if not hasattr(response, 'parsed_response'):
                        response.parsed_response = parse_response(response.response_recorder.headers)
                    if response.parsed_response.is_redirect or response.parsed_response.status_code == 206:  # partial content
                        continue

                    content_url = response.url
                    content_type = response.parsed_response.headers.get('content-type')
                    have_html = content_type and content_type.startswith('text/html')
                    have_response = True
                    break

            if time.time() - start_time > RESOURCE_LOAD_TIMEOUT:
                raise HaltCaptureException
            time.sleep(1)

        print "Finished fetching url."

        # get favicon urls
        # Here we fetch everything in the page that's marked as a favicon, for archival purposes.
        # But we only record a favicon as our favicon_capture_url if it passes a mimetype whitelist.
        def favicon_thread():
            favicon_urls = []
            if have_html:
                favicons = repeat_while_exception(
                    lambda: browser.find_elements_by_css_selector('link[rel="shortcut icon"],link[rel="icon"]'),
                    timeout=10
                )
                for candidate_favicon in favicons:
                    if candidate_favicon.get_attribute('href'):
                        candidate_favicon_url = urlparse.urljoin(content_url, candidate_favicon.get_attribute('href'))
                        favicon_urls.append(candidate_favicon_url)
            favicon_urls.append(urlparse.urljoin(content_url, '/favicon.ico'))
            if not favicon_urls:
                return

            for favicon_url in favicon_urls:
                print "Fetching favicon from %s ..." % favicon_url
                try:
                    favicon_response = proxied_get_request(favicon_url)
                    assert favicon_response.ok
                except (requests.ConnectionError, requests.Timeout, AssertionError) as e:
                    print "Failed:", e
                    continue

                # apply mime type whitelist
                mime_type = favicon_response.headers.get('content-type', '').split(';')[0]
                if not mime_type in VALID_FAVICON_MIME_TYPES:
                    continue

                successful_favicon_urls.append((favicon_url, mime_type))

            if not successful_favicon_urls:
                print "Couldn't get favicon"

        add_thread(thread_list, favicon_thread)

        # fetch robots.txt in the background
        def robots_txt_thread():
            print "Fetching robots.txt ..."
            robots_txt_location = urlparse.urljoin(content_url, '/robots.txt')
            try:
                robots_txt_response = proxied_get_request(robots_txt_location)
                assert robots_txt_response.ok
            except (requests.ConnectionError, requests.Timeout, AssertionError):
                print "Couldn't reach robots.txt"
                return

            # We only want to respect robots.txt if Perma is specifically asked not to archive (we're not a crawler)
            if 'Perma' in robots_txt_response.content:
                # We found Perma specifically mentioned
                rp = robotparser.RobotFileParser()
                rp.parse([line.strip() for line in robots_txt_response.content.split('\n')])
                if not rp.can_fetch('Perma', target_url):
                    save_fields(link, is_private=True, private_reason='policy')

            print "Robots.txt fetched."

        add_thread(thread_list, robots_txt_thread)

        if have_html:
            # get page title
            print "Getting title."

            def get_title():
                if browser.title:
                    save_fields(link, submitted_title=browser.title)

            repeat_while_exception(get_title, timeout=10)

            # check meta tags
            print "Checking meta tags."

            def meta_thread():
                # get all meta tags
                meta_tags = repeat_while_exception(
                    lambda: browser.find_elements_by_tag_name('meta'),
                    timeout=10
                )

                # first look for <meta name='perma'>
                meta_tag = next((tag for tag in meta_tags if tag.get_attribute('name').lower() == 'perma'), None)
                # else look for <meta name='robots'>
                if not meta_tag:
                    meta_tag = next((tag for tag in meta_tags if tag.get_attribute('name').lower() == 'robots'), None)
                # if we found a relevant meta tag, check for noarchive
                if meta_tag and 'noarchive' in meta_tag.get_attribute("content").lower():
                    save_fields(link, is_private=True, private_reason='policy')
                    print "Meta found, darchiving"

            add_thread(thread_list, meta_thread)

            # scroll to bottom of page, in case that prompts anything else to load
            # TODO: This doesn't scroll horizontally or scroll frames
            def scroll_browser():
                try:
                    scroll_delay = browser.execute_script("""
                        // Scroll down the page in a series of jumps the size of the window height.
                        // The actual scrolling is done in a setTimeout with a 50ms delay so the browser has
                        // time to render at each position.
                        var delay=50,
                            height=document.body.scrollHeight,
                            jump=window.innerHeight,
                            scrollTo=function(scrollY){ window.scrollTo(0, scrollY) },
                            i=1;
                        for(;i*jump<height;i++){
                            setTimeout(scrollTo, i*delay, i*jump);
                        }

                        // Scroll back to top before taking screenshot.
                        setTimeout(scrollTo, i*delay, 0);

                        // Return how long all this scrolling will take.
                        return (i*delay)/1000;
                    """)

                    # In python, wait for javascript background scrolling to finish.
                    time.sleep(min(scroll_delay, 1))
                except WebDriverException:
                    pass

            repeat_while_exception(scroll_browser)

        # make sure all requests are finished
        print "Waiting for post-load requests."
        start_time = time.time()
        time.sleep(1)
        while True:
            print "%s/%s finished" % (len(proxied_responses), len(proxied_requests))
            response_count = len(proxied_responses)
            if time.time() - start_time > AFTER_LOAD_TIMEOUT:
                print "Waited %s seconds to finish post-load requests -- giving up." % AFTER_LOAD_TIMEOUT
                break
            time.sleep(.5)
            if response_count == len(proxied_requests):
                break

        if have_html:
            # get page size to decide whether to take a screenshot
            capture_screenshot = False
            pixel_count = 0
            try:
                root_element = browser.find_element_by_tag_name('body')
            except NoSuchElementException:
                try:
                    root_element = browser.find_element_by_tag_name('frameset')
                except NoSuchElementException:
                    root_element = None
            if root_element:
                page_size = root_element.size
                pixel_count = page_size['width'] * page_size['height']
                capture_screenshot = pixel_count < settings.MAX_IMAGE_SIZE

            # take screenshot after all requests done
            if capture_screenshot:
                print "Taking screenshot."
                screenshot_data = browser.get_screenshot_as_png()
                link.screenshot_capture.write_warc_resource_record(screenshot_data)
                save_fields(link.screenshot_capture, status='success')
            else:
                print "Not saving screenshots! %s" % ("Page size is %s pixels." % pixel_count if pixel_count else "")
                save_fields(link.screenshot_capture, status='failed')
        else:
            # no screenshot if not HTML
            save_fields(link.screenshot_capture, status='failed')

        have_warc = True

    except HaltCaptureException:
        pass

    except Exception as e:
        traceback.print_exc()
        raise

    finally:
        # teardown (have to do this before save to make sure WARC is done writing):
        print "Shutting down browser and proxies."

        for thread in thread_list:
            thread.join()  # wait until threads are done (have to do this before closing phantomjs)
        if browser:
            browser.quit()  # shut down phantomjs
        if display:
            display.stop()  # shut down virtual display
        if warcprox_controller:
            warcprox_controller.stop.set()  # send signal to shut down warc thread
        if warcprox_thread:
            warcprox_thread.join()  # wait until warcprox thread is done writing out warc

        # un-suppress logging
        logging.disable(logging.NOTSET)

    # save generated warc file
    if have_warc:
        print "Saving WARC."
        try:
            temp_warc_path = os.path.join(warc_writer.directory, warc_writer._f_finalname)
            with open(temp_warc_path, 'rb') as warc_file:
                link.write_warc_raw_data(warc_file)

            save_fields(link.primary_capture, status='success', content_type=content_type)

            # We only save the Capture for the favicon once the warc is stored,
            # since the data for the favicon lives in the warc.
            if successful_favicon_urls:
                Capture(link=link,
                        role='favicon',
                        status='success',
                        record_type='response',
                        url=successful_favicon_urls[0][0],
                        content_type=successful_favicon_urls[0][1]).save()
                print "Saved favicon at %s" % successful_favicon_urls

            print "Writing CDX lines to the DB"
            CDXLine.objects.create_all_from_link(link)

        except Exception as e:
            print "Web Archive File creation failed for %s: %s" % (target_url, e)
            save_fields(link.primary_capture, status='failed')

    print "%s capture done." % link_guid
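
# Illustrative sketch (assumption): save_fields(), used throughout both capture tasks above,
# presumably sets the given attributes on a model instance and persists only those columns,
# so concurrent workers updating the same Link don't clobber each other's fields.
def save_fields(instance, **kwargs):
    """Set the given fields on a Django model instance and save just those fields."""
    for field_name, value in kwargs.items():
        setattr(instance, field_name, value)
    instance.save(update_fields=list(kwargs.keys()))
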