def create_capture_job(user, human=True):
    """
    Create and save a pending CaptureJob for `user`, backed by a fresh Link
    pointing at http://example.com. Returns the saved CaptureJob.
    """
    link = Link(created_by=user, submitted_url="http://example.com")
    link.save()
    job = CaptureJob(
        created_by=user,
        link=link,
        human=human,
        status='pending',
    )
    job.save()
    return job
def run_next_capture():
    """
    Grab and run the next CaptureJob.

    This will keep calling itself until there are no jobs left.
    """
    job = CaptureJob.get_next_job(reserve=True)
    if not job:
        # Queue is empty -- stop re-scheduling ourselves.
        return
    # Run the capture synchronously in this worker, then queue another pass.
    proxy_capture.apply([job.link_id])
    run_task(run_next_capture.s())
def test_hard_timeout(self):
    """
    A reserved job that never completes should be marked 'failed' by
    clean_up_failed_captures once the Celery hard time limit has passed.
    """
    create_capture_job(self.user_one)

    # simulate a failed run_next_capture(): reserve the job but never finish it
    job = CaptureJob.get_next_job(reserve=True)

    # capture_start_time should be set accurately on the server side.
    # BUG FIX: the original computed (capture_start_time - now), which is
    # always negative and therefore always < 60, making the assertion
    # vacuous; subtract in the correct order so recency is actually checked.
    self.assertLess((timezone.now() - job.capture_start_time).total_seconds(), 60)

    # clean_up_failed_captures shouldn't affect job, since timeout hasn't passed
    clean_up_failed_captures()
    job.refresh_from_db()
    self.assertEqual(job.status, "in_progress")

    # once job is sufficiently old, clean_up_failed_captures should mark it as failed
    job.capture_start_time -= timedelta(seconds=settings.CELERY_TASK_TIME_LIMIT + 60)
    job.save()
    clean_up_failed_captures()
    job.refresh_from_db()
    self.assertEqual(job.status, "failed")

    # failed jobs will have a message indicating failure reason
    self.assertEqual(json.loads(job.message)[api_settings.NON_FIELD_ERRORS_KEY][0], "Timed out.")
def test_job_queue_order(self):
    """
    Jobs should be processed round-robin, one per user.
    """
    jobs = [
        create_capture_job(self.user_one),
        create_capture_job(self.user_one),
        create_capture_job(self.user_one),
        create_capture_job(self.user_two),
        create_capture_job(self.user_two, human=False),
        create_capture_job(self.user_one),
        create_capture_job(self.user_one),
        create_capture_job(self.user_one),
        create_capture_job(self.user_two),
    ]
    expected_order = [
        0, 3,        # u1, u2
        1, 8,        # u1, u2
        2, 5, 6, 7,  # remaining u1 jobs
        4,           # robots queue
    ]

    # test CaptureJob.queue_position
    for job_index, job in enumerate(jobs):
        expected_position = expected_order.index(job_index) + 1
        actual_position = job.queue_position()
        self.assertEqual(
            actual_position,
            expected_position,
            "Job %s has queue position %s, should be %s." % (job_index, actual_position, expected_position))

    # test CaptureJob.get_next_job
    expected_next_jobs = [jobs[job_index] for job_index in expected_order]
    next_jobs = [CaptureJob.get_next_job(reserve=True) for _ in jobs]
    self.assertListEqual(next_jobs, expected_next_jobs)
def run_next_capture():
    """
    Grab and run the next CaptureJob. This will keep calling itself until there are no jobs left.
    """
    capture_job = CaptureJob.get_next_job(reserve=True)
    if not capture_job:
        return  # no jobs waiting
    try:
        proxy_capture(capture_job)
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallows SystemExit and
        # KeyboardInterrupt and can prevent clean worker shutdown; catch
        # Exception only. (Parenthesized print works on both Python 2 and 3.)
        print("Exception while processing capture job %s:" % capture_job.link_id)
        traceback.print_exc()
    finally:
        # Whatever happened, don't leave dangling state: any captures still
        # pending are dead, and the job itself is failed if the task never
        # updated its status.
        capture_job.link.captures.filter(status='pending').update(status='failed')
        if capture_job.status == 'pending':
            capture_job.mark_completed('failed')
    run_task(run_next_capture.s())
def run_next_capture():
    """
    Grab and run the next CaptureJob. This will keep calling itself until there are no jobs left.
    """
    capture_job = CaptureJob.get_next_job(reserve=True)
    if not capture_job:
        return  # no jobs waiting
    try:
        proxy_capture(capture_job)
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallows SystemExit and
        # KeyboardInterrupt and can prevent clean worker shutdown; catch
        # Exception only. (Parenthesized print works on both Python 2 and 3.)
        print("Exception while processing capture job %s:" % capture_job.link_id)
        traceback.print_exc()
    finally:
        # Whatever happened, don't leave dangling state: any captures still
        # pending are dead, and the job itself is failed if the task never
        # updated its status.
        capture_job.link.captures.filter(status='pending').update(status='failed')
        if capture_job.status == 'pending':
            capture_job.mark_completed('failed')
    run_task(run_next_capture.s())
def obj_create(self, bundle, **kwargs):
    """Create a new Link (archive) from a tastypie request bundle."""
    # We've received a request to archive a URL. That process is managed here.
    # We create a new entry in our datastore and pass the work off to our indexing
    # workers. They do their thing, updating the model as they go. When we get some minimum
    # set of results we can present the user (a guid for the link), we respond back.

    # Refuse all writes while the site is in maintenance mode.
    if settings.READ_ONLY_MODE:
        raise ImmediateHttpResponse(response=self.error_response(bundle.request, {
            'archives': {'__all__': "Perma has paused archive creation for scheduled maintenance. Please try again shortly."},
            'reason': "Perma has paused archive creation for scheduled maintenance. Please try again shortly.",
        }))

    # Runs validation (exception thrown if invalid), sets properties and saves the object.
    # On a 'replace' request the existing bundle.obj is reused rather than creating a new Link.
    if not bundle.data.get('replace'):
        bundle = super(LinkResource, self).obj_create(bundle, created_by=bundle.request.user)
    link = bundle.obj
    link.save()

    # put link in folder and handle Org settings based on folder
    # NOTE(review): assumes bundle.data always carries a 'folder'; a missing key
    # would raise AttributeError on the next line -- confirm upstream validation.
    folder = bundle.data.get('folder')
    if folder.organization and folder.organization.default_to_private:
        link.is_private = True
        link.save()
    link.move_to_folder_for_user(folder, bundle.request.user)  # also sets link.organization

    uploaded_file = bundle.data.get('file')
    if uploaded_file:
        # normalize file name to upload.jpg, upload.png, upload.gif, or upload.pdf
        mime_type = get_mime_type(uploaded_file.name)
        file_name = 'upload.%s' % mime_type_lookup[mime_type]['new_extension']
        base_warc_url = "file:///%s/%s" % (link.guid, file_name)

        # only append a random number to warc_url if we're replacing a file
        warc_url = base_warc_url if not bundle.data.get('replace') else "%s?version=%s" % (base_warc_url, str(random.random()).replace('.', ''))

        # Uploaded files become an immediately-successful capture; no capture job is queued.
        capture = Capture(link=link,
                          role='primary',
                          status='success',
                          record_type='resource',
                          user_upload='True',
                          content_type=mime_type,
                          url=warc_url)
        uploaded_file.file.seek(0)
        capture.write_warc_resource_record(uploaded_file)
        capture.save()
    else:
        # create primary capture placeholder
        Capture(
            link=link,
            role='primary',
            status='pending',
            record_type='response',
            url=link.submitted_url,
        ).save()

        # create screenshot placeholder
        Capture(
            link=link,
            role='screenshot',
            status='pending',
            record_type='resource',
            url="file:///%s/cap.png" % link.guid,
            content_type='image/png',
        ).save()

        # create CaptureJob
        CaptureJob(link=link, human=bundle.data.get('human', False)).save()

        # kick off capture tasks -- no need for guid since it'll work through the queue
        run_task(run_next_capture.s())

    return bundle
def post(self, request, format=None):
    """ Create new link. """
    data = request.data

    # Create the CaptureJob up front (unsaved) so validation failures below can
    # be recorded against it via raise_invalid_capture_job.
    capture_job = CaptureJob(human=request.data.get('human', False),
                             submitted_url=request.data.get('url', ''),
                             created_by=request.user)
    if settings.ENABLE_BATCH_LINKS:
        # Batch is set directly on the request object by the LinkBatch api,
        # to prevent abuse of this feature by those POSTing directly to this route.
        if getattr(request, 'batch', None):
            capture_job.link_batch = LinkBatch.objects.get(id=request.batch)
    capture_job.save()

    # Set target folder, in order of preference:
    # - 'folder' key in data
    # - parent folder, if posting to /folders/:parent_id/archives
    # - user's personal folder
    try:
        folder = self.get_folder_from_request(request) or request.parent or request.user.root_folder
    except ValidationError as e:
        raise_invalid_capture_job(capture_job, e.detail)

    # Make sure a limited user has links left to create
    if not folder.organization:
        if not request.user.link_creation_allowed():
            if request.user.nonpaying:
                raise_invalid_capture_job(capture_job, "You've already reached your limit.")
            error = "Perma.cc cannot presently make additional Perma Links on your behalf. Visit your subscription settings page for more information."
            raise_invalid_capture_job(capture_job, error)
    else:
        registrar = folder.organization.registrar
        if not registrar.link_creation_allowed():
            error = 'Perma.cc cannot presently make links on behalf of {}. '.format(registrar.name)
            # Registrar users manage their own subscription; everyone else is
            # pointed at the registrar's active users.
            if request.user.registrar:
                contact = 'Visit your settings for subscription information.'
            else:
                registrar_users = [user.email for user in registrar.active_registrar_users()]
                contact = 'For assistance with your subscription, contact: {}.'.format(", ".join(registrar_users))
            raise_invalid_capture_job(capture_job, error + contact)

    serializer = self.serializer_class(data=data, context={'request': request})
    if serializer.is_valid():
        link = serializer.save(created_by=request.user)

        # put link in folder and handle Org settings based on folder
        if folder.organization and folder.organization.default_to_private:
            link.is_private = True
            link.save()
        link.move_to_folder_for_user(folder, request.user)  # also sets link.organization

        # handle uploaded file
        uploaded_file = request.data.get('file')
        if uploaded_file:
            link.write_uploaded_file(uploaded_file)
            link.warc_size = default_storage.size(link.warc_storage_file())
            link.save()
        # handle submitted url
        else:
            # create primary capture placeholder
            Capture(
                link=link,
                role='primary',
                status='pending',
                record_type='response',
                url=link.submitted_url,
            ).save()

            # create screenshot placeholder
            Capture(
                link=link,
                role='screenshot',
                status='pending',
                record_type='resource',
                url="file:///%s/cap.png" % link.guid,
                content_type='image/png',
            ).save()

            # kick off capture tasks -- no need for guid since it'll work through the queue
            capture_job.status = 'pending'
            capture_job.link = link
            capture_job.save(update_fields=['status', 'link'])
            run_task(run_next_capture.s())

        return Response(serializer.data, status=status.HTTP_201_CREATED)

    # Serializer rejected the input: record the errors on the capture job and raise.
    raise_invalid_capture_job(capture_job, serializer.errors)
def post(self, request, format=None):
    """ Create new link. """
    data = request.data

    # Create the CaptureJob up front (unsaved) so validation failures below can
    # be recorded against it via raise_invalid_capture_job.
    capture_job = CaptureJob(
        human=request.data.get('human', False),
        submitted_url=request.data.get('url', ''),
        created_by=request.user
    )
    if settings.ENABLE_BATCH_LINKS:
        # Batch is set directly on the request object by the LinkBatch api,
        # to prevent abuse of this feature by those POSTing directly to this route.
        if getattr(request, 'batch', None):
            capture_job.link_batch = LinkBatch.objects.get(id=request.batch)
    capture_job.save()

    # Set target folder, in order of preference:
    # - 'folder' key in data
    # - parent folder, if posting to /folders/:parent_id/archives
    # - user's personal folder
    try:
        folder = self.get_folder_from_request(request) or request.parent or request.user.root_folder
    except ValidationError as e:
        raise_invalid_capture_job(capture_job, e.detail)

    # Disallow creation of links in top-level sponsored folder
    if folder.is_sponsored_root_folder:
        error = "You can't make links directly in your Sponsored Links folder. Select a folder belonging to a sponsor."
        raise_invalid_capture_job(capture_job, error)

    # Make sure a limited user has links left to create
    if not folder.organization and not folder.sponsored_by:
        if not request.user.link_creation_allowed():
            if request.user.nonpaying:
                raise_invalid_capture_job(capture_job, "You've already reached your limit.")
            error = "Perma.cc cannot presently make additional Perma Links on your behalf. Visit your subscription settings page for more information."
            raise_invalid_capture_job(capture_job, error)
    else:
        # Sponsored folders are owned by the sponsoring registrar; otherwise
        # the folder's organization determines the registrar.
        registrar = folder.sponsored_by if folder.sponsored_by else folder.organization.registrar
        msg = None
        if folder.read_only:
            registrar_users = [user.email for user in registrar.active_registrar_users()]
            msg = f"Your registrar has made this folder read-only. For assistance, contact: {', '.join(registrar_users)}."
        if not registrar.link_creation_allowed():
            error = 'Perma.cc cannot presently make links on behalf of {}. '.format(registrar.name)
            if request.user.registrar:
                contact = 'Visit your settings for subscription information.'
            else:
                registrar_users = [user.email for user in registrar.active_registrar_users()]
                contact = 'For assistance with your subscription, contact: {}.'.format(", ".join(registrar_users))
            msg = error + contact
        if msg:
            raise_invalid_capture_job(capture_job, msg)

    serializer = self.serializer_class(data=data, context={'request': request})
    if serializer.is_valid():
        with transaction.atomic():
            # Technique from https://github.com/harvard-lil/capstone/blob/0f7fb80f26e753e36e0c7a6a199b8fdccdd318be/capstone/capapi/serializers.py#L121
            #
            # Fetch the current user data here inside a transaction, using select_for_update
            # to lock the row so we don't collide with any simultaneous requests
            user = request.user.__class__.objects.select_for_update().get(pk=request.user.pk)

            # If this is a Personal Link, and if the user only has bonus links left, decrement bonus links
            bonus_link = False
            if not folder.organization and not folder.sponsored_by:
                links_remaining, _ , bonus_links = user.get_links_remaining()
                if bonus_links and not links_remaining:
                    # (this works because it's part of the same transaction with the select_for_update --
                    # we don't have to use the same object)
                    request.user.bonus_links = bonus_links - 1
                    request.user.save(update_fields=['bonus_links'])
                    bonus_link = True

            # NOTE(review): serializer.save placed inside the atomic block so the
            # row lock covers link creation -- confirm against upstream history.
            link = serializer.save(created_by=request.user, bonus_link=bonus_link)

        # put link in folder and handle Org settings based on folder
        if folder.organization and folder.organization.default_to_private:
            link.is_private = True
            link.save()
        link.move_to_folder_for_user(folder, request.user)  # also sets link.organization

        # handle uploaded file
        uploaded_file = request.data.get('file')
        if uploaded_file:
            link.write_uploaded_file(uploaded_file)
        # handle submitted url
        else:
            # create primary capture placeholder
            Capture(
                link=link,
                role='primary',
                status='pending',
                record_type='response',
                url=link.submitted_url,
            ).save()

            # create screenshot placeholder
            Capture(
                link=link,
                role='screenshot',
                status='pending',
                record_type='resource',
                url="file:///%s/cap.png" % link.guid,
                content_type='image/png',
            ).save()

            # kick off capture tasks -- no need for guid since it'll work through the queue
            capture_job.status = 'pending'
            capture_job.link = link
            capture_job.save(update_fields=['status', 'link'])
            run_task(run_next_capture.s())

        return Response(serializer.data, status=status.HTTP_201_CREATED)

    # Serializer rejected the input: record the errors on the capture job and raise.
    raise_invalid_capture_job(capture_job, serializer.errors)
def post(self, request, format=None):
    """ Create new link. """
    data = request.data

    # Set target folder, in order of preference:
    # - 'folder' key in data
    # - parent folder, if posting to /folders/:parent_id/archives
    # - user's personal folder
    folder = self.get_folder_from_request(request) or request.parent or request.user.root_folder

    # Make sure a limited user has links left to create
    if not folder.organization:
        links_remaining = request.user.get_links_remaining()
        if links_remaining < 1:
            raise_validation_error("You've already reached your limit.")
    else:
        registrar = folder.organization.registrar
        if not registrar.link_creation_allowed():
            error = 'Perma.cc cannot presently make links on behalf of {}. '.format(registrar.name)
            # Registrar users manage their own subscription; everyone else is
            # pointed at the registrar's active users.
            if request.user.registrar:
                contact = 'Visit your settings for subscription information.'
            else:
                registrar_users = [user.email for user in registrar.active_registrar_users()]
                contact = 'For assistance with your subscription, contact: {}.'.format(", ".join(registrar_users))
            raise_validation_error(error + contact)

    serializer = self.serializer_class(data=data, context={'request': request})
    if serializer.is_valid():
        link = serializer.save(created_by=request.user)

        # put link in folder and handle Org settings based on folder
        if folder.organization and folder.organization.default_to_private:
            link.is_private = True
            link.save()
        link.move_to_folder_for_user(folder, request.user)  # also sets link.organization

        # handle uploaded file
        uploaded_file = request.data.get('file')
        if uploaded_file:
            link.write_uploaded_file(uploaded_file)
        # handle submitted url
        else:
            # create primary capture placeholder
            Capture(
                link=link,
                role='primary',
                status='pending',
                record_type='response',
                url=link.submitted_url,
            ).save()

            # create screenshot placeholder
            Capture(
                link=link,
                role='screenshot',
                status='pending',
                record_type='resource',
                url="file:///%s/cap.png" % link.guid,
                content_type='image/png',
            ).save()

            # create CaptureJob
            CaptureJob(link=link, human=request.data.get('human', False)).save()

            # kick off capture tasks -- no need for guid since it'll work through the queue
            run_task(run_next_capture.s())

        return Response(serializer.data, status=status.HTTP_201_CREATED)
    return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
def post(self, request, format=None):
    """ Create new link. """
    data = request.data

    # Create the CaptureJob up front (unsaved) so validation failures below can
    # be recorded against it via raise_invalid_capture_job.
    capture_job = CaptureJob(
        human=request.data.get('human', False),
        submitted_url=request.data.get('url', ''),
        created_by=request.user
    )
    if settings.ENABLE_BATCH_LINKS:
        # Batch is set directly on the request object by the LinkBatch api,
        # to prevent abuse of this feature by those POSTing directly to this route.
        if getattr(request, 'batch', None):
            capture_job.link_batch = LinkBatch.objects.get(id=request.batch)
    capture_job.save()

    # Set target folder, in order of preference:
    # - 'folder' key in data
    # - parent folder, if posting to /folders/:parent_id/archives
    # - user's personal folder
    try:
        folder = self.get_folder_from_request(request) or request.parent or request.user.root_folder
    except ValidationError as e:
        raise_invalid_capture_job(capture_job, e.detail)

    # Make sure a limited user has links left to create
    if not folder.organization:
        if not request.user.link_creation_allowed():
            if request.user.nonpaying:
                raise_invalid_capture_job(capture_job, "You've already reached your limit.")
            error = "Perma.cc cannot presently make additional Perma Links on your behalf. Visit your subscription settings page for more information."
            raise_invalid_capture_job(capture_job, error)
    else:
        registrar = folder.organization.registrar
        if not registrar.link_creation_allowed():
            error = 'Perma.cc cannot presently make links on behalf of {}. '.format(registrar.name)
            # Registrar users manage their own subscription; everyone else is
            # pointed at the registrar's active users.
            if request.user.registrar:
                contact = 'Visit your settings for subscription information.'
            else:
                registrar_users = [user.email for user in registrar.active_registrar_users()]
                contact = 'For assistance with your subscription, contact: {}.'.format(", ".join(registrar_users))
            raise_invalid_capture_job(capture_job, error + contact)

    serializer = self.serializer_class(data=data, context={'request': request})
    if serializer.is_valid():
        link = serializer.save(created_by=request.user)

        # put link in folder and handle Org settings based on folder
        if folder.organization and folder.organization.default_to_private:
            link.is_private = True
            link.save()
        link.move_to_folder_for_user(folder, request.user)  # also sets link.organization

        # handle uploaded file
        uploaded_file = request.data.get('file')
        if uploaded_file:
            link.write_uploaded_file(uploaded_file)
        # handle submitted url
        else:
            # create primary capture placeholder
            Capture(
                link=link,
                role='primary',
                status='pending',
                record_type='response',
                url=link.submitted_url,
            ).save()

            # create screenshot placeholder
            Capture(
                link=link,
                role='screenshot',
                status='pending',
                record_type='resource',
                url="file:///%s/cap.png" % link.guid,
                content_type='image/png',
            ).save()

            # kick off capture tasks -- no need for guid since it'll work through the queue
            capture_job.status = 'pending'
            capture_job.link = link
            capture_job.save(update_fields=['status', 'link'])
            run_task(run_next_capture.s())

        return Response(serializer.data, status=status.HTTP_201_CREATED)

    # Serializer rejected the input: record the errors on the capture job and raise.
    raise_invalid_capture_job(capture_job, serializer.errors)
def get_next_job(i):
    """Reserve and return the next available CaptureJob; `i` is ignored (map/pool index)."""
    job = CaptureJob.get_next_job(reserve=True)
    return job