def run_next_capture():
    """ Grab and run the next CaptureJob. This will keep calling itself until there are no jobs left. """
    capture_job = CaptureJob.get_next_job(reserve=True)
    if not capture_job:
        return  # no jobs waiting
    proxy_capture.apply([capture_job.link_id])
    run_task(run_next_capture.s())
def patch(self, request, guid, format=None):
    """ Update link. """
    link = self.get_object_for_user_by_pk(request.user, guid)
    was_private = link.is_private
    data = request.data

    serializer = self.serializer_class(link, data=data, partial=True, context={'request': self.request})
    if serializer.is_valid():
        serializer.save()

        # move to new folder
        folder = AuthenticatedLinkListView.get_folder_from_request(request)
        if folder:
            link.move_to_folder_for_user(folder, request.user)

        # handle file patch
        uploaded_file = request.data.get('file')
        if uploaded_file:
            # delete related cdxlines and captures, delete warc (rename)
            link.delete_related()
            link.safe_delete_warc()

            # write new warc and capture
            link.write_uploaded_file(uploaded_file, cache_break=True)

            # delete the link from Webrecorder and
            # clear the user's Webrecorder session, if any,
            # so that the new warc is used for this visitor's
            # next playback of this link.
            if settings.ENABLE_WR_PLAYBACK:
                link.delete_from_wr(request)
                clear_wr_session(request)

        # update internet archive if privacy changes
        if 'is_private' in data and was_private != bool(data.get("is_private")) and link.is_archive_eligible():
            if was_private:
                # link was private but has been marked public
                run_task(upload_to_internet_archive.s(link_guid=link.guid))
            else:
                # link was public but has been marked private
                run_task(delete_from_internet_archive.s(link_guid=link.guid))

        # include remaining links in response
        links_remaining = request.user.get_links_remaining()
        serializer.data['links_remaining'] = 'Infinity' if links_remaining[0] == float('inf') else links_remaining[0]
        serializer.data['links_remaining_period'] = links_remaining[1]

        # clear out any caches that might be based on old link data
        link.clear_cache()

        return Response(serializer.data)

    raise ValidationError(serializer.errors)
def obj_create(self, bundle, **kwargs):
    # We've received a request to archive a URL. That process is managed here.
    # We create a new entry in our datastore and pass the work off to our indexing
    # workers. They do their thing, updating the model as they go. When we get some minimum
    # set of results we can present the user (a guid for the link), we respond back.
    if settings.READ_ONLY_MODE:
        raise ImmediateHttpResponse(response=self.error_response(bundle.request, {
            'archives': {'__all__': "Perma has paused archive creation for scheduled maintenance. Please try again shortly."},
            'reason': "Perma has paused archive creation for scheduled maintenance. Please try again shortly.",
        }))

    # Runs validation (exception thrown if invalid), sets properties and saves the object
    bundle = super(LinkResource, self).obj_create(bundle, created_by=bundle.request.user)

    asset = Asset(link=bundle.obj)

    uploaded_file = bundle.data.get('file')
    if uploaded_file:
        # normalize file name to upload.jpg, upload.png, upload.gif, or upload.pdf
        mime_type = get_mime_type(uploaded_file.name)
        file_name = 'upload.%s' % mime_type_lookup[mime_type]['new_extension']
        file_path = os.path.join(asset.base_storage_path, file_name)

        uploaded_file.file.seek(0)
        file_name = default_storage.store_file(uploaded_file, file_path)

        if mime_type == 'application/pdf':
            asset.pdf_capture = file_name
        else:
            asset.image_capture = file_name
        asset.user_upload = True
        asset.user_upload_file_name = uploaded_file.name
        asset.save()
    else:
        asset.image_capture = Asset.CAPTURE_STATUS_PENDING
        # If it appears as if we're trying to archive a PDF, only run our PDF retrieval tool
        if asset.link.media_type == 'pdf':
            asset.pdf_capture = Asset.CAPTURE_STATUS_PENDING
            task = get_pdf
        else:  # else, it's not a PDF. Let's try our best to retrieve what we can
            asset.warc_capture = Asset.CAPTURE_STATUS_PENDING
            task = proxy_capture
        asset.save()

        run_task(task.s(asset.link.guid,
                        asset.link.submitted_url,
                        asset.base_storage_path,
                        bundle.request.META.get('HTTP_USER_AGENT', '')))

    return bundle
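# The upload branch above maps a detected mime type to one of four fixed file
# names (upload.jpg, upload.png, upload.gif, upload.pdf). A hedged sketch of
# what the mime_type_lookup table it relies on could look like -- the real
# table lives elsewhere in the codebase and may carry more keys:
mime_type_lookup = {
    'image/jpeg':      {'new_extension': 'jpg'},
    'image/png':       {'new_extension': 'png'},
    'image/gif':       {'new_extension': 'gif'},
    'application/pdf': {'new_extension': 'pdf'},
}

def normalized_upload_name(mime_type):
    # e.g. 'image/jpeg' -> 'upload.jpg'
    return 'upload.%s' % mime_type_lookup[mime_type]['new_extension']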
def upload_all_to_internet_archive():
    # find all links created 48-24 hours ago
    # include timezone
    start_date = timezone.now() - timedelta(days=2)
    end_date = timezone.now() - timedelta(days=1)

    links = Link.objects.filter(Q(internet_archive_upload_status='not_started') | Q(internet_archive_upload_status='failed'),
                                creation_timestamp__range=(start_date, end_date))
    for link in links:
        if link.can_upload_to_internet_archive():
            run_task(upload_to_internet_archive.s(link_guid=link.guid))
def patch(self, request, guid, format=None):
    """ Update link. """
    link = self.get_object_for_user_by_pk(request.user, guid)
    was_private = link.is_private
    data = request.data

    serializer = self.serializer_class(link, data=data, partial=True, context={'request': self.request})
    if serializer.is_valid():
        serializer.save()

        # move to new folder
        folder = AuthenticatedLinkListView.get_folder_from_request(request)
        if folder:
            link.move_to_folder_for_user(folder, request.user)

        # handle file patch
        uploaded_file = request.data.get('file')
        if uploaded_file:
            # delete related cdxlines and captures, delete warc (rename)
            link.delete_related()
            link.safe_delete_warc()

            # write new warc and capture
            link.write_uploaded_file(uploaded_file, cache_break=True)
            link.warc_size = default_storage.size(link.warc_storage_file())
            link.save()

            # delete the link from Webrecorder and
            # clear the user's Webrecorder session, if any,
            # so that the new warc is used for this visitor's
            # next playback of this link.
            link.delete_from_wr(request)
            clear_wr_session(request)

        # update internet archive if privacy changes
        if 'is_private' in data and was_private != bool(data.get("is_private")) and link.is_archive_eligible():
            if was_private:
                # link was private but has been marked public
                run_task(upload_to_internet_archive.s(link_guid=link.guid))
            else:
                # link was public but has been marked private
                run_task(delete_from_internet_archive.s(link_guid=link.guid))

        # include remaining links in response
        links_remaining = request.user.get_links_remaining()
        serializer.data['links_remaining'] = 'Infinity' if links_remaining[0] == float('inf') else links_remaining[0]
        serializer.data['links_remaining_period'] = links_remaining[1]

        return Response(serializer.data)

    raise ValidationError(serializer.errors)
def sync_mirror():
    metadata_server = settings.UPSTREAM_SERVER['address']
    manifest_url = metadata_server + reverse("mirroring:manifest")
    metadata = requests.get(manifest_url,
                            headers=settings.UPSTREAM_SERVER.get('headers', {}),
                            stream=True)
    for line in metadata.iter_lines():
        guid = line.strip()
        # keyword filter (guid=guid), not an equality expression
        if not Link.objects.filter(guid=guid).exists():
            run_task(update_perma, link_guid=guid)
def upload_all_to_internet_archive():
    # find all links created 48-24 hours ago
    # include timezone
    start_date = timezone.now() - timedelta(days=2)
    end_date = timezone.now() - timedelta(days=1)

    links = Link.objects.filter(uploaded_to_internet_archive=False,
                                creation_timestamp__range=(start_date, end_date))
    for link in links:
        if link.can_upload_to_internet_archive():
            run_task(upload_to_internet_archive.s(link_guid=link.guid))
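# The 48-to-24-hour window in both variants above implies the task runs once a
# day, so every link falls into exactly one window. A hedged sketch of how such
# a daily run might be wired up with Celery beat; the 'perma.tasks' module path
# and midnight schedule are assumptions, not taken from this codebase:
from celery.schedules import crontab

CELERYBEAT_SCHEDULE = {
    'upload-all-to-internet-archive': {
        'task': 'perma.tasks.upload_all_to_internet_archive',
        'schedule': crontab(hour=0, minute=0),  # once per day
    },
}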
def obj_update(self, bundle, skip_errors=False, **kwargs):
    uploaded_file = bundle.data.get('file')
    if uploaded_file and bundle.request.method == 'PATCH':
        if kwargs['request']:
            del kwargs['request']
        bundle.obj = self.obj_get(bundle=bundle, **kwargs)
        bundle.data["replace"] = True

        # delete related cdxlines and captures, delete warc (rename)
        self.obj_delete(bundle=bundle, **kwargs)

        bundle = super(LinkResource, self).obj_update(bundle, archive_timestamp=bundle.obj.archive_timestamp)

        # if no 'folder' was supplied for this patch, set folder to current folder before calling obj_create,
        # since obj_create requires a folder to be included
        bundle.data.setdefault('folder', Folder.objects.accessible_to(bundle.request.user).filter(links=bundle.obj).first())
        bundle = self.obj_create(bundle=bundle, **kwargs)
    else:
        is_private = bundle.obj.is_private
        bundle = super(LinkResource, self).obj_update(bundle, skip_errors, **kwargs)

        if bundle.data.get('folder', None):
            bundle.obj.move_to_folder_for_user(bundle.data['folder'], bundle.request.user)

        if 'is_private' in bundle.data:
            if bundle.obj.is_archive_eligible():
                going_private = bundle.data.get("is_private")
                # if link was private but has been marked public
                if is_private and not going_private:
                    run_task(upload_to_internet_archive.s(link_guid=bundle.obj.guid))
                # if link was public but has been marked private
                elif not is_private and going_private:
                    run_task(delete_from_internet_archive.s(link_guid=bundle.obj.guid))

    links_remaining = bundle.request.user.get_links_remaining()
    bundle.data['links_remaining'] = links_remaining

    bundle.obj.clear_cache()
    return bundle
def patch(self, request, guid, format=None):
    """ Update link. """
    link = self.get_object_for_user_by_pk(request.user, guid)
    was_private = link.is_private
    data = request.data

    serializer = self.serializer_class(link, data=data, partial=True, context={'request': self.request})
    if serializer.is_valid():
        serializer.save()

        # move to new folder
        folder = AuthenticatedLinkListView.get_folder_from_request(request)
        if folder:
            link.move_to_folder_for_user(folder, request.user)

        # handle file patch
        uploaded_file = request.data.get('file')
        if uploaded_file:
            # delete related cdxlines and captures, delete warc (rename)
            link.delete_related()
            link.safe_delete_warc()

            # write new warc and capture
            link.write_uploaded_file(uploaded_file, cache_break=True)

        # update internet archive if privacy changes
        if 'is_private' in data and was_private != bool(data.get("is_private")) and link.is_archive_eligible():
            if was_private:
                # link was private but has been marked public
                run_task(upload_to_internet_archive.s(link_guid=link.guid))
            else:
                # link was public but has been marked private
                run_task(delete_from_internet_archive.s(link_guid=link.guid))

        # include remaining links in response
        links_remaining = request.user.get_links_remaining()
        serializer.data['links_remaining'] = links_remaining

        # clear out any caches that might be based on old link data
        link.clear_cache()

        return Response(serializer.data)

    raise ValidationError(serializer.errors)
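# All three patch variants above implement the same DRF partial update: flip
# is_private on an archive-eligible link and the matching Internet Archive task
# is queued. A hypothetical client call exercising that path -- the host, guid,
# and auth header are placeholders, not real Perma credentials:
import requests

resp = requests.patch(
    'https://example.com/api/v1/archives/ABCD-1234/',
    headers={'Authorization': 'ApiKey user:key'},  # placeholder auth scheme
    json={'is_private': False},  # public again -> upload_to_internet_archive runs
)
resp.raise_for_status()
print(resp.json().get('links_remaining'))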
def obj_update(self, bundle, skip_errors=False, **kwargs):
    was_vested = bundle.obj.vested
    bundle = super(LinkResource, self).obj_update(bundle, skip_errors, **kwargs)

    if bundle.data.get('folder', None):
        bundle.obj.move_to_folder_for_user(bundle.data['folder'], bundle.request.user)

    if not was_vested and bundle.obj.vested:
        if settings.UPLOAD_TO_INTERNET_ARCHIVE and bundle.obj.can_upload_to_internet_archive():
            run_task(upload_to_internet_archive, link_guid=bundle.obj.guid)

    return bundle
def obj_delete(self, bundle, **kwargs):
    if not hasattr(bundle.obj, 'delete'):
        try:
            bundle.obj = self.obj_get(bundle=bundle, **kwargs)
        except ObjectDoesNotExist:
            raise NotFound("A model instance matching the provided arguments could not be found.")

    self.authorized_delete_detail(self.get_object_list(bundle.request), bundle)
    bundle.obj.safe_delete()
    bundle.obj.save()

    if bundle.obj.uploaded_to_internet_archive:
        run_task(delete_from_internet_archive.s(link_guid=bundle.obj.guid))
def run_next_capture():
    """ Grab and run the next CaptureJob. This will keep calling itself until there are no jobs left. """
    capture_job = CaptureJob.get_next_job(reserve=True)
    if not capture_job:
        return  # no jobs waiting
    try:
        proxy_capture(capture_job)
    except:
        print "Exception while processing capture job %s:" % capture_job.link_id
        traceback.print_exc()
    finally:
        capture_job.link.captures.filter(status='pending').update(status='failed')
        if capture_job.status == 'pending':
            capture_job.mark_completed('failed')
    run_task(run_next_capture.s())
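# Both run_next_capture variants share one pattern: a task claims a single job,
# processes it, and enqueues a fresh copy of itself, so the chain ends naturally
# once the queue is empty. A minimal self-contained sketch of that pattern;
# JobQueue and handle are illustrative names, not from this codebase:
from celery import shared_task

@shared_task
def process_next_job():
    job = JobQueue.get_next_job(reserve=True)  # hypothetical: atomically claim one job
    if not job:
        return  # queue drained; the chain of tasks stops here
    try:
        handle(job)  # hypothetical: do the actual work
    finally:
        # re-enqueue a fresh task rather than looping in-process, so each job
        # gets its own task (and its own timeout/retry accounting)
        process_next_job.delay()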
def obj_delete(self, bundle, **kwargs):
    if not hasattr(bundle.obj, 'delete'):
        try:
            bundle.obj = self.obj_get(bundle=bundle, **kwargs)
        except ObjectDoesNotExist:
            raise NotFound("A model instance matching the provided arguments could not be found.")

    self.authorized_delete_detail(self.get_object_list(bundle.request), bundle)

    # deleting related captures and cdxlines
    bundle.obj.delete_related()

    # if replacing file, only "delete" warc by renaming
    if bundle.data.get("replace"):
        bundle.obj.safe_delete_warc()
    else:
        bundle.obj.safe_delete()
        bundle.obj.save()

    if bundle.obj.internet_archive_upload_status == 'completed':
        run_task(delete_from_internet_archive.s(link_guid=bundle.obj.guid))
def obj_update(self, bundle, skip_errors=False, **kwargs):
    is_private = bundle.obj.is_private
    bundle = super(LinkResource, self).obj_update(bundle, skip_errors, **kwargs)

    if bundle.data.get('folder', None):
        bundle.obj.move_to_folder_for_user(bundle.data['folder'], bundle.request.user)

    if 'is_private' in bundle.data:
        if bundle.obj.is_archive_eligible():
            going_private = bundle.data.get("is_private")
            # if link was private but has been marked public
            if is_private and not going_private:
                run_task(upload_to_internet_archive.s(link_guid=bundle.obj.guid))
            # if link was public but has been marked private
            elif not is_private and going_private:
                run_task(delete_from_internet_archive.s(link_guid=bundle.obj.guid))

    links_remaining = bundle.request.user.get_links_remaining()
    bundle.data['links_remaining'] = links_remaining

    return bundle
def obj_create(self, bundle, **kwargs):
    # We've received a request to archive a URL. That process is managed here.
    # We create a new entry in our datastore and pass the work off to our indexing
    # workers. They do their thing, updating the model as they go. When we get some minimum
    # set of results we can present the user (a guid for the link), we respond back.
    if settings.READ_ONLY_MODE:
        raise ImmediateHttpResponse(
            response=self.error_response(
                bundle.request,
                {
                    "archives": {
                        "__all__": "Perma has paused archive creation for scheduled maintenance. Please try again shortly."
                    },
                    "reason": "Perma has paused archive creation for scheduled maintenance. Please try again shortly.",
                },
            )
        )

    # Runs validation (exception thrown if invalid), sets properties and saves the object
    bundle = super(LinkResource, self).obj_create(bundle, created_by=bundle.request.user)
    link = bundle.obj

    uploaded_file = bundle.data.get("file")
    if uploaded_file:
        # normalize file name to upload.jpg, upload.png, upload.gif, or upload.pdf
        mime_type = get_mime_type(uploaded_file.name)
        file_name = "upload.%s" % mime_type_lookup[mime_type]["new_extension"]
        warc_url = "file:///%s/%s" % (link.guid, file_name)

        capture = Capture(
            link=link,
            role="primary",
            status="success",
            record_type="resource",
            user_upload="True",
            content_type=mime_type,
            url=warc_url,
        )
        uploaded_file.file.seek(0)
        capture.write_warc_resource_record(uploaded_file)
        capture.save()
    else:
        # create primary capture placeholder
        Capture(
            link=link,
            role="primary",
            status="pending",
            record_type="response",
            url=link.submitted_url,
        ).save()

        # create screenshot placeholder
        Capture(
            link=link,
            role="screenshot",
            status="pending",
            record_type="resource",
            url="file:///%s/cap.png" % link.guid,
            content_type="image/png",
        ).save()

        # kick off capture task
        run_task(proxy_capture.s(link.guid, bundle.request.META.get("HTTP_USER_AGENT", "")))

    return bundle
def obj_create(self, bundle, **kwargs):
    # We've received a request to archive a URL. That process is managed here.
    # We create a new entry in our datastore and pass the work off to our indexing
    # workers. They do their thing, updating the model as they go. When we get some minimum
    # set of results we can present the user (a guid for the link), we respond back.
    if settings.READ_ONLY_MODE:
        raise ImmediateHttpResponse(response=self.error_response(bundle.request, {
            'archives': {'__all__': "Perma has paused archive creation for scheduled maintenance. Please try again shortly."},
            'reason': "Perma has paused archive creation for scheduled maintenance. Please try again shortly.",
        }))

    # Runs validation (exception thrown if invalid), sets properties and saves the object
    if not bundle.data.get('replace'):
        bundle = super(LinkResource, self).obj_create(bundle, created_by=bundle.request.user)
    link = bundle.obj
    link.save()

    # put link in folder and handle Org settings based on folder
    folder = bundle.data.get('folder')
    if folder.organization and folder.organization.default_to_private:
        link.is_private = True
        link.save()
    link.move_to_folder_for_user(folder, bundle.request.user)  # also sets link.organization

    uploaded_file = bundle.data.get('file')
    if uploaded_file:
        # normalize file name to upload.jpg, upload.png, upload.gif, or upload.pdf
        mime_type = get_mime_type(uploaded_file.name)
        file_name = 'upload.%s' % mime_type_lookup[mime_type]['new_extension']
        base_warc_url = "file:///%s/%s" % (link.guid, file_name)

        # only append a random number to warc_url if we're replacing a file
        warc_url = base_warc_url if not bundle.data.get('replace') else "%s?version=%s" % (base_warc_url, str(random.random()).replace('.', ''))

        capture = Capture(link=link,
                          role='primary',
                          status='success',
                          record_type='resource',
                          user_upload='True',
                          content_type=mime_type,
                          url=warc_url)
        uploaded_file.file.seek(0)
        capture.write_warc_resource_record(uploaded_file)
        capture.save()
    else:
        # create primary capture placeholder
        Capture(
            link=link,
            role='primary',
            status='pending',
            record_type='response',
            url=link.submitted_url,
        ).save()

        # create screenshot placeholder
        Capture(
            link=link,
            role='screenshot',
            status='pending',
            record_type='resource',
            url="file:///%s/cap.png" % link.guid,
            content_type='image/png',
        ).save()

        # create CaptureJob
        CaptureJob(link=link, human=bundle.data.get('human', False)).save()

        # kick off capture tasks -- no need for guid since it'll work through the queue
        run_task(run_next_capture.s())

    return bundle
def post(self, request, format=None):
    """ Create new link. """
    data = request.data

    capture_job = CaptureJob(human=request.data.get('human', False),
                             submitted_url=request.data.get('url', ''),
                             created_by=request.user)
    if settings.ENABLE_BATCH_LINKS:
        # Batch is set directly on the request object by the LinkBatch api,
        # to prevent abuse of this feature by those POSTing directly to this route.
        if getattr(request, 'batch', None):
            capture_job.link_batch = LinkBatch.objects.get(id=request.batch)
    capture_job.save()

    # Set target folder, in order of preference:
    # - 'folder' key in data
    # - parent folder, if posting to /folders/:parent_id/archives
    # - user's personal folder
    try:
        folder = self.get_folder_from_request(request) or request.parent or request.user.root_folder
    except ValidationError as e:
        raise_invalid_capture_job(capture_job, e.detail)

    # Make sure a limited user has links left to create
    if not folder.organization:
        if not request.user.link_creation_allowed():
            if request.user.nonpaying:
                raise_invalid_capture_job(capture_job, "You've already reached your limit.")
            error = "Perma.cc cannot presently make additional Perma Links on your behalf. Visit your subscription settings page for more information."
            raise_invalid_capture_job(capture_job, error)
    else:
        registrar = folder.organization.registrar
        if not registrar.link_creation_allowed():
            error = 'Perma.cc cannot presently make links on behalf of {}. '.format(registrar.name)
            if request.user.registrar:
                contact = 'Visit your settings for subscription information.'
            else:
                registrar_users = [user.email for user in registrar.active_registrar_users()]
                contact = 'For assistance with your subscription, contact: {}.'.format(", ".join(registrar_users))
            raise_invalid_capture_job(capture_job, error + contact)

    serializer = self.serializer_class(data=data, context={'request': request})
    if serializer.is_valid():
        link = serializer.save(created_by=request.user)

        # put link in folder and handle Org settings based on folder
        if folder.organization and folder.organization.default_to_private:
            link.is_private = True
            link.save()
        link.move_to_folder_for_user(folder, request.user)  # also sets link.organization

        # handle uploaded file
        uploaded_file = request.data.get('file')
        if uploaded_file:
            link.write_uploaded_file(uploaded_file)
            link.warc_size = default_storage.size(link.warc_storage_file())
            link.save()

        # handle submitted url
        else:
            # create primary capture placeholder
            Capture(
                link=link,
                role='primary',
                status='pending',
                record_type='response',
                url=link.submitted_url,
            ).save()

            # create screenshot placeholder
            Capture(
                link=link,
                role='screenshot',
                status='pending',
                record_type='resource',
                url="file:///%s/cap.png" % link.guid,
                content_type='image/png',
            ).save()

            # kick off capture tasks -- no need for guid since it'll work through the queue
            capture_job.status = 'pending'
            capture_job.link = link
            capture_job.save(update_fields=['status', 'link'])
            run_task(run_next_capture.s())

        return Response(serializer.data, status=status.HTTP_201_CREATED)

    raise_invalid_capture_job(capture_job, serializer.errors)
def do_update_perma(request, guid):
    run_task(update_perma, link_guid=guid)
    return HttpResponse("OK")
def post(self, request, format=None):
    """ Create new link. """
    data = request.data

    # Set target folder, in order of preference:
    # - 'folder' key in data
    # - parent folder, if posting to /folders/:parent_id/archives
    # - user's personal folder
    folder = self.get_folder_from_request(request) or request.parent or request.user.root_folder

    # Make sure a limited user has links left to create
    if not folder.organization:
        links_remaining = request.user.get_links_remaining()
        if links_remaining < 1:
            raise_validation_error("You've already reached your limit.")
    else:
        registrar = folder.organization.registrar
        if not registrar.link_creation_allowed():
            error = 'Perma.cc cannot presently make links on behalf of {}. '.format(registrar.name)
            if request.user.registrar:
                contact = 'Visit your settings for subscription information.'
            else:
                registrar_users = [user.email for user in registrar.active_registrar_users()]
                contact = 'For assistance with your subscription, contact: {}.'.format(", ".join(registrar_users))
            raise_validation_error(error + contact)

    serializer = self.serializer_class(data=data, context={'request': request})
    if serializer.is_valid():
        link = serializer.save(created_by=request.user)

        # put link in folder and handle Org settings based on folder
        if folder.organization and folder.organization.default_to_private:
            link.is_private = True
            link.save()
        link.move_to_folder_for_user(folder, request.user)  # also sets link.organization

        # handle uploaded file
        uploaded_file = request.data.get('file')
        if uploaded_file:
            link.write_uploaded_file(uploaded_file)

        # handle submitted url
        else:
            # create primary capture placeholder
            Capture(
                link=link,
                role='primary',
                status='pending',
                record_type='response',
                url=link.submitted_url,
            ).save()

            # create screenshot placeholder
            Capture(
                link=link,
                role='screenshot',
                status='pending',
                record_type='resource',
                url="file:///%s/cap.png" % link.guid,
                content_type='image/png',
            ).save()

            # create CaptureJob
            CaptureJob(link=link, human=request.data.get('human', False)).save()

            # kick off capture tasks -- no need for guid since it'll work through the queue
            run_task(run_next_capture.s())

        return Response(serializer.data, status=status.HTTP_201_CREATED)

    return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
def post(self, request, format=None):
    """ Create new link. """
    data = request.data

    capture_job = CaptureJob(
        human=request.data.get('human', False),
        submitted_url=request.data.get('url', ''),
        created_by=request.user
    )
    if settings.ENABLE_BATCH_LINKS:
        # Batch is set directly on the request object by the LinkBatch api,
        # to prevent abuse of this feature by those POSTing directly to this route.
        if getattr(request, 'batch', None):
            capture_job.link_batch = LinkBatch.objects.get(id=request.batch)
    capture_job.save()

    # Set target folder, in order of preference:
    # - 'folder' key in data
    # - parent folder, if posting to /folders/:parent_id/archives
    # - user's personal folder
    try:
        folder = self.get_folder_from_request(request) or request.parent or request.user.root_folder
    except ValidationError as e:
        raise_invalid_capture_job(capture_job, e.detail)

    # Make sure a limited user has links left to create
    if not folder.organization:
        if not request.user.link_creation_allowed():
            if request.user.nonpaying:
                raise_invalid_capture_job(capture_job, "You've already reached your limit.")
            error = "Perma.cc cannot presently make additional Perma Links on your behalf. Visit your subscription settings page for more information."
            raise_invalid_capture_job(capture_job, error)
    else:
        registrar = folder.organization.registrar
        if not registrar.link_creation_allowed():
            error = 'Perma.cc cannot presently make links on behalf of {}. '.format(registrar.name)
            if request.user.registrar:
                contact = 'Visit your settings for subscription information.'
            else:
                registrar_users = [user.email for user in registrar.active_registrar_users()]
                contact = 'For assistance with your subscription, contact: {}.'.format(", ".join(registrar_users))
            raise_invalid_capture_job(capture_job, error + contact)

    serializer = self.serializer_class(data=data, context={'request': request})
    if serializer.is_valid():
        link = serializer.save(created_by=request.user)

        # put link in folder and handle Org settings based on folder
        if folder.organization and folder.organization.default_to_private:
            link.is_private = True
            link.save()
        link.move_to_folder_for_user(folder, request.user)  # also sets link.organization

        # handle uploaded file
        uploaded_file = request.data.get('file')
        if uploaded_file:
            link.write_uploaded_file(uploaded_file)

        # handle submitted url
        else:
            # create primary capture placeholder
            Capture(
                link=link,
                role='primary',
                status='pending',
                record_type='response',
                url=link.submitted_url,
            ).save()

            # create screenshot placeholder
            Capture(
                link=link,
                role='screenshot',
                status='pending',
                record_type='resource',
                url="file:///%s/cap.png" % link.guid,
                content_type='image/png',
            ).save()

            # kick off capture tasks -- no need for guid since it'll work through the queue
            capture_job.status = 'pending'
            capture_job.link = link
            capture_job.save(update_fields=['status', 'link'])
            run_task(run_next_capture.s())

        return Response(serializer.data, status=status.HTTP_201_CREATED)

    raise_invalid_capture_job(capture_job, serializer.errors)
def update_perma(link_guid):
    """
    Update the vested/darchived status of a perma link, and download the assets if necessary
    """
    # N.B. This function has two instances of downloading stuff from
    # the root server using a scheme that looks something like
    #     settings.SERVER + reverse("url_pattern")
    # This is nice because it means we don't have to repeat our URL
    # patterns from urls.py, but it hardcodes the fact that the root
    # server is another Perma instance. It's unclear to me which is a
    # better fact to abstract, but this is easier for now.

    ## First, let's get the metadata for this link. The metadata
    ## contains information about where we should place the assets (if
    ## we decide that we need them). This is also a fast check to make
    ## sure the link GUID is actually real.
    metadata_server = settings.UPSTREAM_SERVER['address']
    metadata_url = metadata_server + reverse("service_link_status", args=(link_guid,))
    metadata = requests.get(
        metadata_url,
        headers=settings.UPSTREAM_SERVER.get('headers', {})
    ).json()

    ## Next, let's see if we need to get the assets. If we have the
    ## Link object for this GUID, we're going to assume we already
    ## have what we need. It would make a little more sense to use the
    ## Asset object here instead, but we're definitely going to need
    ## to do stuff to the Link object so we might as well get that
    ## instead. In practice they should be ~one to one.
    try:
        link = Link.objects.get(guid=link_guid)
    except Link.DoesNotExist:
        ## We need to download the assets. We can download an archive
        ## from the assets server.
        assets_server = settings.UPSTREAM_SERVER['address']
        assets_url = assets_server + reverse("mirroring:link_assets", args=(link_guid,))

        # Temp paths can be relative because we're in run_in_tempdir()
        temp_zip_path = 'temp.zip'

        # Save remote zip file to disk, using streaming to avoid keeping large files in RAM.
        request = requests.get(
            assets_url,
            headers=settings.UPSTREAM_SERVER.get('headers', {}),
            stream=True)
        with open(temp_zip_path, 'wb') as f:
            for chunk in request.iter_content(1024):
                f.write(chunk)

        ## Extract the archive and change into the extracted folder.
        with zipfile.ZipFile(temp_zip_path, "r") as zipfh:
            #assets_path = os.path.dirname(os.path.join(settings.MEDIA_ROOT, metadata["path"]))
            zipfh.extractall()  # creates folder named [guid] in current temp dir

        temp_extracted_path = os.path.basename(metadata['path'])  # e.g. "1234-ABCD"

        # Save all extracted files to default_storage, using the path in metadata.
        for root, dirs, files in os.walk(temp_extracted_path):
            for file in files:
                source_file_path = os.path.join(root, file)  # e.g. "1234-ABCD/cap.png"
                dest_file_path = os.path.join(os.path.dirname(metadata['path']), source_file_path)  # e.g. 2014/6/10/18/37/1234-ABCD/cap.png
                with open(source_file_path, 'rb') as source_file:
                    default_storage.store_file(source_file, dest_file_path)

        ## We can now get some additional metadata that we'll need to
        ## create the Link object.
        with open(os.path.join(temp_extracted_path, "metadata.json"), "r") as fh:
            link_metadata = json.load(fh)

        ## We now have everything we need to initialize the Link object.
        link = Link(guid=link_guid)
        link.submitted_url = link_metadata["submitted_url"]
        link.submitted_title = link_metadata["submitted_title"]
        link.created_by = None  # XXX maybe we should do something with FakeUser here
        link.save(pregenerated_guid=True)  # We need to save this so that we can create an Asset object

        # This is a stupid hack to overcome the fact that the Link has
        # auto_now_add=True, so it's always going to be saved to the
        # current time on first creation.
        link.creation_timestamp = unserialize_datetime(link_metadata["creation_timestamp"])
        link.save()

        ## Lastly, let's create an Asset object for this Link.
        asset = Asset(link=link)
        asset.base_storage_path = metadata["path"]
        asset.image_capture = metadata["image_capture"]
        asset.warc_capture = metadata["source_capture"]
        asset.pdf_capture = metadata["pdf_capture"]
        asset.text_capture = metadata["text_capture"]
        asset.save()

    ## We can now add some of the data we got from the metadata to the Link object
    link.dark_archived = metadata["dark_archived"]
    link.vested = metadata["vested"]
    link.save()

    # If we have sub-mirrors, poke them to get a copy from us.
    if settings.DOWNSTREAM_SERVERS:
        run_task(poke_mirrors, link_guid=link_guid)
def obj_create(self, bundle, **kwargs):
    # We've received a request to archive a URL. That process is managed here.
    # We create a new entry in our datastore and pass the work off to our indexing
    # workers. They do their thing, updating the model as they go. When we get some minimum
    # set of results we can present the user (a guid for the link), we respond back.
    if settings.READ_ONLY_MODE:
        raise ImmediateHttpResponse(response=self.error_response(bundle.request, {
            'archives': {'__all__': "Perma has paused archive creation for scheduled maintenance. Please try again shortly."},
            'reason': "Perma has paused archive creation for scheduled maintenance. Please try again shortly.",
        }))

    # Make sure a limited user has links left to create
    links_remaining = bundle.request.user.get_links_remaining()
    if (bundle.request.user.has_limit() or not bundle.data.get('organization')) and links_remaining < 1:
        raise ImmediateHttpResponse(response=self.error_response(bundle.request, {
            'archives': {'__all__': "You've already reached your limit."},
            'reason': "You've already reached your limit.",
        }))

    # Return the number of remaining links after this one is created
    if bundle.request.user.has_limit() or not bundle.data.get('organization'):
        bundle.data['links_remaining'] = links_remaining - 1
    else:
        bundle.data['links_remaining'] = 'unlimited'

    # Runs validation (exception thrown if invalid), sets properties and saves the object
    bundle = super(LinkResource, self).obj_create(bundle, created_by=bundle.request.user)
    link = bundle.obj

    # put link in folder and handle Org settings based on folder
    folder = bundle.data.get('folder')
    if folder:
        if folder.organization and folder.organization.default_to_private:
            link.is_private = True
            link.save()
        link.move_to_folder_for_user(folder, bundle.request.user)  # also sets link.organization

    uploaded_file = bundle.data.get('file')
    if uploaded_file:
        # normalize file name to upload.jpg, upload.png, upload.gif, or upload.pdf
        mime_type = get_mime_type(uploaded_file.name)
        file_name = 'upload.%s' % mime_type_lookup[mime_type]['new_extension']
        warc_url = "file:///%s/%s" % (link.guid, file_name)

        capture = Capture(link=link,
                          role='primary',
                          status='success',
                          record_type='resource',
                          user_upload='True',
                          content_type=mime_type,
                          url=warc_url)
        uploaded_file.file.seek(0)
        capture.write_warc_resource_record(uploaded_file)
        capture.save()
    else:
        # create primary capture placeholder
        Capture(
            link=link,
            role='primary',
            status='pending',
            record_type='response',
            url=link.submitted_url,
        ).save()

        # create screenshot placeholder
        Capture(
            link=link,
            role='screenshot',
            status='pending',
            record_type='resource',
            url="file:///%s/cap.png" % link.guid,
            content_type='image/png',
        ).save()

        # kick off capture task
        run_task(proxy_capture.s(link.guid, bundle.request.META.get('HTTP_USER_AGENT', '')))

    return bundle
def post(self, request, format=None):
    """ Create new link. """
    data = request.data

    capture_job = CaptureJob(
        human=request.data.get('human', False),
        submitted_url=request.data.get('url', ''),
        created_by=request.user
    )
    if settings.ENABLE_BATCH_LINKS:
        # Batch is set directly on the request object by the LinkBatch api,
        # to prevent abuse of this feature by those POSTing directly to this route.
        if getattr(request, 'batch', None):
            capture_job.link_batch = LinkBatch.objects.get(id=request.batch)
    capture_job.save()

    # Set target folder, in order of preference:
    # - 'folder' key in data
    # - parent folder, if posting to /folders/:parent_id/archives
    # - user's personal folder
    try:
        folder = self.get_folder_from_request(request) or request.parent or request.user.root_folder
    except ValidationError as e:
        raise_invalid_capture_job(capture_job, e.detail)

    # Disallow creation of links in top-level sponsored folder
    if folder.is_sponsored_root_folder:
        error = "You can't make links directly in your Sponsored Links folder. Select a folder belonging to a sponsor."
        raise_invalid_capture_job(capture_job, error)

    # Make sure a limited user has links left to create
    if not folder.organization and not folder.sponsored_by:
        if not request.user.link_creation_allowed():
            if request.user.nonpaying:
                raise_invalid_capture_job(capture_job, "You've already reached your limit.")
            error = "Perma.cc cannot presently make additional Perma Links on your behalf. Visit your subscription settings page for more information."
            raise_invalid_capture_job(capture_job, error)
    else:
        registrar = folder.sponsored_by if folder.sponsored_by else folder.organization.registrar
        msg = None
        if folder.read_only:
            registrar_users = [user.email for user in registrar.active_registrar_users()]
            msg = f"Your registrar has made this folder read-only. For assistance, contact: {', '.join(registrar_users)}."
        if not registrar.link_creation_allowed():
            error = 'Perma.cc cannot presently make links on behalf of {}. '.format(registrar.name)
            if request.user.registrar:
                contact = 'Visit your settings for subscription information.'
            else:
                registrar_users = [user.email for user in registrar.active_registrar_users()]
                contact = 'For assistance with your subscription, contact: {}.'.format(", ".join(registrar_users))
            msg = error + contact
        if msg:
            raise_invalid_capture_job(capture_job, msg)

    serializer = self.serializer_class(data=data, context={'request': request})
    if serializer.is_valid():
        with transaction.atomic():
            # Technique from https://github.com/harvard-lil/capstone/blob/0f7fb80f26e753e36e0c7a6a199b8fdccdd318be/capstone/capapi/serializers.py#L121
            #
            # Fetch the current user data here inside a transaction, using select_for_update
            # to lock the row so we don't collide with any simultaneous requests
            user = request.user.__class__.objects.select_for_update().get(pk=request.user.pk)

            # If this is a Personal Link, and if the user only has bonus links left, decrement bonus links
            bonus_link = False
            if not folder.organization and not folder.sponsored_by:
                links_remaining, _, bonus_links = user.get_links_remaining()
                if bonus_links and not links_remaining:
                    # (this works because it's part of the same transaction with the select_for_update --
                    # we don't have to use the same object)
                    request.user.bonus_links = bonus_links - 1
                    request.user.save(update_fields=['bonus_links'])
                    bonus_link = True

            link = serializer.save(created_by=request.user, bonus_link=bonus_link)

        # put link in folder and handle Org settings based on folder
        if folder.organization and folder.organization.default_to_private:
            link.is_private = True
            link.save()
        link.move_to_folder_for_user(folder, request.user)  # also sets link.organization

        # handle uploaded file
        uploaded_file = request.data.get('file')
        if uploaded_file:
            link.write_uploaded_file(uploaded_file)

        # handle submitted url
        else:
            # create primary capture placeholder
            Capture(
                link=link,
                role='primary',
                status='pending',
                record_type='response',
                url=link.submitted_url,
            ).save()

            # create screenshot placeholder
            Capture(
                link=link,
                role='screenshot',
                status='pending',
                record_type='resource',
                url="file:///%s/cap.png" % link.guid,
                content_type='image/png',
            ).save()

            # kick off capture tasks -- no need for guid since it'll work through the queue
            capture_job.status = 'pending'
            capture_job.link = link
            capture_job.save(update_fields=['status', 'link'])
            run_task(run_next_capture.s())

        return Response(serializer.data, status=status.HTTP_201_CREATED)

    raise_invalid_capture_job(capture_job, serializer.errors)
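# The select_for_update technique in the function above generalizes beyond
# bonus links: lock the user's row, read the quota, decrement, and commit, so
# two simultaneous requests cannot both spend the last credit. A hedged generic
# sketch of that pattern; Account and its 'credits' field are hypothetical
# stand-ins, not models from this codebase:
from django.db import transaction

def spend_one_credit(user_pk):
    with transaction.atomic():
        # SELECT ... FOR UPDATE blocks concurrent transactions on this row
        account = Account.objects.select_for_update().get(pk=user_pk)
        if account.credits < 1:
            return False  # nothing left to spend
        account.credits -= 1
        account.save(update_fields=['credits'])
        return True  # row lock is released when the transaction commits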