def largefile_ingest(request):
    '''Large-file ingest.

    On GET, displays a form allowing user to select a BagIt that has
    been uploaded to the configured large-file ingest staging area for
    ingest and association with a collection.

    On POST, validates the form.  An invalid form is redisplayed with
    its errors.  A valid form triggers ingest of the selected bag as a
    :class:`DiskImage` or :class:`Video`, chosen by the name of the
    bag's parent directory (``diskimage`` or ``video``).  The outcome
    is reported through the messages framework and the user is
    redirected to ``/admin``.

    :param request: :class:`~django.http.HttpRequest`
    :returns: a redirect after an ingest attempt; otherwise a
        :class:`TemplateResponse` rendering the ingest form
    '''
    # object class to initialize for each supported staging subdirectory
    ingest_classes = {'diskimage': DiskImage, 'video': Video}

    context = {}
    template_name = 'file/largefile_ingest.html'
    form = None

    # on POST, process the form and ingest if valid
    if request.method == 'POST':
        form = LargeFileIngestForm(request.POST)

        if not form.is_valid():
            # add to context for redisplay with errors
            context['form'] = form
        else:
            repo = Repository(request=request)
            collection = repo.get_object(pid=form.cleaned_data['collection'],
                                         type=CollectionObject)
            # user comment if any; default to a generic ingest comment
            comment = form.cleaned_data['comment'] or 'initial repository ingest'
            bag = form.cleaned_data['bag']

            # dict with file info; success/failure details are added below
            file_info = {'label': os.path.basename(bag)}

            # type of ingest is assumed from the bag's parent directory;
            # a path without a parent yields '' and is rejected below
            # instead of raising IndexError outside the try block
            path_parts = bag.split('/')
            ingest_type = path_parts[-2] if len(path_parts) > 1 else ''
            is_video = ingest_type == 'video'

            try:
                obj_class = ingest_classes.get(ingest_type)
                if obj_class is None:
                    # reported to the user via the generic handler below
                    raise ValueError('Unsupported ingest type: %s' % ingest_type)

                obj = obj_class.init_from_bagit(bag, request)
                # set collection on ingest
                obj.collection = collection

                ## NOTE: Due to a bug in Fedora 3.4 with checksums and
                ## file uri ingest, the content datastream checksum
                ## must be cleared before ingest; manually check it
                ## after ingest to confirm Fedora calculated what we expect.
                ## This work-around can be removed once we upgrade to Fedora 3.6

                # store datastream checksum that would be sent to fedora,
                # then clear it so Fedora can ingest without erroring
                expected_checksum = obj.content.checksum
                obj._content_checksum = expected_checksum
                obj.content.checksum = None

                # file URIs are also used for supplemental files, which need
                # the same handling: store each supplementN checksum and clear it
                supplemental_checksums = {}
                for i in range(20):
                    dsid = 'supplement%d' % i
                    try:
                        dsobj = getattr(obj, dsid)
                        supplemental_checksums[dsid] = dsobj.checksum
                        dsobj.checksum = None
                    except AttributeError:
                        # stop iterating - we have found last supplemental file
                        break

                # same for access copy checksum on Video files
                if is_video:
                    access_checksum = obj.access_copy.checksum
                    obj.access_copy.checksum = None

                # refuse to ingest if an object of the same type and label
                # is already present in the repository
                matches = repo.find_objects(type=obj_class, label=obj.label)
                if any(match.pid for match in matches):
                    raise ValueError('Duplicate content detected.')
                obj.save(comment)

                # remove the ingested bag from large-file staging area
                shutil.rmtree(bag)

                # re-init to allow checking fedora-calculated checksums
                obj = repo.get_object(obj.pid, type=obj_class)

                # save succeeded (no exceptions): set summary info for display
                file_info.update({
                    'type': ingest_type,
                    'success': True,
                    'pid': obj.pid,
                    'url': obj.get_absolute_url(),
                    'checksum': obj.content.checksum,
                })
                if is_video:
                    file_info['access_checksum'] = obj.access_copy.checksum

                # compare checksums generated by Fedora against stored values
                # (required because of file uri bug in fedora 3.4;
                # this can be removed once we upgrade to fedora 3.6+)
                checksum_errors = []
                if obj.content.checksum != expected_checksum:
                    checksum_errors.append('content')
                for dsid, supplement_checksum in supplemental_checksums.items():
                    if obj.getDatastreamObject(dsid).checksum != supplement_checksum:
                        checksum_errors.append(dsid)
                if is_video and obj.access_copy.checksum != access_checksum:
                    checksum_errors.append('access_copy')

                if checksum_errors:
                    plural = len(checksum_errors) > 1
                    message = 'Checksum mismatch%s detected on ' + \
                        '%s datastream%s; please contact a repository administrator.'
                    file_info['message'] = message % (
                        'es' if plural else '',
                        ', '.join(checksum_errors),
                        's' if plural else '',
                    )

            except bagit.BagValidationError as err:
                logger.error(err)
                file_info.update({'success': False,
                                  'message': 'BagIt error: %s' % err})

            # special case: detected as duplicate content
            except DuplicateContent as e:
                # mark as failed and generate message with links to records
                # NOTE: pid url is duplicated logic from web upload view...
                links = []
                for pid in e.pids:
                    # use fedora type-inferring logic with list of content
                    # models pulled from solr results
                    existing = repo.get_object(
                        pid,
                        type=repo.best_subtype_for_object(pid, e.pid_cmodels[pid]))
                    # use appropriate object class to get the object url
                    links.append('<a href="%s">%s</a>' % (
                        existing.get_absolute_url(), pid))
                msg = mark_safe('%s: %s' % (unicode(e), '; '.join(links)))
                file_info.update({'success': False, 'message': msg})

            except Exception as err:
                logger.error('Error: %s', err)
                file_info.update({'success': False, 'message': '%s' % err})

            # report success/failure in the same format as web-upload ingest
            context['ingest_results'] = [file_info]
            # use an error-level message for failures so a failed ingest
            # is not presented to the user as a success
            if file_info.get('success'):
                notify = messages.success
            else:
                notify = messages.error
            notify(request, 'Ingest results: %s' % file_info)
            return HttpResponseRedirect("/admin")

    # on GET display form to select item(s) for ingest
    # (a valid POST has already returned above; the form check is kept
    # so this condition stays correct if that redirect is ever removed)
    files = large_file_uploads()
    if request.method == 'GET' or \
       (form is not None and form.is_valid()):
        if len(files):
            context['form'] = LargeFileIngestForm()
        else:
            # indicator that no files are available for ingest
            context['no_files'] = True

    return TemplateResponse(request, template_name, context)
def largefile_ingest(request):
    '''Large-file ingest.

    On GET, displays a form allowing user to select a BagIt that has
    been uploaded to the configured large-file ingest staging area for
    ingest and association with a collection.

    On POST, validates the form.  An invalid form is redisplayed with
    its errors.  A valid form triggers ingest of the selected bag as a
    :class:`DiskImage` or :class:`Video`, chosen by the name of the
    bag's parent directory (``diskimage`` or ``video``).  The outcome
    is stored in the template context as ``ingest_results`` and the
    page is rendered again, with a fresh form if files are still
    available for ingest.

    :param request: :class:`~django.http.HttpRequest`
    :returns: :class:`TemplateResponse` rendering the ingest page
    '''
    # object class to initialize for each supported staging subdirectory
    ingest_classes = {'diskimage': DiskImage, 'video': Video}

    context = {}
    template_name = 'file/largefile_ingest.html'
    form = None

    # on POST, process the form and ingest if valid
    if request.method == 'POST':
        form = LargeFileIngestForm(request.POST)

        if not form.is_valid():
            # add to context for redisplay with errors
            context['form'] = form
        else:
            repo = Repository(request=request)
            collection = repo.get_object(pid=form.cleaned_data['collection'],
                                         type=CollectionObject)
            # user comment if any; default to a generic ingest comment
            comment = form.cleaned_data['comment'] or 'initial repository ingest'
            bag = form.cleaned_data['bag']

            # dict with file info; success/failure details are added below
            file_info = {'label': os.path.basename(bag)}

            # type of ingest is assumed from the bag's parent directory;
            # a path without a parent yields '' and is rejected below
            # instead of raising IndexError outside the try block
            path_parts = bag.split('/')
            ingest_type = path_parts[-2] if len(path_parts) > 1 else ''
            is_video = ingest_type == 'video'

            try:
                obj_class = ingest_classes.get(ingest_type)
                if obj_class is None:
                    # reported to the user via the generic handler below
                    raise ValueError('Unsupported ingest type: %s' % ingest_type)

                obj = obj_class.init_from_bagit(bag, request)
                # set collection on ingest
                obj.collection = collection

                ## NOTE: Due to a bug in Fedora 3.4 with checksums and
                ## file uri ingest, the content datastream checksum
                ## must be cleared before ingest; manually check it
                ## after ingest to confirm Fedora calculated what we expect.
                ## This work-around can be removed once we upgrade to Fedora 3.6

                # store datastream checksum that would be sent to fedora,
                # then clear it so Fedora can ingest without erroring
                expected_checksum = obj.content.checksum
                obj._content_checksum = expected_checksum
                obj.content.checksum = None

                # file URIs are also used for supplemental files, which need
                # the same handling: store each supplementN checksum and clear it
                supplemental_checksums = {}
                for i in range(20):
                    dsid = 'supplement%d' % i
                    try:
                        dsobj = getattr(obj, dsid)
                        supplemental_checksums[dsid] = dsobj.checksum
                        dsobj.checksum = None
                    except AttributeError:
                        # stop iterating - we have found last supplemental file
                        break

                # same for access copy checksum on Video files
                if is_video:
                    access_checksum = obj.access_copy.checksum
                    obj.access_copy.checksum = None

                obj.save(comment)

                # remove the ingested bag from large-file staging area
                shutil.rmtree(bag)

                # re-init to allow checking fedora-calculated checksums
                obj = repo.get_object(obj.pid, type=obj_class)

                # save succeeded (no exceptions): set summary info for display
                file_info.update({
                    'type': ingest_type,
                    'success': True,
                    'pid': obj.pid,
                    'url': obj.get_absolute_url(),
                    'checksum': obj.content.checksum,
                })
                if is_video:
                    file_info['access_checksum'] = obj.access_copy.checksum

                # compare checksums generated by Fedora against stored values
                # (required because of file uri bug in fedora 3.4;
                # this can be removed once we upgrade to fedora 3.6+)
                checksum_errors = []
                if obj.content.checksum != expected_checksum:
                    checksum_errors.append('content')
                for dsid, supplement_checksum in supplemental_checksums.items():
                    if obj.getDatastreamObject(dsid).checksum != supplement_checksum:
                        checksum_errors.append(dsid)
                if is_video and obj.access_copy.checksum != access_checksum:
                    checksum_errors.append('access_copy')

                if checksum_errors:
                    plural = len(checksum_errors) > 1
                    message = 'Checksum mismatch%s detected on ' + \
                        '%s datastream%s; please contact a repository administrator.'
                    file_info['message'] = message % (
                        'es' if plural else '',
                        ', '.join(checksum_errors),
                        's' if plural else '',
                    )

            except bagit.BagValidationError as err:
                logger.error(err)
                file_info.update({
                    'success': False,
                    'message': 'BagIt error: %s' % err
                })

            # special case: detected as duplicate content
            except DuplicateContent as e:
                # mark as failed and generate message with links to records
                # NOTE: pid url is duplicated logic from web upload view...
                links = []
                for pid in e.pids:
                    # use fedora type-inferring logic with list of content
                    # models pulled from solr results
                    existing = repo.get_object(
                        pid,
                        type=repo.best_subtype_for_object(pid, e.pid_cmodels[pid]))
                    # use appropriate object class to get the object url
                    links.append('<a href="%s">%s</a>' % (
                        existing.get_absolute_url(), pid))
                msg = mark_safe('%s: %s' % (unicode(e), '; '.join(links)))
                file_info.update({'success': False, 'message': msg})

            except Exception as err:
                logger.error('Error: %s', err)
                file_info.update({'success': False, 'message': '%s' % err})

            # report success/failure in the same format as web-upload ingest
            context['ingest_results'] = [file_info]

    # on GET display form to select item(s) for ingest
    # OR on completed valid form post
    files = large_file_uploads()
    if request.method == 'GET' or \
       (form is not None and form.is_valid()):
        if len(files):
            context['form'] = LargeFileIngestForm()
        else:
            # indicator that no files are available for ingest
            context['no_files'] = True

    return TemplateResponse(request, template_name, context)
def ingest_files(files, collection, comment, request):
    '''Ingest a dictionary of files as returned by
    :meth:`keep.files.forms.UploadForm.files_to_ingest`.

    Returns a list of dictionaries, one per file, reporting ingest
    success or failure.  Each entry has a ``label`` and ``success``
    key; successful entries add ``pid``, ``url`` and ``checksum``,
    failed entries add a ``message``.

    :param files: dictionary of files to be ingested (filename -> label)
    :param collection: :class:`~keep.collection.models.CollectionObject`
        that newly ingested objects should be associated with
    :param comment: save message for fedora ingest
    :param request: :class:`~django.http.HttpRequest`, to access Fedora and
        ingest new objects as the logged-in user.
    :raises Exception: if an uploaded file can no longer be read when
        its mimetype is being determined
    '''
    # NOTE: using this structure for ease of display in django templates (e.g., regroup)
    results = []

    m = magic.Magic(mime=True)
    for filename, label in files.iteritems():
        file_info = {'label': label}

        # check if file is an allowed type
        # NOTE: for single-file upload, browser-set type is
        # available as UploadedFile.content_type - but since
        # browser mimetypes are unreliable, calculate anyway
        try:
            detected = m.from_file(filename)
        except IOError:
            raise Exception('Uploaded file is no longer available for ingest; please try again.')

        # drop any parameters following the mimetype (e.g. '; charset=...')
        mimetype = detected.partition(';')[0]
        if mimetype not in allowed_upload_types(request.user):
            # store error for display on detailed result page
            file_info.update({'success': False,
                              'message': '''File type '%s' is not allowed''' % mimetype})
            # if not an allowed type, no further processing
            results.append(file_info)
            continue

        if collection is None:
            file_info.update({'success': False,
                              'message': '''Collection not selected'''})
            results.append(file_info)
            continue

        # if there is an MD5 file (i.e., file was uploaded via ajax),
        # use the contents of that file as checksum
        md5_path = filename + '.md5'
        if os.path.exists(md5_path):
            with open(md5_path) as md5file:
                # strip so a trailing newline in the sidecar file cannot
                # turn a valid checksum into a mismatch
                md5 = md5file.read().strip()
        # otherwise, calculate the MD5 (single-file upload)
        else:
            md5 = md5sum(filename)

        # determine what type of object to initialize based on mimetype
        objtype = next((t for t in uploadable_objects
                        if mimetype in t.allowed_mimetypes), None)
        if objtype is None:
            # allowed type with no matching object class: report a per-file
            # failure rather than failing the whole batch on None
            file_info.update({
                'success': False,
                'message': "No ingest support available for file type '%s'" % mimetype})
            results.append(file_info)
            continue

        # initialize a new object from the file
        obj = objtype.init_from_file(filename, initial_label=label,
                                     request=request, checksum=md5,
                                     mimetype=mimetype)

        # set collection on ingest
        obj.collection = collection

        try:
            # NOTE: by sending a log message, we force Fedora to store an
            # audit trail entry for object creation, which doesn't happen otherwise
            obj.save(comment)
            file_info.update({'success': True, 'pid': obj.pid,
                              'url': obj.get_absolute_url(),
                              'checksum': md5})

            # if audio, needs an additional step:
            if objtype == AudioObject:
                # Start asynchronous task to convert audio for access
                # NOTE: not passing in user-upload file so that
                # celery can more easily be run on a separate server
                queue_access_copy(obj)

            # remove the file now that we have successfully ingested
            os.remove(filename)

            # NOTE: could remove MD5 file (if any) here, but MD5 files
            # should be small and will get cleaned up by the cron script

        # special case: detected as duplicate content
        except DuplicateContent as e:
            # mark as failed and generate message with links to records
            links = []
            repo = Repository(request=request)
            for pid in e.pids:
                # use fedora type-inferring logic with list of content models
                # pulled from solr results
                existing = repo.get_object(
                    pid,
                    type=repo.best_subtype_for_object(pid, e.pid_cmodels[pid]))
                # use appropriate object class to get the object url
                links.append('<a href="%s">%s</a>' % (
                    existing.get_absolute_url(), pid))
            msg = mark_safe('%s: %s' % (unicode(e), '; '.join(links)))
            file_info.update({'success': False, 'message': msg})

        except Exception as e:
            logger.error('Error ingesting %s: %s' % (filename, e))
            logger.debug("Error details:\n" + traceback.format_exc())
            file_info['success'] = False

            # check for Fedora-specific errors
            if isinstance(e, RequestFailed):
                # detail may be missing or None; treat that as no detail
                # instead of raising TypeError inside this handler
                detail = getattr(e, 'detail', None) or ''
                if 'Checksum Mismatch' in detail:
                    file_info['message'] = 'Ingest failed due to a checksum mismatch - ' + \
                        'file may have been corrupted or incompletely uploaded to Fedora'
                else:
                    file_info['message'] = 'Fedora error: ' + unicode(e)

            # non-fedora error
            else:
                file_info['message'] = 'Ingest failed: ' + unicode(e)

        finally:
            # no matter what happened, store results for reporting to user
            results.append(file_info)

    return results
def ingest_files(files, collection, comment, request):
    '''Ingest a dictionary of files as returned by
    :meth:`keep.files.forms.UploadForm.files_to_ingest`.

    Returns a list of dictionaries, one per file, reporting ingest
    success or failure.  Each entry has a ``label`` and ``success``
    key; successful entries add ``pid``, ``url`` and ``checksum``,
    failed entries add a ``message``.

    :param files: dictionary of files to be ingested (filename -> label)
    :param collection: :class:`~keep.collection.models.CollectionObject`
        that newly ingested objects should be associated with
    :param comment: save message for fedora ingest
    :param request: :class:`~django.http.HttpRequest`, to access Fedora and
        ingest new objects as the logged-in user.
    :raises Exception: if an uploaded file can no longer be read when
        its mimetype is being determined
    '''
    # NOTE: using this structure for ease of display in django templates (e.g., regroup)
    results = []

    m = magic.Magic(mime=True)
    for filename, label in files.iteritems():
        file_info = {'label': label}

        # check if file is an allowed type
        # NOTE: for single-file upload, browser-set type is
        # available as UploadedFile.content_type - but since
        # browser mimetypes are unreliable, calculate anyway
        try:
            detected = m.from_file(filename)
        except IOError:
            raise Exception(
                'Uploaded file is no longer available for ingest; please try again.'
            )

        # drop any parameters following the mimetype (e.g. '; charset=...')
        mimetype = detected.partition(';')[0]
        if mimetype not in allowed_upload_types(request.user):
            # store error for display on detailed result page
            file_info.update({
                'success': False,
                'message': '''File type '%s' is not allowed''' % mimetype
            })
            # if not an allowed type, no further processing
            results.append(file_info)
            continue

        if collection is None:
            file_info.update({
                'success': False,
                'message': '''Collection not selected'''
            })
            results.append(file_info)
            continue

        # if there is an MD5 file (i.e., file was uploaded via ajax),
        # use the contents of that file as checksum
        md5_path = filename + '.md5'
        if os.path.exists(md5_path):
            with open(md5_path) as md5file:
                # strip so a trailing newline in the sidecar file cannot
                # turn a valid checksum into a mismatch
                md5 = md5file.read().strip()
        # otherwise, calculate the MD5 (single-file upload)
        else:
            md5 = md5sum(filename)

        # determine what type of object to initialize based on mimetype
        objtype = next((t for t in uploadable_objects
                        if mimetype in t.allowed_mimetypes), None)
        if objtype is None:
            # allowed type with no matching object class: report a per-file
            # failure rather than failing the whole batch on None
            file_info.update({
                'success': False,
                'message': "No ingest support available for file type '%s'" % mimetype
            })
            results.append(file_info)
            continue

        # initialize a new object from the file
        obj = objtype.init_from_file(filename,
                                     initial_label=label,
                                     request=request,
                                     checksum=md5,
                                     mimetype=mimetype)

        # set collection on ingest
        obj.collection = collection

        try:
            # NOTE: by sending a log message, we force Fedora to store an
            # audit trail entry for object creation, which doesn't happen otherwise
            obj.save(comment)
            file_info.update({
                'success': True,
                'pid': obj.pid,
                'url': obj.get_absolute_url(),
                'checksum': md5
            })

            # if audio, needs an additional step:
            if objtype == AudioObject:
                # Start asynchronous task to convert audio for access
                # NOTE: not passing in user-upload file so that
                # celery can more easily be run on a separate server
                queue_access_copy(obj)

            # remove the file now that we have successfully ingested
            os.remove(filename)

            # NOTE: could remove MD5 file (if any) here, but MD5 files
            # should be small and will get cleaned up by the cron script

        # special case: detected as duplicate content
        except DuplicateContent as e:
            # mark as failed and generate message with links to records
            links = []
            repo = Repository(request=request)
            for pid in e.pids:
                # use fedora type-inferring logic with list of content models
                # pulled from solr results
                existing = repo.get_object(
                    pid,
                    type=repo.best_subtype_for_object(pid, e.pid_cmodels[pid]))
                # use appropriate object class to get the object url
                links.append('<a href="%s">%s</a>' %
                             (existing.get_absolute_url(), pid))
            msg = mark_safe('%s: %s' % (unicode(e), '; '.join(links)))
            file_info.update({'success': False, 'message': msg})

        except Exception as e:
            logger.error('Error ingesting %s: %s' % (filename, e))
            logger.debug("Error details:\n" + traceback.format_exc())
            file_info['success'] = False

            # check for Fedora-specific errors
            if isinstance(e, RequestFailed):
                # detail may be missing or None; treat that as no detail
                # instead of raising TypeError inside this handler
                detail = getattr(e, 'detail', None) or ''
                if 'Checksum Mismatch' in detail:
                    file_info['message'] = 'Ingest failed due to a checksum mismatch - ' + \
                        'file may have been corrupted or incompletely uploaded to Fedora'
                else:
                    file_info['message'] = 'Fedora error: ' + unicode(e)

            # non-fedora error
            else:
                file_info['message'] = 'Ingest failed: ' + unicode(e)

        finally:
            # no matter what happened, store results for reporting to user
            results.append(file_info)

    return results