def handle(self, *args, **options):
    self.options = options
    self.repaired_count = 0
    self.unrepaired_count = 0
    repo = Repository()
    self.pidman = DjangoPidmanRestClient()

    # populate list of objects to be processed
    objects = []
    for pid in args:
        try:
            obj = repo.get_object(pid=pid, type=CollectionObject)
            if obj.has_requisite_content_models:
                objects.append(obj)
            else:
                obj = repo.get_object(pid=pid, type=AudioObject)
                if obj.has_requisite_content_models:
                    objects.append(obj)
        except Exception:
            self.log(message="Could not find Collection or Audio object for: %s" % pid)

    # if no pids were specified, get a list of all collections from the
    # repository, limited to the COLLECTION_CONTENT_MODEL; this returns
    # Keep-specific collection objects
    if not args:
        objects = repo.get_objects_with_cmodel(
            CollectionObject.COLLECTION_CONTENT_MODEL, type=CollectionObject)
        if not objects:
            self.log(message="No Collections were found.")

    for obj in objects:
        self.repair_ark(obj)

    self.log(message="\n\n%s ARKs repaired\n%s ARKs were not repaired"
             % (self.repaired_count, self.unrepaired_count), no_label=True)
def englishdocs_collection():
    repo = Repository()
    obj = repo.get_object(type=CollectionObject)
    obj.label = 'English documents collection'
    obj.mods.content.title = 'English documents collection'
    obj.mods.content.source_id = '309'
    obj.collection = repo.get_object(FedoraFixtures.archives()[1].uri)
    obj.mods.content.create_origin_info()
    obj.mods.content.origin_info.created.append(mods.DateCreated(date=1509, point='start'))
    obj.mods.content.origin_info.created.append(mods.DateCreated(date=1805, point='end'))
    return obj
def rushdie_collection():
    repo = Repository()
    obj = repo.get_object(type=CollectionObject)
    obj.label = 'Salman Rushdie Collection'
    obj.mods.content.title = 'Salman Rushdie Collection'
    obj.mods.content.source_id = '1000'
    obj.collection = repo.get_object(FedoraFixtures.archives()[1].uri)
    obj.mods.content.create_origin_info()
    obj.mods.content.origin_info.created.append(mods.DateCreated(date=1947, point='start'))
    obj.mods.content.origin_info.created.append(mods.DateCreated(date=2008, point='end'))
    obj.mods.content.create_name()
    obj.mods.content.name.name_parts.append(mods.NamePart(text='Salman Rushdie'))
    return obj
def esterbrook_collection():
    repo = Repository()
    obj = repo.get_object(type=CollectionObject)
    obj.label = 'Thomas Esterbrook letter books'
    obj.mods.content.title = 'Thomas Esterbrook letter books'
    obj.mods.content.source_id = '123'
    obj.collection = repo.get_object(FedoraFixtures.archives()[2].uri)
    obj.mods.content.create_origin_info()
    obj.mods.content.origin_info.created.append(mods.DateCreated(date=1855, point='start'))
    obj.mods.content.origin_info.created.append(mods.DateCreated(date=1861, point='end'))
    obj.mods.content.create_name()
    obj.mods.content.name.name_parts.append(mods.NamePart(text='Thomas Esterbrook'))
    return obj
def view(request, pid):
    '''View a single :class:`~keep.video.models.Video`.

    User must either have general view video permissions, or if they have
    view researcher video permission, the object must be researcher
    accessible (based on rights codes).
    '''
    repo = Repository(request=request)
    obj = repo.get_object(pid=pid, type=Video)
    # user either needs view video permissions OR
    # view researcher video permission and a researcher-accessible object
    viewable = request.user.has_perm('video.view_video') or \
        (request.user.has_perm('video.view_researcher_video') and
         bool(obj.researcher_access))
    if not viewable:
        return prompt_login_or_403(request)

    try:
        if not obj.has_requisite_content_models:
            raise Http404
    except Exception:
        raise Http404

    return render(request, 'video/view.html', {"resource": obj})
class Command(BaseCommand):
    '''Generate access copies for PIDs specified on the command line.'''
    help = __doc__

    def handle(self, *args, **options):
        self.verbosity = options['verbosity']
        self.repo = Repository()
        for pid in args:
            self.process_pid(pid)

    def process_pid(self, pid):
        '''Process a single PID by looking it up in the repository,
        figuring out what kind of processing it needs based on its object
        type, and doing that.
        '''
        obj = self.repo.get_object(pid=pid, type=self.repo.infer_object_subtype)
        if not obj.exists:
            if self.verbosity >= 1:
                print "No such PID; skipped:", pid
            return
        if isinstance(obj, AudioObject):
            if self.verbosity >= 2:
                print "Generating audio access copy:", pid
            queue_access_copy(obj)
        else:
            if self.verbosity >= 1:
                print "Unhandled object type; skipped:", pid
def archives(format=None):
    if format == dict:
        return [{'title': nick, 'pid': pid}
                for nick, pid in settings.PID_ALIASES.iteritems()]
    if not hasattr(FedoraFixtures, '_archives'):
        repo = Repository()
        FedoraFixtures._archives = [repo.get_object(pid, type=CollectionObject)
                                    for pid in settings.PID_ALIASES.itervalues()]
    return FedoraFixtures._archives
def download(request, pid):
    'Download disk image datastream contents'
    repo = Repository(request=request)
    obj = repo.get_object(pid, type=DiskImage)
    extra_headers = {
        'Content-Disposition': "attachment; filename=%s.%s" %
            (obj.noid, obj.provenance.content.object.latest_format.name)
    }
    return raw_datastream(request, pid, DiskImage.content.id, repo=repo,
                          headers=extra_headers)
def simple_collection(label=None, status=None, pid=None):
    repo = Repository()
    obj = repo.get_object(type=SimpleCollection)
    if label is not None:
        obj.label = label
    obj.mods.content.create_restrictions_on_access()
    if status is not None:
        obj.mods.content.restrictions_on_access.text = status
    if pid is not None:
        obj.pid = pid
    return obj
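
# Usage sketch for the simple_collection() fixture factory above (a sketch,
# assuming a test repository; the label and status values are illustrative
# only). The factory returns an *unsaved* object, so the caller is
# responsible for saving it and cleaning up afterwards.
def example_simple_collection():
    batch = simple_collection(label='Test batch', status='Accessioned')
    batch.save()
    try:
        assert batch.mods.content.restrictions_on_access.text == 'Accessioned'
    finally:
        # purge the test object, as the test tearDowns in this corpus do
        Repository().purge_object(batch.pid)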
def _objects_by_type(type_uri, type=None):
    """Generator: find objects with the specified type_uri and yield them
    as instances of the specified type.

    :param type_uri: the URI of the RDF type being searched
    :param type: the class of object that should be returned
    """
    repo = Repository()
    pids = repo.risearch.get_subjects(RDF.type, type_uri)
    for pid in list(pids):
        yield repo.get_object(pid=pid, type=type)
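
# Minimal usage sketch for _objects_by_type() above. Because it is a
# generator, the risearch query runs up front but objects are only
# instantiated as the caller iterates. The type_uri argument here is a
# hypothetical placeholder; pass whatever rdf:type URI your objects carry
# in the resource index.
def example_objects_by_type(type_uri):
    for obj in _objects_by_type(type_uri, type=SimpleCollection):
        print obj.pid, obj.label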
def tasks(request, pid):
    '''Manage tasks associated with an
    :class:`~keep.audio.models.AudioObject`.  Currently, the only supported
    functionality is to queue access copy conversion; this should be done by
    POSTing the type of task to be queued, i.e. **generate access copy**.

    Supported tasks:

      * **generate access copy** - queue access copy conversion for an audio
        item by pid.  Returns a status message as the body of a text/plain
        response.

    :param pid: the pid of the object for which tasks should be queued
    '''
    if request.method == 'POST':
        status = "queued"
        task_type = request.POST.get('task', None)
        # TODO: may want to prevent queuing more than one task at a time or
        # within a time period.  For now javascript disables the link until
        # the page is refreshed.

        # currently the only supported task is access copy generation
        if task_type == 'generate access copy':
            try:
                repo = Repository(request=request)
                obj = repo.get_object(pid, type=AudioObject)
                # if object doesn't exist or isn't an audio item, 404
                if not obj.exists or not obj.has_requisite_content_models:
                    raise Http404
                queue_access_copy(obj)
                status = 'Successfully queued access copy conversion'
            except Exception as err:
                # re-raise any 404 error
                if isinstance(err, Http404):
                    raise
                logger.error('Error queueing access copy conversion for %s : %s'
                             % (pid, err))
                status = 'Error queueing access copy conversion (%s)' % err
            return HttpResponse(status, content_type='text/plain')
        # unsupported task
        else:
            return HttpResponse('Task "%s" is not supported' % task_type,
                                content_type='text/plain', status=500)
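
# Hedged sketch of driving the tasks view above the way the page javascript
# does: POST the task name to the view's URL.  Uses Django's test client;
# the URL name 'audio:tasks' and the login credentials are assumptions for
# illustration, not confirmed names from this codebase.
from django.core.urlresolvers import reverse
from django.test import Client

def example_queue_access_copy(pid):
    client = Client()
    client.login(username='staff', password='secret')  # hypothetical credentials
    response = client.post(reverse('audio:tasks', kwargs={'pid': pid}),
                           {'task': 'generate access copy'})
    print response.status_code, response.content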
def create_from_findingaid(request):
    form = FindCollection(request.POST)
    if not form.is_valid():
        messages.error(request, 'Form is not valid; please try again.')
    else:
        data = form.cleaned_data
        q = CollectionObject.item_collection_query()
        # submitted value is pid alias; lookup pid for solr query
        archive_id = settings.PID_ALIASES[data['archive']]
        q = q.query(archive_id=archive_id, source_id=data['collection'])
        # if collection is found, redirect to collection view with message
        if q.count():
            messages.info(request, 'Found %d collection%s for %s %s.' %
                          (q.count(), 's' if q.count() != 1 else '',
                           data['archive'].upper(), data['collection']))
            return HttpResponseSeeOtherRedirect(
                reverse('collection:view', kwargs={'pid': q[0]['pid']}))
        else:
            # otherwise, create the new record and redirect to new
            # collection edit page
            repo = Repository(request=request)
            coll_id = data['collection']
            coll = None
            try:
                archive = repo.get_object(archive_id, type=CollectionObject)
                fa = FindingAid.find_by_unitid(unicode(coll_id),
                                               archive.mods.content.title)
                coll = fa.generate_collection()
                coll.collection = archive
                coll.save()
                messages.info(request, 'Added %s for collection %s: %s' %
                              (coll, coll_id, coll.mods.content.title))
                return HttpResponseSeeOtherRedirect(
                    reverse('collection:edit', kwargs={'pid': coll.pid}))
            except DoesNotExist:
                messages.error(request, 'No EAD found for %s in %s' %
                               (coll_id, data['archive'].upper()))
            except ReturnedMultiple:
                messages.error(request, 'Multiple EADs found for %s in %s' %
                               (coll_id, data['archive'].upper()))
            except RequestFailed as err:
                print err
                messages.error(request, 'Failed to save new collection')

    return HttpResponseSeeOtherRedirect(reverse('repo-admin:dashboard'))
def init_from_file(filename, initial_label=None, request=None, checksum=None,
                   mimetype=None):
    '''Static method to create a new :class:`AudioObject` instance from
    a file.  Sets the object label and metadata title based on the initial
    label specified, or the file basename.  Calculates and stores the
    duration based on the file.  Also sets the following default metadata
    values:

        * mods:typeOfResource = "sound recording"
        * dt:codecQuality = "lossless"

    :param filename: full path to the audio file, as a string
    :param initial_label: optional initial label to use; if not specified,
        the base name of the specified file will be used
    :param request: :class:`django.http.HttpRequest` passed into a view
        method; must be passed in order to connect to Fedora as the
        currently-logged-in user
    :param checksum: the checksum of the file being sent to fedora
    :returns: :class:`AudioObject` initialized from the file
    '''
    if initial_label is None:
        initial_label = os.path.basename(filename)
    repo = Repository(request=request)
    obj = repo.get_object(type=AudioObject)
    # set initial object label from the base filename
    obj.label = initial_label
    obj.dc.content.title = obj.mods.content.title = obj.label
    obj.audio.content = open(filename)  # FIXME: at what point does/should this get closed?
    # set the file checksum, if given
    obj.audio.checksum = checksum
    # set content datastream mimetype if passed in
    if mimetype is not None:
        obj.audio.mimetype = mimetype
    # get the label, minus the ".wav" extension (mimetype indicates that)
    obj.audio.label = initial_label[:-4]
    # set initial mods:typeOfResource - all AudioObjects default to sound recording
    obj.mods.content.resource_type = 'sound recording'
    # set codec quality to lossless in digital tech metadata
    # - default for AudioObjects, which should only accept lossless audio
    #   for the master file
    obj.digitaltech.content.codec_quality = 'lossless'
    # get wav duration and store in digital tech metadata
    obj.digitaltech.content.duration = '%d' % round(wav_duration(filename))
    return obj
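
# Usage sketch for AudioObject.init_from_file() above, mirroring the ingest
# path in ingest_files() further down: initialize from a local WAV file, then
# save with an ingest comment.  The file path is a hypothetical placeholder;
# md5sum() is the checksum helper used elsewhere in this corpus.
def example_audio_ingest(request):
    filename = '/tmp/example.wav'  # hypothetical path
    obj = AudioObject.init_from_file(filename,
                                     initial_label='example.wav',
                                     request=request,
                                     checksum=md5sum(filename),
                                     mimetype='audio/x-wav')
    obj.save('initial repository ingest')
    return obj.pid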
def disk_images(self):
    self.stderr.write('Disk images')
    ### disk images
    # representative sample of aff and ad1
    # DO NOT include anything in these collections:
    # Trethewey (ghsdj), Rushdie (94k9k), Mackey (g1btw),
    # Clifton (94kf4), and Grennan (9k0st)
    solr = solr_interface()
    repo = Repository()
    q = solr.query(content_model=DiskImage.DISKIMAGE_CONTENT_MODEL) \
            .exclude(collection_id=self.collections['trethewey']) \
            .exclude(collection_id=self.collections['rushdie']) \
            .exclude(collection_id=self.collections['mackey']) \
            .exclude(collection_id=self.collections['clifton']) \
            .exclude(collection_id=self.collections['grennan']) \
            .field_limit('pid')
    if self.verbosity >= self.v_normal:
        self.stderr.write('Found %d disk images not in restricted collections'
                          % q.count())

    # currently there is no way to filter on format or size in either
    # solr or fedora risearch, so go through individually and group them
    # by type, then sort by size and pick the smallest ones
    diskimgs_by_type = defaultdict(list)
    for result in q:
        diskimg = repo.get_object(result['pid'], type=DiskImage)
        if not diskimg.exists:
            if self.verbosity >= self.v_normal:
                self.stderr.write('Referenced disk image %s does not exist or is inaccessible'
                                  % result['pid'])
            continue
        fmt = diskimg.provenance.content.object.format.name
        diskimgs_by_type[fmt].append(diskimg)

    for fmt, diskimages in diskimgs_by_type.iteritems():
        if self.verbosity >= self.v_normal:
            self.stderr.write('Selecting %s disk images' % fmt)
        # sort on binary file size so we sync the smallest ones
        diskimages = sorted(diskimages, key=lambda diskimg: diskimg.content.size)
        # use the first 10 of each type
        for d in diskimages[:10]:
            self.stdout.write(d.pid)
def view(request, pid):
    '''View a single :class:`~keep.collection.models.CollectionObject`,
    with a paginated list of all items in that collection.
    '''
    repo = Repository(request=request)
    obj = repo.get_object(pid, type=CollectionObject)
    # if pid doesn't exist or isn't a collection, 404
    if not obj.exists or not obj.has_requisite_content_models:
        raise Http404

    # search for all items that belong to this collection
    q = obj.solr_items_query()
    q = q.sort_by('date_created') \
         .sort_by('date_issued') \
         .sort_by('title_exact')
    # filter by logged-in user permissions
    # (includes researcher-accessible content filter when appropriate)
    q = filter_by_perms(q, request.user)

    # if the current user can only view researcher-accessible collections and
    # no items were found, they don't have permission to view this collection
    if not request.user.has_perm('collection.view_collection') and \
            request.user.has_perm('collection.view_researcher_collection') and \
            q.count() == 0:
        return prompt_login_or_403(request)

    # paginate the solr result set
    paginator = Paginator(q, 30)
    try:
        page = int(request.GET.get('page', '1'))
    except ValueError:
        page = 1
    try:
        results = paginator.page(page)
    except (EmptyPage, InvalidPage):
        results = paginator.page(paginator.num_pages)

    # url parameters for pagination links
    url_params = request.GET.copy()
    if 'page' in url_params:
        del url_params['page']

    return TemplateResponse(request, 'collection/view.html',
                            {'collection': obj, 'items': results,
                             'url_params': urlencode(url_params)})
def find_by_field(field, value, repo=None):
    '''Static method to find a single :class:`EmailMessage` by an indexed
    value.  Looks for the item in Solr and returns an :class:`EmailMessage`
    instance initialized from the repository if a single match is found for
    the requested field and value.

    Raises :class:`django.core.exceptions.MultipleObjectsReturned` if more
    than one match is found; raises
    :class:`django.core.exceptions.ObjectDoesNotExist` if no matches are
    found in the Solr index.

    :param field: solr field to search
    :param value: value to search on in the specified field
    :param repo: optional :class:`eulfedora.server.Repository` to use an
        existing connection with specific credentials
    :returns: :class:`EmailMessage`
    '''
    solr = solr_interface()
    search_terms = {
        field: value,
        'content_model': ArrangementObject.ARRANGEMENT_CONTENT_MODEL
    }
    q = solr.query(**search_terms).field_limit('pid')

    # check that we found one and only one
    found = len(q)
    # borrowing custom django exceptions for not found / too many matches
    if found > 1:
        raise MultipleObjectsReturned('Found %d records with %s %s' %
                                      (found, field, value))
    if not found:
        raise ObjectDoesNotExist('No record found with %s %s' % (field, value))

    if repo is None:
        repo = Repository()
    return repo.get_object(q[0]['pid'], type=EmailMessage)
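
# Usage sketch for find_by_field() above: look up an EmailMessage by an
# indexed solr field and handle the not-found case.  The content_md5 field
# name matches the one exercised by the tests further down in this section.
def example_find_message(md5):
    try:
        return EmailMessage.find_by_field('content_md5', md5)
    except ObjectDoesNotExist:
        return None  # no indexed message with this checksum
    # MultipleObjectsReturned propagates; the caller must disambiguate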
def playlist(request, pid):
    # FIXME: this needs last-modified so the browser can cache!
    # NOTE: preliminary logic duplicated from the view above
    repo = Repository(request=request)
    obj = repo.get_object(pid, type=CollectionObject)
    # if pid doesn't exist or isn't a collection, 404
    if not obj.exists or not obj.has_requisite_content_models:
        raise Http404

    # search for all items that belong to this collection
    q = obj.solr_items_query()
    q = q.sort_by('date_created') \
         .sort_by('date_issued') \
         .sort_by('title_exact')
    # filter by logged-in user permissions
    # (includes researcher-accessible content filter when appropriate)
    q = filter_by_perms(q, request.user)

    # if the current user can only view researcher-accessible collections and
    # no items were found, they don't have permission to view this collection
    if not request.user.has_perm('collection.view_collection') and \
            request.user.has_perm('collection.view_researcher_collection') and \
            q.count() == 0:
        return prompt_login_or_403(request)

    playlist = []
    for result in q:
        # skip non-audio items, or audio without access copies
        if result['object_type'] != 'audio' or not result['has_access_copy']:
            continue
        data = {
            'title': result['title'],
            'free': False  # explicitly mark as not downloadable
        }
        if result['access_copy_mimetype'] == 'audio/mp4':
            audio_type = 'm4a'
        else:
            audio_type = 'mp3'
        data[audio_type] = reverse('audio:download-compressed-audio',
                                   kwargs={'pid': result['pid'],
                                           'extension': audio_type})
        playlist.append(data)

    return HttpResponse(json.dumps(playlist), content_type='application/json')
def simple_edit(request, pid=None):
    '''Edit an existing Fedora
    :class:`~keep.collection.models.SimpleCollection`.  If a pid is
    specified, attempts to retrieve an existing object.
    '''
    repo = Repository(request=request)
    try:
        obj = repo.get_object(pid=pid, type=SimpleCollection)
        if request.method == 'POST':
            form = SimpleCollectionEditForm(request.POST)
            if form.is_valid():
                status = form.cleaned_data['status']
                if status == obj.mods.content.restrictions_on_access.text:
                    # don't queue a job if there is no change
                    messages.info(request, 'Status is unchanged')
                else:
                    # queue celery task to update items in this batch
                    queue_batch_status_update(obj, status)
                    messages.info(
                        request,
                        'Batch status update has been queued; ' +
                        'please check later via <a href="%s">recent tasks</a> page'
                        % reverse('tasks:recent'))
        else:
            # just display the form
            form = SimpleCollectionEditForm(
                initial={'status': obj.mods.content.restrictions_on_access.text})

    except RequestFailed as e:
        # if there was a 404 accessing objects, raise http404
        # NOTE: this probably doesn't distinguish between object exists with
        # no MODS and object does not exist at all
        if e.code == 404:
            raise Http404
        # otherwise, re-raise and handle as a common fedora connection error
        else:
            raise

    # render the edit form; the snippet originally ended without a return,
    # so the template name here is assumed from the app's naming convention
    return TemplateResponse(request, 'collection/simple_edit.html',
                            {'form': form, 'obj': obj})
def by_arrangement_id(id, repo=None):
    '''Static method to find an :class:`ArrangementObject` by its local or
    arrangement id.  Looks for the item in Solr and returns an
    :class:`ArrangementObject` instance initialized from the repository if a
    single match is found for the requested id.

    Raises :class:`django.core.exceptions.MultipleObjectsReturned` if more
    than one match is found; raises
    :class:`django.core.exceptions.ObjectDoesNotExist` if no matches are
    found in the Solr index.

    :param id: arrangement id or local id
    :param repo: optional :class:`eulfedora.server.Repository` to use an
        existing connection with specific credentials
    :returns: :class:`ArrangementObject`
    '''
    solr = solr_interface()
    q = solr.query(arrangement_id=id,
                   content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL) \
            .field_limit('pid')

    # check that we found one and only one
    found = len(q)
    # borrowing custom django exceptions for not found / too many matches
    if found > 1:
        raise MultipleObjectsReturned('Found %d records with arrangement id %s' %
                                      (found, id))
    if not found:
        raise ObjectDoesNotExist('No record found with arrangement id %s' % id)

    if repo is None:
        repo = Repository()
    return repo.get_object(q[0]['pid'], type=ArrangementObject)
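
# Usage sketch for by_arrangement_id() above: passing an existing Repository
# lets the lookup reuse the request user's credentials instead of opening a
# default connection, per the docstring.
def example_by_arrangement_id(request, local_id):
    repo = Repository(request=request)
    return ArrangementObject.by_arrangement_id(local_id, repo=repo)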
def view(request, pid):
    '''View a single :class:`~keep.audio.models.AudioObject`.

    User must either have general view audio permissions, or if they have
    view researcher audio permission, the object must be researcher
    accessible (based on rights codes).
    '''
    repo = Repository(request=request)
    obj = repo.get_object(pid, type=AudioObject)
    # user either needs view audio permissions OR
    # view researcher audio permission and a researcher-accessible object
    if not request.user.has_perm('audio.view_audio') and \
            not (request.user.has_perm('audio.view_researcher_audio') and
                 bool(obj.researcher_access)):
        return prompt_login_or_403(request)

    try:
        if not obj.has_requisite_content_models:
            raise Http404
    except Exception:
        raise Http404

    return TemplateResponse(request, 'audio/view.html', {'resource': obj})
def largefile_ingest(request):
    '''Large-file ingest.

    On GET, displays a form allowing the user to select a BagIt that has
    been uploaded to the configured large-file ingest staging area, for
    ingest and association with a collection.
    '''
    # ingest content from upload staging area
    context = {}
    template_name = 'file/largefile_ingest.html'
    form = None

    # on POST, process the form and ingest if valid
    if request.method == 'POST':
        form = LargeFileIngestForm(request.POST)
        # if form is not valid, add to context for redisplay with errors
        if not form.is_valid():
            context['form'] = form
        # otherwise, process the form
        else:
            repo = Repository(request=request)
            # get collection & check for optional comment
            collection = repo.get_object(pid=form.cleaned_data['collection'],
                                         type=CollectionObject)
            # get user comment if any; default to a generic ingest comment
            comment = form.cleaned_data['comment'] or 'initial repository ingest'
            bag = form.cleaned_data['bag']

            # create dict with file info, to hold success/failure info
            file_info = {'label': os.path.basename(bag)}
            # assuming type of ingest from subdirectory
            type = bag.split('/')[-2]
            try:
                if type == 'diskimage':
                    obj = DiskImage.init_from_bagit(bag, request)
                elif type == 'video':
                    obj = Video.init_from_bagit(bag, request)
                # set collection on ingest
                obj.collection = collection

                ## NOTE: Due to a bug in Fedora 3.4 with checksums and
                ## file uri ingest, the content datastream checksum
                ## must be cleared before ingest; manually check it
                ## after ingest to confirm Fedora calculated what we expect.
                ## This work-around can be removed once we upgrade to Fedora 3.6

                # store datastream checksum that would be sent to fedora
                checksum = obj.content.checksum
                obj._content_checksum = checksum
                # clear it out so Fedora can ingest without erroring
                obj.content.checksum = None

                # file URIs are also used for supplemental files, which need
                # to be handled the same way as the content datastream
                # - look for any supplementN datastreams, store their
                #   checksums, and clear them
                supplemental_checksums = {}
                for i in range(20):
                    try:
                        dsid = 'supplement%d' % i
                        dsobj = getattr(obj, dsid)
                        supplemental_checksums[dsid] = dsobj.checksum
                        dsobj.checksum = None
                    except AttributeError:
                        # stop iterating - we have found the last supplemental file
                        break

                # same for access copy checksum on Video files
                if type == 'video':
                    access_checksum = obj.access_copy.checksum
                    obj.access_copy.checksum = None

                # check for existing objects with the same label before saving
                pids_exists = []
                if type == 'video':
                    pids_exists = repo.find_objects(type=Video, label=obj.label)
                if type == 'diskimage':
                    pids_exists = repo.find_objects(type=DiskImage, label=obj.label)
                exists = 0
                for pid in pids_exists:
                    if pid.pid:
                        exists += 1
                if exists == 0:
                    obj.save(comment)
                else:
                    raise ValueError('Duplicate content detected.')

                # remove the ingested bag from large-file staging area
                shutil.rmtree(bag)

                # re-init to allow checking fedora-calculated checksums on
                # supplemental datastreams
                if type == 'diskimage':
                    obj = repo.get_object(obj.pid, type=DiskImage)
                elif type == 'video':
                    obj = repo.get_object(obj.pid, type=Video)

                # if save succeeded (no exceptions), set summary info for display
                file_info.update({'type': type, 'success': True,
                                  'pid': obj.pid,
                                  'url': obj.get_absolute_url(),
                                  'checksum': obj.content.checksum})
                if type == 'video':
                    file_info['access_checksum'] = obj.access_copy.checksum

                # compare checksums generated by Fedora
                # (required because of the file uri bug in fedora 3.4;
                # this can be removed once we upgrade to fedora 3.6+)
                checksum_errors = []
                if obj.content.checksum != checksum:
                    checksum_errors.append('content')
                for dsid, checksum in supplemental_checksums.iteritems():
                    dsobj = obj.getDatastreamObject(dsid)
                    if dsobj.checksum != checksum:
                        checksum_errors.append(dsid)
                if type == 'video' and obj.access_copy.checksum != access_checksum:
                    checksum_errors.append('access_copy')
                if checksum_errors:
                    message = 'Checksum mismatch%s detected on ' + \
                              '%s datastream%s; please contact a repository administrator.'
                    file_info['message'] = message % (
                        'es' if len(checksum_errors) > 1 else '',
                        ', '.join(checksum_errors),
                        's' if len(checksum_errors) > 1 else '')

            except bagit.BagValidationError as err:
                logger.error(err)
                file_info.update({'success': False,
                                  'message': 'BagIt error: %s' % err})
            # special case: detected as duplicate content
            except DuplicateContent as e:
                # mark as failed and generate message with links to records
                # NOTE: pid url is duplicated logic from web upload view...
                links = []
                for pid in e.pids:
                    # use fedora type-inferring logic with list of content models
                    # pulled from solr results
                    obj = repo.get_object(
                        pid, type=repo.best_subtype_for_object(pid, e.pid_cmodels[pid]))
                    # use appropriate object class to get the object url
                    links.append('<a href="%s">%s</a>' % (obj.get_absolute_url(), pid))
                msg = mark_safe('%s: %s' % (unicode(e), '; '.join(links)))
                file_info.update({'success': False, 'message': msg})
            except Exception as err:
                logger.error('Error: %s' % err)
                file_info.update({'success': False, 'message': '%s' % err})

            # report success/failure in the same format as web-upload ingest
            context['ingest_results'] = [file_info]
            messages.success(request, 'Ingest results: %s' % file_info)
            return HttpResponseRedirect("/admin")

    # on GET, display form to select item(s) for ingest,
    # OR on completed valid form post
    files = large_file_uploads()
    if request.method == 'GET' or \
            form is not None and form.is_valid():
        if len(files):
            context['form'] = LargeFileIngestForm()
        else:
            # indicator that no files are available for ingest
            context['no_files'] = True
    return TemplateResponse(request, template_name, context)
def generate_collection(self):
    '''Generate a :class:`CollectionObject` with fields pre-populated
    based on the contents of the current Finding Aid object.
    '''
    repo = Repository()
    coll = repo.get_object(type=CollectionObject)
    # TODO: archive membership?

    # title - using 'short' form without unitdate, stripping any trailing
    # whitespace & . or ,
    # TODO/FIXME: does NOT work for unittitles with nested tags,
    # e.g. title - see pomerantz
    coll.mods.content.title = unicode(self.unittitle.short).rstrip().rstrip('.,')

    # main entry/name - origination, if any
    if self.archdesc.did.origination:
        name_text = unicode(self.archdesc.did.origination)
        # determine type of name
        colltype = self.archdesc.did.node.xpath(
            '''local-name(e:origination/e:persname |
                          e:origination/e:corpname |
                          e:origination/e:famname)''',
            namespaces=self.ROOT_NAMESPACES)
        # initialize to None so an unrecognized name type is skipped below
        name_type = None
        if colltype == 'persname':
            name_type = 'personal'
        elif colltype == 'famname':
            name_type = 'family'
            # family names consistently end with a period, which can be removed
            name_text = name_text.rstrip('.')
        elif colltype == 'corpname':
            name_type = 'corporate'

        if name_type is not None:
            coll.mods.content.create_name()
            coll.mods.content.name.type = name_type
            authority = self.archdesc.did.node.xpath(
                'string(e:origination/*/@source)',
                namespaces=self.ROOT_NAMESPACES)
            # lcnaf in the EAD is equivalent to naf in MODS
            if authority == 'lcnaf':
                coll.mods.content.name.authority = 'naf'
            coll.mods.content.name.name_parts.append(mods.NamePart(text=name_text))

    # date coverage
    if self.coverage:
        date_encoding = {'encoding': 'w3cdtf'}
        coll.mods.content.create_origin_info()
        # date range
        if '/' in self.coverage:
            start, end = self.coverage.split('/')
            coll.mods.content.origin_info.created.append(
                mods.DateCreated(date=start, point='start', key_date=True,
                                 **date_encoding))
            coll.mods.content.origin_info.created.append(
                mods.DateCreated(date=end, point='end', **date_encoding))
        # single date
        else:
            coll.mods.content.origin_info.created.append(
                mods.DateCreated(date=self.coverage, key_date=True,
                                 **date_encoding))

    # source id - numeric form of the manuscript/archive collection number
    coll.mods.content.source_id = self.archdesc.did.unitid.identifier

    # access restriction
    if self.archdesc.access_restriction:
        coll.mods.content.create_restrictions_on_access()
        coll.mods.content.restrictions_on_access.text = "\n".join(
            [unicode(c) for c in self.archdesc.access_restriction.content])

    # use & reproduction
    if self.archdesc.use_restriction:
        coll.mods.content.create_use_and_reproduction()
        coll.mods.content.use_and_reproduction.text = "\n".join(
            [unicode(c) for c in self.archdesc.use_restriction.content])

    # set initial mods:typeOfResource - not specified in EAD, but all
    # collections should be mixed material
    coll.mods.content.resource_type = 'mixed material'

    # EAD url - where does this go?
    # accessible at self.eadid.url
    return coll
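
# Usage sketch for generate_collection() above, condensed from the
# create_from_findingaid view earlier in this section: find the EAD by unit
# id within an archive, derive a CollectionObject, and save it.
def example_collection_from_ead(repo, archive_pid, unitid):
    archive = repo.get_object(archive_pid, type=CollectionObject)
    fa = FindingAid.find_by_unitid(unicode(unitid), archive.mods.content.title)
    coll = fa.generate_collection()
    coll.collection = archive
    coll.save()
    return coll.pid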
def ingest_files(files, collection, comment, request):
    '''Ingest a dictionary of files as returned by
    :meth:`keep.files.forms.UploadForm.files_to_ingest`.
    Returns a list of dictionaries reporting per-file ingest success or
    failure.

    :param files: dictionary of files to be ingested
    :param collection: :class:`~keep.collection.models.CollectionObject` that
        newly ingested objects should be associated with
    :param comment: save message for fedora ingest
    :param request: :class:`~django.http.HttpRequest`, to access Fedora and
        ingest new objects as the logged-in user
    '''
    # NOTE: using this structure for ease of display in django templates (e.g., regroup)
    results = []

    m = magic.Magic(mime=True)
    for filename, label in files.iteritems():
        file_info = {'label': label}

        # check if file is an allowed type
        # NOTE: for single-file upload, the browser-set type is available as
        # UploadedFile.content_type - but since browser mimetypes are
        # unreliable, calculate anyway
        try:
            type = m.from_file(filename)
        except IOError:
            raise Exception('Uploaded file is no longer available for ingest; please try again.')

        type, separator, options = type.partition(';')
        if type not in allowed_upload_types(request.user):
            # store error for display on detailed result page
            file_info.update({'success': False,
                              'message': '''File type '%s' is not allowed''' % type})
            # if not an allowed type, no further processing
            results.append(file_info)
            continue

        if collection is None:
            file_info.update({'success': False,
                              'message': '''Collection not selected'''})
            results.append(file_info)
            continue

        # if there is an MD5 file (i.e., file was uploaded via ajax),
        # use the contents of that file as checksum
        if os.path.exists(filename + '.md5'):
            with open(filename + '.md5') as md5file:
                md5 = md5file.read()
        # otherwise, calculate the MD5 (single-file upload)
        else:
            md5 = md5sum(filename)

        # determine what type of object to initialize based on mimetype
        objtype = None
        for t in uploadable_objects:
            if type in t.allowed_mimetypes:
                objtype = t
                break

        # initialize a new object from the file
        obj = objtype.init_from_file(filename, initial_label=label,
                                     request=request, checksum=md5,
                                     mimetype=type)

        # set collection on ingest
        obj.collection = collection

        try:
            # NOTE: by sending a log message, we force Fedora to store an
            # audit trail entry for object creation, which doesn't happen otherwise
            obj.save(comment)
            file_info.update({'success': True, 'pid': obj.pid,
                              'url': obj.get_absolute_url(),
                              'checksum': md5})
            # if audio, needs an additional step:
            if objtype == AudioObject:
                # start asynchronous task to convert audio for access
                # NOTE: not passing in the user-uploaded file so that celery
                # can more easily be run on a separate server
                queue_access_copy(obj)

            # remove the file now that we have successfully ingested it
            os.remove(filename)
            # NOTE: could remove the MD5 file (if any) here, but MD5 files
            # should be small and will get cleaned up by the cron script

        # special case: detected as duplicate content
        except DuplicateContent as e:
            # mark as failed and generate message with links to records
            links = []
            repo = Repository(request=request)
            for pid in e.pids:
                # use fedora type-inferring logic with list of content models
                # pulled from solr results
                obj = repo.get_object(
                    pid, type=repo.best_subtype_for_object(pid, e.pid_cmodels[pid]))
                # use appropriate object class to get the object url
                links.append('<a href="%s">%s</a>' % (obj.get_absolute_url(), pid))
            msg = mark_safe('%s: %s' % (unicode(e), '; '.join(links)))
            file_info.update({'success': False, 'message': msg})

        except Exception as e:
            logger.error('Error ingesting %s: %s' % (filename, e))
            logger.debug("Error details:\n" + traceback.format_exc())
            file_info['success'] = False
            # check for Fedora-specific errors
            if isinstance(e, RequestFailed):
                if 'Checksum Mismatch' in e.detail:
                    file_info['message'] = 'Ingest failed due to a checksum mismatch - ' + \
                        'file may have been corrupted or incompletely uploaded to Fedora'
                else:
                    file_info['message'] = 'Fedora error: ' + unicode(e)
            # non-fedora error
            else:
                file_info['message'] = 'Ingest failed: ' + unicode(e)

        finally:
            # no matter what happened, store results for reporting to the user
            results.append(file_info)

    return results
def manage_supplements(request, pid):
    '''Manage supplemental file datastreams associated with a
    :class:`~keep.file.models.DiskImage`.'''
    repo = Repository(request=request)
    obj = repo.get_object(pid, type=DiskImage)
    if not obj.exists or not obj.has_requisite_content_models:
        raise Http404

    # generate initial data from any existing supplemental datastreams
    initial_data = []
    for s in obj.supplemental_content:
        initial_data.append({'dsid': s.id, 'label': s.label,
                             'file': DatastreamFile(obj.pid, s.id, s.label)})

    # on GET, just display the form
    if request.method == 'GET':
        formset = SupplementalFileFormSet(initial=initial_data)

    # on POST, process the form and any updates/additions
    if request.method == 'POST':
        formset = SupplementalFileFormSet(request.POST, request.FILES,
                                          initial=initial_data)
        if formset.is_valid():
            m = magic.Magic(mime=True)
            # NOTE: because we currently don't support re-ordering or
            # deletion, simply counting to keep track of datastream ids
            s_id = 0
            modified = 0
            added = 0
            for file_info in formset.cleaned_data:
                # skip empty formset
                if not file_info:
                    continue

                if file_info.get('dsid', None):
                    ds = obj.getDatastreamObject(file_info['dsid'],
                                                 dsobj_type=FileDatastreamObject)
                    # ds = getattr(obj, file_info['dsid'])
                else:
                    added += 1
                    ds = obj.getDatastreamObject('supplement%d' % s_id,
                                                 dsobj_type=FileDatastreamObject)

                # only set label if changed, so datastream isModified is accurate
                if file_info['label'] != ds.label:
                    ds.label = file_info['label']

                # if this is an uploaded file, replace content and calculate
                # mimetype and checksum
                if isinstance(file_info['file'], UploadedFile):
                    filename = file_info['file'].temporary_file_path()
                    mimetype = m.from_file(filename)
                    mimetype, separator, options = mimetype.partition(';')
                    ds.mimetype = mimetype
                    ds.checksum = md5sum(filename)
                    ds.content = file_info['file']

                if ds.exists and ds.isModified():
                    modified += 1

                s_id += 1

            try:
                obj.save('updating supplemental files')
                # summarize the number of changes, if any
                if added or modified:
                    msg_add = 'added %d' % added if added else ''
                    msg_update = 'updated %d' % modified if modified else ''
                    msg = 'Successfully %s%s%s supplemental file%s' % \
                        (msg_add, ' and ' if added and modified else '',
                         msg_update, 's' if (added + modified) != 1 else '')
                    messages.success(request, msg)
                else:
                    # possible for the form to be valid but not make any changes
                    messages.info(request, 'No changes made to supplemental content')

                return HttpResponseSeeOtherRedirect(reverse('file:edit', args=[pid]))

            except Exception as e:
                logger.error('Error on supplemental file update: %s' % e)
                logger.debug("Error details:\n" + traceback.format_exc())
                messages.error(request, unicode(e))
                # for now, just redisplay the form with the error message

    return TemplateResponse(request, 'file/supplemental_content.html',
                            {'obj': obj, 'formset': formset})
def upload(request):
    '''Upload file(s) and create new fedora
    :class:`~keep.audio.models.AudioObject` (s).  Only accepts audio/x-wav
    currently.

    There are two distinct ways to upload a file.  The first case is kicked
    off when "fileManualUpload" exists in the posted form.  If it does, then
    this was not an HTML5 browser, and the file upload occurs as usual for a
    single-file upload.

    In the other approach, the file was already uploaded via an HTML5 ajax
    upload.  In this case, we read various hidden generated form fields that
    indicate what was uploaded from the javascript code.
    '''
    repo = Repository(request=request)

    ctx_dict = {
        # list of allowed file types, in a format suited for passing to javascript
        'js_allowed_types': mark_safe(json.dumps(allowed_upload_types(request.user)))
    }

    if request.method == 'POST':
        content_type = request.META.get('CONTENT_TYPE', 'application/octet-stream')
        media_type, sep, options = content_type.partition(';')
        # content type is technically case-insensitive; lower-case before comparing
        media_type = media_type.strip().lower()

        # if form has been posted, process & ingest files
        if media_type == 'multipart/form-data':
            # check for a single file upload
            form = UploadForm(request.POST, request.FILES)

            # if form is not valid (i.e., no collection specified, no or
            # mismatched files uploaded), bail out and redisplay the form
            # with any error messages
            if not form.is_valid():
                ctx_dict['form'] = form
                return TemplateResponse(request, 'file/upload.html', ctx_dict)

            # form is valid; get collection & check for optional comment
            collection = repo.get_object(pid=form.cleaned_data['collection'],
                                         type=CollectionObject)
            # get user comment if any; default to a generic ingest comment
            comment = form.cleaned_data['comment'] or 'initial repository ingest'
            # get dictionary of file path -> filename, based on form data
            files_to_ingest = form.files_to_ingest()
            # process all files submitted for ingest (single or batch mode)
            if files_to_ingest:
                results = ingest_files(files_to_ingest, collection, comment, request)
                # add per-file ingest result status to template context
                ctx_dict['ingest_results'] = results
                # after processing files, fall through to display the upload template

        else:
            # POST but not form data - handle ajax file upload
            return ajax_upload(request)

    # on GET or non-ajax POST, display the upload form
    ctx_dict['form'] = UploadForm()
    return TemplateResponse(request, 'file/upload.html', ctx_dict)
class EmailMessageTest(KeepTestCase):

    def setUp(self):
        self.repo = Repository()
        self.pids = []
        # test EmailMessage
        self.email = self.repo.get_object(type=EmailMessage)
        self.email.cerp.content.from_list = ['*****@*****.**']
        self.email.cerp.content.to_list = ['*****@*****.**']
        self.email.cerp.content.subject_list = ['Interesting Subject']

    def tearDown(self):
        for pid in self.pids:
            self.repo.purge_object(pid)

    def test_headers(self):
        h1 = cerp.Header()
        h1.name = "HEADER 1"
        h1.value = "value for header 1"
        h2 = cerp.Header()
        h2.name = "HEADER 2"
        h2.value = "value for header 2"
        self.email.cerp.content.headers.append(h1)
        self.email.cerp.content.headers.append(h2)
        self.assertEqual(self.email.headers['HEADER 1'], 'value for header 1')
        self.assertEqual(self.email.headers['HEADER 2'], 'value for header 2')

    def test_email_label(self):
        # no object label and one person in the to field
        label = self.email.email_label()
        self.assertEqual(
            'Email from [email protected] to [email protected] Interesting Subject',
            label, 'Should construct label when it does not exist')

        # more than one person in to list
        self.email.cerp.content.to_list.append('*****@*****.**')
        label = self.email.email_label()
        self.assertEqual(
            'Email from [email protected] to [email protected] et al. Interesting Subject',
            label,
            'only show first to email address when there are more than one')

        # no subject
        self.email.cerp.content.subject_list = []
        self.assertEqual(
            'Email from [email protected] to [email protected] et al.',
            self.email.email_label(),
            'Display message without subject when no subject is present')

        # has a date
        date_header = cerp.Header()
        date_header.name = 'Date'
        date_header.value = 'Friday 13 200 13:00'
        self.email.cerp.content.headers.append(date_header)
        label = self.email.email_label()
        self.assertEqual(
            'Email from [email protected] to [email protected] et al. on Friday 13 200 13:00',
            label,
            'date from headers should be included in label when present')

        # object label already exists
        self.email.label = "label we want to keep"
        label = self.email.email_label()
        self.assertEqual(self.email.label, label,
                         'label should be preserved when it exists')

    def test_index_data(self):
        # NOTE: logic for creating the label is covered by the label test;
        # here, just make sure the label exists in index data
        data = self.email.index_data()
        self.assertIn('label', data.keys())

        # mime_data does not exist, so no content_md5 should be set
        self.assert_('content_md5' not in data,
                     'content_md5 should not be set when mime data does not exist')

        # patch mime_data to test the checksum when mime data exists
        with patch.object(self.email, 'mime_data', Mock()) as mock_mime:
            mock_mime.exists = True
            mock_mime.checksum = 'test checksum value'
            data = self.email.index_data()
            self.assertEqual(self.email.mime_data.checksum, data['content_md5'])

    @patch('keep.arrangement.models.solr_interface', spec=sunburnt.SolrInterface)
    def test_by_checksum(self, mocksolr):
        # no match
        self.assertRaises(ObjectDoesNotExist, EmailMessage.by_checksum, 42)
        solr = mocksolr.return_value
        solr.query.assert_called_with(
            content_md5=42,
            content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL)
        solr.query.return_value.field_limit.assert_called_with('pid')

        # too many matches
        solr.query.return_value.field_limit.return_value = [{'pid': 'pid:1'},
                                                            {'pid': 'pid:2'}]
        self.assertRaises(MultipleObjectsReturned, EmailMessage.by_checksum, 42)

        # one match
        solr.query.return_value.field_limit.return_value = [{'pid': 'pid:1'}]
        em = EmailMessage.by_checksum(42)
        self.assert_(isinstance(em, EmailMessage))

        # custom repo object
        mockrepo = Mock()
        em = EmailMessage.by_checksum(42, mockrepo)
        mockrepo.get_object.assert_called_with('pid:1', type=EmailMessage)

    @patch('keep.arrangement.models.solr_interface', spec=sunburnt.SolrInterface)
    def test_by_message_id(self, mocksolr):
        # no match
        self.assertRaises(ObjectDoesNotExist, EmailMessage.by_message_id,
                          '<*****@*****.**>')
        solr = mocksolr.return_value
        solr.query.assert_called_with(
            arrangement_id='<*****@*****.**>',
            content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL)
        solr.query.return_value.field_limit.assert_called_with('pid')
def edit(request, pid):
    '''Edit the metadata for a single :class:`~keep.file.models.DiskImage`.'''
    # FIXME: should be a generic file (?) or possibly one of several supported files
    repo = Repository(request=request)
    obj = repo.get_object(pid, type=DiskImage)
    try:
        # if this is not actually a disk image, then 404
        # (object is not available at this url)
        if not obj.has_requisite_content_models:
            raise Http404

        if request.method == 'POST':
            # if data has been submitted, initialize form with request data and object mods
            form = DiskImageEditForm(request.POST, instance=obj)
            if form.is_valid():  # includes schema validation
                # update foxml object with data from the form
                form.update_instance()
                if 'comment' in form.cleaned_data and form.cleaned_data['comment']:
                    comment = form.cleaned_data['comment']
                else:
                    comment = "update metadata"

                obj.save(comment)
                messages.success(request,
                                 'Successfully updated <a href="%s">%s</a>' %
                                 (reverse('file:edit', args=[pid]), pid))
                # save & continue functionality - same as collection edit
                if '_save_continue' not in request.POST:
                    return HttpResponseSeeOtherRedirect(reverse('repo-admin:dashboard'))
                # otherwise - fall through to display the edit form again

            # form was posted but not valid
            else:
                # if we attempted to save and failed, add a message since the error
                # may not be obvious or visible in the first screenful of the form
                messages.error(request,
                               'Your changes were not saved due to a validation error. '
                               'Please correct any required or invalid fields indicated '
                               'below and save again.')
        else:
            # GET - display the form for editing, pre-populated with content from the object
            form = DiskImageEditForm(instance=obj)

        class AdminOpts(object):
            app_label = 'file'
            model_name = 'application'

        # options for generating an admin link to edit/add file application db info
        admin_fileapp = AdminOpts()

        return TemplateResponse(request, 'file/edit.html',
                                {'obj': obj, 'form': form,
                                 'admin_fileapp': admin_fileapp})

    except PermissionDenied:
        # Fedora may return a PermissionDenied error when accessing a datastream
        # where the datastream does not exist, the object does not exist, or the
        # user does not have permission to access the datastream

        # check that the object exists - if not, 404
        if not obj.exists:
            raise Http404
        # for now, assuming that if the object exists and has the correct content
        # models, it will have all the datastreams required for this view
        return HttpResponseForbidden('Permission Denied to access %s' % pid,
                                     content_type='text/plain')

    except RequestFailed as rf:
        # if fedora actually returned a 404, propagate it
        if rf.code == 404:
            raise Http404

        msg = 'There was an error contacting the digital repository. ' + \
              'This prevented us from accessing the requested data. If this ' + \
              'problem persists, please alert the repository administrator.'
        return HttpResponse(msg, content_type='text/plain', status=500)
def batch_set_status(pid, status):
    repo = Repository()
    batch = repo.get_object(pid, type=SimpleCollection)
    # keep track of totals for success and failure
    success = 0
    error = 0

    # translate form status codes to fedora state codes
    # TODO: shift this logic to arrangement object for re-use?
    codes = {'Processed': 'A', 'Accessioned': 'I'}
    # target state for every object in the collection
    if status not in codes:
        err_msg = 'Status %s unknown' % status
        logger.error(err_msg)
        raise Exception(err_msg)
    else:
        state = codes[status]

    # find all pids associated with this object
    pids = list(batch.rels_ext.content.objects(batch.uriref, relsextns.hasMember))
    for pid in pids:
        try:
            # pass in api from batch object to retain user credentials
            obj = ArrangementObject(batch.api, pid)
            obj.state = state
            obj.save('Marking as %s via SimpleCollection %s' % (status, batch.pid))
            success += 1
        except Exception as e:
            logger.error('Failed to update %s : %s' % (pid, e))
            error += 1

    info = {
        'success': success,
        'error': error,
        'success_plural': '' if success == 1 else 's',
        'error_plural': '' if error == 1 else 's',
        'status': status
    }
    summary_msg = ("Successfully updated %(success)s item%(success_plural)s; "
                   "error updating %(error)s") % info

    # if not all objects were updated correctly, exit with error
    if error > 0:
        raise Exception(summary_msg)

    # FIXME: this is based on the current form logic, but could leave
    # some member items stranded in a different status than the parent object
    batch.mods.content.create_restrictions_on_access()
    batch.mods.content.restrictions_on_access.text = status

    # change collection status
    try:
        batch.save('Marking as %(status)s; updated %(success)s member item%(success_plural)s'
                   % info)
    except Exception as e:
        save_err = "Error updating SimpleCollection %s - %s" % (batch.pid, e)
        logger.error(save_err)
        raise Exception('%s; %s' % (save_err, summary_msg))

    # success
    return 'Successfully updated %(success)s item%(success_plural)s' % info
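
# Usage sketch for batch_set_status() above.  In the web UI this runs
# asynchronously via queue_batch_status_update (see simple_edit earlier in
# this section); calling it directly is equivalent but blocks, and raises
# if any member object fails to update.
def example_batch_update(pid):
    try:
        print batch_set_status(pid, 'Processed')
    except Exception as err:
        print 'batch update failed:', err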
class TestMigrateRushdie(TestCase):
    MM_FIXTURE = '''<macfs:document xmlns:macfs="info:fedora/emory-control:Rushdie-MacFsData-1.0">
  <macfs:md5>ffcf48e5df673fc7de985e1b859eeeec</macfs:md5>
  <macfs:file>
    <macfs:computer>Performa 5400</macfs:computer>
    <macfs:path>/Hard Disk/MIDNIGHT'S CHILDREN/MISC. MATERIAL/x - the roles</macfs:path>
    <macfs:rawpath>L0hhcmQgRGlzay9NSUROSUdIVCdTIENISUxEUkVOL01JU0MuIE1BVEVSSUFML3ggLSB0aGUgcm9sZXM=</macfs:rawpath>
    <macfs:attributes>avbstclInmedz</macfs:attributes>
    <macfs:created>1997-01-19T19:29:32</macfs:created>
    <macfs:modified>1997-01-19T19:29:32</macfs:modified>
    <macfs:type>TEXT</macfs:type>
    <macfs:creator>ttxt</macfs:creator>
  </macfs:file>
</macfs:document>'''
    MA_FIXTURE = '''<marbl:analysis xmlns:marbl="info:fedora/emory-control:Rushdie-MarblAnalysis-1.0">
  <marbl:series>Writings by Rushdie</marbl:series>
  <marbl:subseries>Fiction</marbl:subseries>
  <marbl:verdict>As is</marbl:verdict>
</marbl:analysis>'''
    SERIES_FIXTURE = {
        'Writings by Rushdie': {
            'series_info': {
                'base_ark': 'http://testpid.library.emory.edu/ark:/25593/80mvk',
                'id': 'rushdie1000_series2',
                'short_id': 'series2',
                'uri': 'https://findingaids.library.emory.edu/documents/rushdie1000/series2'},
            'subseries_info': {
                'Fiction': {
                    'base_ark': 'http://testpid.library.emory.edu/ark:/25593/80mvk',
                    'id': 'rushdie1000_subseries2.1',
                    'short_id': 'subseries2.1',
                    'uri': 'https://findingaids.library.emory.edu/documents/rushdie1000/series2/subseries2.1'}}}}

    def setUp(self):
        self.repo = Repository()
        self.pids = []
        # create a SimpleCollection
        self.sc = self.repo.get_object(type=SimpleCollection)
        self.sc.label = "SimpleCollection For Test"
        self.sc.save()
        self.pids.append(self.sc.pid)
        # create a master CollectionObject
        self.mc = self.repo.get_object(type=CollectionObject)
        self.mc.label = "MasterCollection For Test"
        self.mc.save()
        self.pids.append(self.mc.pid)
        # create a DigitalObject
        self.digObj = self.repo.get_object(type=RushdieArrangementFile)
        self.digObj.label = "Object For Test"
        self.digObj.save()
        self.pids.append(self.digObj.pid)
        self.digObj.api.addDatastream(self.digObj.pid, "MARBL-MACTECH",
                                      "MARBL-MACTECH", mimeType="application/xml",
                                      content=self.MM_FIXTURE)
        self.digObj.api.addDatastream(self.digObj.pid, "MARBL-ANALYSIS",
                                      "MARBL-ANALYSIS", mimeType="application/xml",
                                      content=self.MA_FIXTURE)
        # remove the Arrangement model so it can be added later
        relation = (self.digObj.uriref, modelns.hasModel,
                    "info:fedora/emory-control:Arrangement-1.0")
        self.digObj.rels_ext.content.remove(relation)
        self.digObj.save()

        # set up the command
        self.cmd = migrate_rushdie.Command()
        self.cmd.verbosity = 1
        self.cmd.v_normal = 1
        self.cmd.v_none = 0
        self.cmd.simple_collection = self.sc
        self.cmd.stdout = sys.stdout
        self.cmd.CONTENT_MODELS = CONTENT_MODELS
        self.cmd.repo = self.repo

    def tearDown(self):
        for pid in self.pids:
            self.repo.purge_object(pid)

    def test__add_to_simple_collection(self):
        self.cmd._add_to_simple_collection(self.digObj)
        self.assertTrue(
            (self.sc.uriref, relsextns.hasMember, self.digObj.uriref)
            in self.sc.rels_ext.content,
            "%s should be a member of the SimpleCollection" % self.digObj.pid)

    def test__get_unique_objects(self):
        # duplicate pids are processed only once
        objs = self.cmd._get_unique_objects([self.digObj.pid, self.digObj.pid])
        self.assertEqual(len(objs), 1, "No dup pids should be processed")

    def test__convert_ds(self):
        obj = self.cmd._convert_ds(self.digObj, self.mc, self.SERIES_FIXTURE, False)
        # check all fields are moved over correctly
        # filetech
        self.assertEqual(obj.filetech.content.file[0].md5,
                         "ffcf48e5df673fc7de985e1b859eeeec")
        self.assertEqual(obj.filetech.content.file[0].computer, "Performa 5400")
        self.assertEqual(obj.filetech.content.file[0].path,
                         "/Hard Disk/MIDNIGHT'S CHILDREN/MISC. MATERIAL/x - the roles")
        self.assertEqual(obj.filetech.content.file[0].rawpath,
                         "L0hhcmQgRGlzay9NSUROSUdIVCdTIENISUxEUkVOL01JU0MuIE1BVEVSSUFML3ggLSB0aGUgcm9sZXM=")
        self.assertEqual(obj.filetech.content.file[0].attributes, "avbstclInmedz")
        self.assertEqual(obj.filetech.content.file[0].created, "1997-01-19T19:29:32")
        self.assertEqual(obj.filetech.content.file[0].modified, "1997-01-19T19:29:32")
        self.assertEqual(obj.filetech.content.file[0].type, "TEXT")
        self.assertEqual(obj.filetech.content.file[0].creator, "ttxt")
        # MODS
        self.assertEqual(obj.mods.content.series.title, "Fiction")
        self.assertEqual(obj.mods.content.series.uri,
                         self.SERIES_FIXTURE["Writings by Rushdie"]["subseries_info"]["Fiction"]["uri"])
        self.assertEqual(obj.mods.content.series.base_ark,
                         self.SERIES_FIXTURE["Writings by Rushdie"]["subseries_info"]["Fiction"]["base_ark"])
        self.assertEqual(obj.mods.content.series.full_id,
                         self.SERIES_FIXTURE["Writings by Rushdie"]["subseries_info"]["Fiction"]["id"])
        self.assertEqual(obj.mods.content.series.short_id,
                         self.SERIES_FIXTURE["Writings by Rushdie"]["subseries_info"]["Fiction"]["short_id"])
        self.assertEqual(obj.mods.content.series.series.title, "Writings by Rushdie")
        self.assertEqual(obj.mods.content.series.series.uri,
                         self.SERIES_FIXTURE["Writings by Rushdie"]["series_info"]["uri"])
        self.assertEqual(obj.mods.content.series.series.base_ark,
                         self.SERIES_FIXTURE["Writings by Rushdie"]["series_info"]["base_ark"])
        self.assertEqual(obj.mods.content.series.series.full_id,
                         self.SERIES_FIXTURE["Writings by Rushdie"]["series_info"]["id"])
        self.assertEqual(obj.mods.content.series.series.short_id,
                         self.SERIES_FIXTURE["Writings by Rushdie"]["series_info"]["short_id"])
        # rights
        self.assertEqual(obj.rights.content.access_status.code, "2")
        # RELS-EXT
        self.assertTrue((obj.uriref, relsextns.isMemberOf, self.mc.uriref)
                        in obj.rels_ext.content,
                        "Object should have isMember relation to master collection")
        self.assertTrue((obj.uriref, modelns.hasModel,
                         URIRef("info:fedora/emory-control:ArrangementAccessAllowed-1.0"))
                        in obj.rels_ext.content,
                        "Object should have Allowed Content Model")
        # label, owner, and DC
        self.assertEqual(obj.label, "x - the roles",
                         "Label should be set to last part of path")
        self.assertEqual(obj.owner, "thekeep-project",
                         "owner should be set to 'thekeep-project'")
        self.assertEqual(obj.dc.content.title, "x - the roles",
                         "DC title should be set to last part of path")
        # datastreams
        # have to reload obj from the repository to get the DS update
        obj = self.repo.get_object(pid=obj.pid, type=ArrangementObject)
        self.assertFalse("MARBL-MACTECH" in obj.ds_list,
                         "MARBL-MACTECH should have been removed")
        self.assertFalse("MARBL-ANALYSIS" in obj.ds_list,
                         "MARBL-ANALYSIS should have been removed")

    def test_missing_series_info(self):
        # remove subseries info from the lookup
        series = self.SERIES_FIXTURE.copy()
        del series["Writings by Rushdie"]["subseries_info"]
        obj = self.cmd._convert_ds(self.digObj, self.mc, self.SERIES_FIXTURE, False)
        self.assertEqual(obj.mods.content.series.title, "Fiction")
        self.assertEqual(obj.mods.content.series.series.title, "Writings by Rushdie")
def upload(request):
    '''Upload file(s) and create new fedora
    :class:`~keep.audio.models.AudioObject` (s).  Only accepts audio/x-wav
    currently.

    There are two distinct ways to upload a file.  The first case is
    kicked off when "fileManualUpload" exists in the posted form.  If it
    does, then this was not an HTML5 browser, and the file upload occurs
    as is usual for a single file upload.

    In the other approach, the file was already uploaded via an HTML5 ajax
    upload.  In this case, we read various hidden generated form fields
    that indicate what was uploaded from the javascript code.
    '''
    repo = Repository(request=request)
    ctx_dict = {
        # list of allowed file types, in a format suited for passing to javascript
        'js_allowed_types': mark_safe(json.dumps(allowed_upload_types(request.user)))
    }
    if request.method == 'POST':
        content_type = request.META.get('CONTENT_TYPE', 'application/octet-stream')
        media_type, sep, options = content_type.partition(';')
        # content type is technically case-insensitive; lower-case before comparing
        media_type = media_type.strip().lower()
        # if form has been posted, process & ingest files
        if media_type == 'multipart/form-data':
            # check for a single file upload
            form = UploadForm(request.POST, request.FILES)
            # If form is not valid (i.e., no collection specified, no
            # or mismatched files uploaded), bail out and redisplay
            # form with any error messages.
            if not form.is_valid():
                ctx_dict['form'] = form
                return TemplateResponse(request, 'file/upload.html', ctx_dict)

            # Form is valid. Get collection & check for optional comment
            collection = repo.get_object(pid=form.cleaned_data['collection'],
                                         type=CollectionObject)
            # get user comment if any; default to a generic ingest comment
            comment = form.cleaned_data['comment'] or 'initial repository ingest'
            # get dictionary of file path -> filename, based on form data
            files_to_ingest = form.files_to_ingest()
            # process all files submitted for ingest (single or batch mode)
            if files_to_ingest:
                results = ingest_files(files_to_ingest, collection, comment, request)
                # add per-file ingest result status to template context
                ctx_dict['ingest_results'] = results
                # after processing files, fall through to display upload template
        else:
            # POST but not form data - handle ajax file upload
            return ajax_upload(request)

    # on GET or non-ajax POST, display the upload form
    ctx_dict['form'] = UploadForm()
    return TemplateResponse(request, 'file/upload.html', ctx_dict)
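# Hedged usage sketch (assumptions: the 'file:upload' URL name and the form
# field names below are illustrative, not confirmed by this module). Posting
# multipart form data exercises the single-file branch of the view above;
# per-file status comes back in the template context as 'ingest_results'.
from django.core.urlresolvers import reverse
from django.test import Client

client = Client()
client.login(username='staffuser', password='secret')   # placeholder credentials
with open('example.wav', 'rb') as wav:
    response = client.post(reverse('file:upload'),
                           {'collection': 'emory:collection-1',  # assumed field name
                            'comment': 'test ingest',
                            'file': wav})                        # assumed field name
results = response.context['ingest_results']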
def batch_set_status(pid, status):
    repo = Repository()
    batch = repo.get_object(pid, type=SimpleCollection)
    # keep track of totals for success and failure
    success = 0
    error = 0
    # translate form status codes to fedora state code
    # TODO: shift this logic to arrangement object for re-use ?
    codes = {'Processed': 'A', 'Accessioned': 'I'}
    # target state for every object in the collection
    if status not in codes:
        err_msg = 'Status %s unknown' % status
        logger.error(err_msg)
        raise Exception(err_msg)
    else:
        state = codes[status]

    # find all pids associated with this object
    pids = list(batch.rels_ext.content.objects(batch.uriref, relsextns.hasMember))
    for pid in pids:
        try:
            # pass in api from batch object to retain user credentials
            obj = ArrangementObject(batch.api, pid)
            obj.state = state
            obj.save('Marking as %s via SimpleCollection %s' % (status, batch.pid))
            success += 1
        except Exception as e:
            logger.error('Failed to update %s : %s' % (pid, e))
            error += 1

    info = {
        'success': success,
        'error': error,
        'success_plural': '' if success == 1 else 's',
        'error_plural': '' if error == 1 else 's',
        'status': status
    }
    summary_msg = 'Successfully updated %(success)s item%(success_plural)s; ' \
        'error updating %(error)s item%(error_plural)s' % info

    # if not all objects were updated correctly, exit with error
    if error > 0:
        raise Exception(summary_msg)

    # FIXME: this is based on the current form logic, but could leave
    # some member items stranded in a different status than the parent object
    batch.mods.content.create_restrictions_on_access()
    batch.mods.content.restrictions_on_access.text = status

    # change collection status
    try:
        batch.save('Marking as %(status)s; updated %(success)s member item%(success_plural)s' % info)
    except Exception as e:
        save_err = 'Error updating SimpleCollection %s - %s' % (batch.pid, e)
        logger.error(save_err)
        raise Exception('%s; %s' % (save_err, summary_msg))

    # success
    return 'Successfully updated %(success)s item%(success_plural)s' % info
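# Minimal usage sketch: batch_set_status signals partial failure by raising,
# so callers (e.g. a task wrapper) capture the summary message from the
# exception. The pid is a placeholder.
try:
    summary = batch_set_status('emory:batch-1', 'Processed')
except Exception as err:
    summary = str(err)   # e.g. 'Successfully updated 3 items; error updating 1 item'
print(summary)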
def index_data(self):
    '''Extend the default
    :meth:`eulfedora.models.DigitalObject.index_data` method to include
    additional fields specific to Keep Arrangement objects.  Includes
    collection and archive information, along with arrangement id and
    access status.'''
    # NOTE: we don't want to rely on other objects being indexed in Solr,
    # so index data should not use Solr to find any related object info

    repo = Repository()   # FIXME: use relation from current object instead

    # FIXME: is it worth splitting out descriptive index data here?
    data = super(ArrangementObject, self).index_data()

    if self.has_model(boda.EmailMessage.EMAIL_MESSAGE_CMODEL) or \
            self.has_model(boda.Mailbox.MAILBOX_CONTENT_MODEL):
        data['object_type'] = 'email'
    # elif self.has_model(boda.RushdieFile.RUSHDIE_FILE_CMODEL):
    #     data['object_type'] = 'file'
    else:
        # generic fallback
        data['object_type'] = 'born-digital'

    # collection info
    if self._deprecated_collection:
        collection = self._deprecated_collection
    elif self.collection:
        collection = self.collection
    else:
        collection = None

    if collection and collection.exists:
        # collection_source_id is allowed to be 0
        if collection.mods.content.source_id is not None:
            data['collection_source_id'] = collection.mods.content.source_id
        data['collection_id'] = collection.pid
        try:
            # pull parent & archive collection objects directly from fedora
            data['collection_label'] = collection.label
            # the parent collection of the collection this item belongs
            # to is its archive
            # FIXME: this shouldn't be indexed here; are we actually
            # using it anywhere?
            # if collection.collection:
            #     data['archive_id'] = collection.collection.uri
            #     data['archive_label'] = collection.collection.label
        except RequestFailed as rf:
            logger.error('Error accessing collection or archive object in Fedora: %s' % rf)

    # arrangement unique id
    try:
        if self.filetech.content.file:
            if self.filetech.content.file[0].local_id:
                data['arrangement_id'] = self.filetech.content.file[0].local_id
            if self.filetech.content.file[0].md5:
                data['content_md5'] = self.filetech.content.file[0].md5
    except Exception as e:
        logger.error('Error getting arrangement id or content MD5 for %s: %s'
                     % (self.pid, e))

    # rights access status code
    if self.rights.content.access_status:
        data['access_code'] = self.rights.content.access_status.code
        # normally this should be picked up via dc:rights, but arrangement
        # objects don't seem to have DC fields populated
        # NOTE: migrated items don't seem to have rights text set
        if self.rights.content.access_status.text:
            data['rights'] = self.rights.content.access_status.text

    # get simple collections that have an association with this object
    # initialize here so a risearch failure doesn't leave these undefined
    sc_ids = []
    sc_labels = []
    try:
        simple_collections = repo.risearch.get_subjects(relsext.hasMember, self.uriref)
        simple_collections = list(simple_collections)
        for sc in simple_collections:
            obj = repo.get_object(pid=sc, type=repo.infer_object_subtype)
            if isinstance(obj, SimpleCollection):
                sc_ids.append('info:fedora/%s' % obj.pid)
                sc_labels.append(obj.label)
    except RequestFailed as rf:
        logger.error('Error accessing simpleCollection in Fedora: %s' % rf)

    if sc_ids:
        data['simpleCollection_id'] = sc_ids
    if sc_labels:
        data['simpleCollection_label'] = sc_labels

    return data
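# Illustrative only: inspecting the Solr payload produced by index_data for a
# single object; field names follow the method above, the pid is a placeholder.
repo = Repository()
obj = repo.get_object('emory:1234', type=ArrangementObject)
data = obj.index_data()
print(data['object_type'])                     # 'email' or 'born-digital'
print(data.get('collection_source_id'))        # only present when the collection resolves
print(data.get('simpleCollection_label', []))  # labels of owning simple collections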
def download_video(request, pid, type, extension=None):
    '''Serve out a video datastream for the fedora object specified by pid.
    Can be used to download the original file or the access copy.

    :param pid: pid of the :class:`~keep.video.models.Video` instance
        from which the video datastream should be returned
    :param type: which video datastream to return - should be one of
        'original' or 'access'
    :param extension: optional filename extension for access copy to
        distinguish between different types of access copies

    The :class:`django.http.HttpResponse` returned will have a
    Content-Disposition set to prompt the user to download the file with a
    filename based on the object noid and an appropriate file extension for
    the type of video requested.
    '''
    repo = Repository(request=request)
    # retrieve the object so we can use it to set the download filename
    obj = repo.get_object(pid, type=Video)
    # user needs either *play* or *download* permissions
    # - could be any video or researcher-accessible only, which additionally
    #   requires checking that the object is researcher-accessible
    # for now, use presence of 'HTTP_RANGE' in request to differentiate
    # jplayer requests from straight downloads
    # NOTE: this would not be too difficult for a savvy user to circumvent
    # (if they know what we are checking), but is intended mainly to prevent
    # unwanted access by staff and researchers in the reading room

    # if http range is present in request, check for play permissions
    # (also requires that request is for access copy, not original)
    if 'HTTP_RANGE' in request.META:
        # both permission checks require the access copy, so the type
        # check applies to the researcher branch as well
        playable = type == 'access' and \
            (request.user.has_perm('video.play_video') or
             (request.user.has_perm('video.play_researcher_video') and
              bool(obj.researcher_access)))
        if not playable:
            return prompt_login_or_403(request)

    # otherwise, check for download permissions
    else:
        # user either needs download video permissions OR
        # can download researcher video and object is researcher-accessible
        downloadable = request.user.has_perm('video.download_video') or \
            (request.user.has_perm('video.download_researcher_video') and
             bool(obj.researcher_access))
        if not downloadable:
            return prompt_login_or_403(request)

    # determine which datastream is requested & set datastream id & file extension
    if type == 'original':
        dsid = Video.content.id
        # set file extension based on the datastream content type,
        # with a fallback for generic binary (should not happen in production)
        file_ext = Video.allowed_master_mimetypes.get(obj.content.mimetype, 'bin')
    elif type == 'access':
        dsid = Video.access_copy.id
        # set file extension based on the datastream content
        file_ext = Video.allowed_access_mimetypes[obj.access_copy.mimetype]
    else:
        # any other type is not supported
        raise Http404

    extra_headers = {
        'Content-Disposition': 'attachment; filename="%s.%s"' % (obj.noid, file_ext)
    }
    # use generic raw datastream view from eulfedora
    return raw_datastream(request, pid, dsid, repo=repo, headers=extra_headers)
    # errors accessing Fedora will fall through to default 500 error handling
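# Hedged sketch of how the Range-header heuristic above plays out; the
# 'video:download' URL name is an assumption for illustration. A ranged
# request is treated as a player request (play permissions), a plain GET
# as a download request (download permissions).
from django.core.urlresolvers import reverse
from django.test import Client

client = Client()
client.login(username='researcher', password='secret')   # placeholder credentials
url = reverse('video:download', args=['emory:1234', 'access'])
streamed = client.get(url, HTTP_RANGE='bytes=0-')   # checked against play perms
downloaded = client.get(url)                        # checked against download perms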
def edit(request, pid):
    '''Edit the metadata for a single :class:`~keep.file.models.DiskImage`.'''
    # FIXME: should be generic file (?) or possibly one of several supported files
    repo = Repository(request=request)
    obj = repo.get_object(pid, type=DiskImage)
    try:
        # if this is not actually a disk image, then 404 (object is not available at this url)
        if not obj.has_requisite_content_models:
            raise Http404

        if request.method == 'POST':
            # if data has been submitted, initialize form with request data and object mods
            form = DiskImageEditForm(request.POST, instance=obj)
            if form.is_valid():     # includes schema validation
                # update foxml object with data from the form
                form.update_instance()
                if 'comment' in form.cleaned_data and form.cleaned_data['comment']:
                    comment = form.cleaned_data['comment']
                else:
                    comment = 'update metadata'
                obj.save(comment)
                messages.success(request, 'Successfully updated <a href="%s">%s</a>' %
                                 (reverse('file:edit', args=[pid]), pid))
                # save & continue functionality - same as collection edit
                if '_save_continue' not in request.POST:
                    return HttpResponseSeeOtherRedirect(reverse('repo-admin:dashboard'))
                # otherwise - fall through to display edit form again

            # form was posted but not valid
            else:
                # if we attempted to save and failed, add a message since the error
                # may not be obvious or visible in the first screenful of the form
                messages.error(request,
                               '''Your changes were not saved due to a validation error.
                               Please correct any required or invalid fields indicated
                               below and save again.''')
        else:
            # GET - display the form for editing, pre-populated with content from the object
            form = DiskImageEditForm(instance=obj)

        class AdminOpts(object):
            app_label = 'file'
            model_name = 'application'

        # options for generating admin link to edit/add file application db info
        admin_fileapp = AdminOpts()

        return TemplateResponse(request, 'file/edit.html',
                                {'obj': obj, 'form': form, 'admin_fileapp': admin_fileapp})

    except PermissionDenied:
        # Fedora may return a PermissionDenied error when accessing a datastream
        # where the datastream does not exist, object does not exist, or user
        # does not have permission to access the datastream

        # check that the object exists - if not, 404
        if not obj.exists:
            raise Http404
        # for now, assuming that if object exists and has correct content models,
        # it will have all the datastreams required for this view
        return HttpResponseForbidden('Permission Denied to access %s' % pid,
                                     content_type='text/plain')

    except RequestFailed as rf:
        # if fedora actually returned a 404, propagate it
        if rf.code == 404:
            raise Http404
        msg = 'There was an error contacting the digital repository. ' + \
              'This prevented us from accessing disk image data. If this ' + \
              'problem persists, please alert the repository ' + \
              'administrator.'
        return HttpResponse(msg, content_type='text/plain', status=500)
def largefile_ingest(request):
    '''Large-file ingest.  On GET, displays a form allowing user to select
    a BagIt that has been uploaded to the configured large-file ingest
    staging area for ingest and association with a collection.
    '''
    # ingest content from upload staging area
    context = {}
    template_name = 'file/largefile_ingest.html'
    form = None
    # on POST, process the form and ingest if valid
    if request.method == 'POST':
        form = LargeFileIngestForm(request.POST)
        # if form is not valid, add to context for redisplay with errors
        if not form.is_valid():
            context['form'] = form
        # otherwise, process the form
        else:
            repo = Repository(request=request)
            # get collection & check for optional comment
            collection = repo.get_object(pid=form.cleaned_data['collection'],
                                         type=CollectionObject)
            # get user comment if any; default to a generic ingest comment
            comment = form.cleaned_data['comment'] or 'initial repository ingest'
            bag = form.cleaned_data['bag']
            # create dict with file info to add success/failure info
            file_info = {'label': os.path.basename(bag)}
            # determine type of ingest from the staging subdirectory
            type = bag.split('/')[-2]
            try:
                if type == 'diskimage':
                    obj = DiskImage.init_from_bagit(bag, request)
                elif type == 'video':
                    obj = Video.init_from_bagit(bag, request)
                # set collection on ingest
                obj.collection = collection

                ## NOTE: Due to a bug in Fedora 3.4 with checksums and
                ## file uri ingest, the content datastream checksum
                ## must be cleared before ingest; manually check it
                ## after ingest to confirm Fedora calculated what we expect.
                ## This work-around can be removed once we upgrade to Fedora 3.6

                # store datastream checksum that would be sent to fedora
                checksum = obj.content.checksum
                obj._content_checksum = checksum
                # clear it out so Fedora can ingest without erroring
                obj.content.checksum = None

                # file URIs also used for supplemental files; needs
                # to be handled the same way as content datastream
                # - look for any supplementN datastreams, store checksum, and remove
                supplemental_checksums = {}
                for i in range(20):
                    try:
                        dsid = 'supplement%d' % i
                        dsobj = getattr(obj, dsid)
                        supplemental_checksums[dsid] = dsobj.checksum
                        dsobj.checksum = None
                    except AttributeError:
                        # stop iterating - we have found the last supplemental file
                        break

                # same for access copy checksum on Video files
                if type == 'video':
                    access_checksum = obj.access_copy.checksum
                    obj.access_copy.checksum = None

                obj.save(comment)
                # remove the ingested bag from large-file staging area
                shutil.rmtree(bag)

                # re-init to allow checking fedora-calculated checksums on
                # supplemental datastreams
                if type == 'diskimage':
                    obj = repo.get_object(obj.pid, type=DiskImage)
                elif type == 'video':
                    obj = repo.get_object(obj.pid, type=Video)

                # if save succeeded (no exceptions), set summary info for display
                file_info.update({'type': type, 'success': True,
                                  'pid': obj.pid, 'url': obj.get_absolute_url(),
                                  'checksum': obj.content.checksum})
                if type == 'video':
                    file_info['access_checksum'] = obj.access_copy.checksum

                # compare checksum generated by Fedora
                # (required because of file uri bug in fedora 3.4;
                # this can be removed once we upgrade to fedora 3.6+)
                checksum_errors = []
                if obj.content.checksum != checksum:
                    checksum_errors.append('content')
                for dsid, checksum in supplemental_checksums.iteritems():
                    dsobj = obj.getDatastreamObject(dsid)
                    if dsobj.checksum != checksum:
                        checksum_errors.append(dsid)
                if type == 'video' and obj.access_copy.checksum != access_checksum:
                    checksum_errors.append('access_copy')
                if checksum_errors:
                    message = 'Checksum mismatch%s detected on %s datastream%s; ' + \
                        'please contact a repository administrator.'
                    file_info['message'] = message % (
                        'es' if len(checksum_errors) > 1 else '',
                        ', '.join(checksum_errors),
                        's' if len(checksum_errors) > 1 else '')

            except bagit.BagValidationError as err:
                logger.error(err)
                file_info.update({'success': False,
                                  'message': 'BagIt error: %s' % err})
            # special case: detected as duplicate content
            except DuplicateContent as e:
                # mark as failed and generate message with links to records
                # NOTE: pid url is duplicated logic from web upload view...
                links = []
                for pid in e.pids:
                    # use fedora type-inferring logic with list of content models
                    # pulled from solr results
                    obj = repo.get_object(pid,
                                          type=repo.best_subtype_for_object(pid, e.pid_cmodels[pid]))
                    # use appropriate object class to get the object url
                    links.append('<a href="%s">%s</a>' % (obj.get_absolute_url(), pid))
                msg = mark_safe('%s: %s' % (unicode(e), '; '.join(links)))
                file_info.update({'success': False, 'message': msg})
            except Exception as err:
                logger.error('Error: %s' % err)
                file_info.update({'success': False, 'message': '%s' % err})

            # report success/failure in the same format as web-upload ingest
            context['ingest_results'] = [file_info]

    # on GET display form to select item(s) for ingest
    # OR on completed valid form post
    files = large_file_uploads()
    if request.method == 'GET' or form is not None and form.is_valid():
        if len(files):
            context['form'] = LargeFileIngestForm()
        else:
            # indicator that no files are available for ingest
            context['no_files'] = True

    return TemplateResponse(request, template_name, context)
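# Distilled form of the Fedora 3.4 file-uri work-around used above, as a
# hedged sketch: remember the checksum, clear it so ingest succeeds, then
# re-fetch the object and compare what Fedora calculated.
expected = obj.content.checksum
obj.content.checksum = None        # Fedora 3.4 errors on file-uri ingest otherwise
obj.save('initial repository ingest')
obj = repo.get_object(obj.pid, type=type(obj))   # re-init to pick up DS updates
if obj.content.checksum != expected:
    raise Exception('checksum mismatch on %s' % obj.pid)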
def manage_supplements(request, pid):
    '''Manage supplemental file datastreams associated with a
    :class:`~keep.file.models.DiskImage`.'''
    repo = Repository(request=request)
    obj = repo.get_object(pid, type=DiskImage)
    if not obj.exists or not obj.has_requisite_content_models:
        raise Http404

    # generate initial data from any existing supplemental datastreams
    initial_data = []
    for s in obj.supplemental_content:
        initial_data.append({'dsid': s.id, 'label': s.label,
                             'file': DatastreamFile(obj.pid, s.id, s.label)})

    # on GET, just display the form
    if request.method == 'GET':
        formset = SupplementalFileFormSet(initial=initial_data)

    # on POST, process the form and any updates/additions
    if request.method == 'POST':
        formset = SupplementalFileFormSet(request.POST, request.FILES,
                                          initial=initial_data)
        if formset.is_valid():
            m = magic.Magic(mime=True)
            # NOTE: because we currently don't support re-ordering
            # or deletion, simply counting to keep track of datastream ids
            s_id = 0
            modified = 0
            added = 0
            for file_info in formset.cleaned_data:
                # skip empty formset
                if not file_info:
                    continue

                if file_info.get('dsid', None):
                    ds = obj.getDatastreamObject(file_info['dsid'],
                                                 dsobj_type=FileDatastreamObject)
                    # ds = getattr(obj, file_info['dsid'])
                else:
                    added += 1
                    ds = obj.getDatastreamObject('supplement%d' % s_id,
                                                 dsobj_type=FileDatastreamObject)

                # only set if changed so datastream isModified is accurate
                if file_info['label'] != ds.label:
                    ds.label = file_info['label']

                # if this is an uploaded file, replace content and calculate mimetype, checksum
                if isinstance(file_info['file'], UploadedFile):
                    filename = file_info['file'].temporary_file_path()
                    mimetype = m.from_file(filename)
                    mimetype, separator, options = mimetype.partition(';')
                    ds.mimetype = mimetype
                    ds.checksum = md5sum(filename)
                    ds.content = file_info['file']

                if ds.exists and ds.isModified():
                    modified += 1
                s_id += 1

            try:
                obj.save('updating supplemental files')
                # summarize number of changes, if any
                if added or modified:
                    msg_add = 'added %d' % added if added else ''
                    msg_update = 'updated %d' % modified if modified else ''
                    msg = 'Successfully %s%s%s supplemental file%s' % \
                        (msg_add, ' and ' if added and modified else '', msg_update,
                         's' if (added + modified) != 1 else '')
                    messages.success(request, msg)
                else:
                    # possible for the form to be valid but not make any changes
                    messages.info(request, 'No changes made to supplemental content')
                return HttpResponseSeeOtherRedirect(reverse('file:edit', args=[pid]))
            except Exception as e:
                logger.error('Error on supplemental file update: %s' % e)
                logger.debug('Error details:\n' + traceback.format_exc())
                messages.error(request, unicode(e))
                # for now, just redisplay the form with error message

    return TemplateResponse(request, 'file/supplemental_content.html',
                            {'obj': obj, 'formset': formset})
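# Standalone sketch of the mimetype/checksum handling the view relies on:
# python-magic's Magic(mime=True).from_file plus the local md5sum helper;
# the file path is a placeholder.
import magic

m = magic.Magic(mime=True)
mimetype = m.from_file('/tmp/example.pdf')        # e.g. 'application/pdf; charset=binary'
mimetype, sep, options = mimetype.partition(';')  # strip any charset options
checksum = md5sum('/tmp/example.pdf')             # helper used throughout keep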
class ArrangementObjectTest(KeepTestCase):

    def setUp(self):
        self.repo = Repository()
        self.pids = []
        # create test collection
        coll = self.repo.get_object(type=CollectionObject)
        coll.pid = '%s:parent-1' % settings.FEDORA_PIDSPACE
        coll.mods.content.source_id = '12345'
        coll.save()
        self.pids.append(coll.pid)
        # create test arrangement object
        self.arr = self.repo.get_object(type=ArrangementObject)
        self.arr.pid = 'foo:1'
        self.arr.collection = coll

    def tearDown(self):
        for pid in self.pids:
            self.repo.purge_object(pid)

    @patch('keep.arrangement.models.solr_interface', spec=sunburnt.SolrInterface)
    def test_by_arrangement_id(self, mocksolr):
        # no match
        self.assertRaises(ObjectDoesNotExist,
                          ArrangementObject.by_arrangement_id, 42)
        solr = mocksolr.return_value
        solr.query.assert_called_with(
            arrangement_id=42,
            content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL)
        solr.query.return_value.field_limit.assert_called_with('pid')

        # too many matches
        solr.query.return_value.field_limit.return_value = [{'pid': 'pid:1'},
                                                            {'pid': 'pid:2'}]
        self.assertRaises(MultipleObjectsReturned,
                          ArrangementObject.by_arrangement_id, 42)

        # one match
        solr.query.return_value.field_limit.return_value = [{'pid': 'pid:1'}]
        ao = ArrangementObject.by_arrangement_id(42)
        self.assert_(isinstance(ao, ArrangementObject))

        # custom repo object
        mockrepo = Mock()
        ao = ArrangementObject.by_arrangement_id(42, mockrepo)
        mockrepo.get_object.assert_called_with('pid:1', type=ArrangementObject)

    def test_arrangement_status(self):
        obj = ArrangementObject(Mock())
        obj.arrangement_status = 'processed'
        self.assertEqual('A', obj.state)
        self.assertEqual('processed', obj.arrangement_status)
        obj.arrangement_status = 'accessioned'
        self.assertEqual('I', obj.state)
        self.assertEqual('accessioned', obj.arrangement_status)

        value_error = None
        try:
            obj.arrangement_status = 'bogus'
        except ValueError:
            value_error = True
        self.assertTrue(value_error,
                        'attempting to assign an unknown status should raise a ValueError')

    def test_update_access_cmodel(self):
        obj = ArrangementObject(Mock())
        # no status set - should be set to restricted
        obj._update_access_cmodel()
        self.assert_((obj.uriref, modelns.hasModel, URIRef(ACCESS_RESTRICTED_CMODEL))
                     in obj.rels_ext.content)
        self.assert_((obj.uriref, modelns.hasModel, URIRef(ACCESS_ALLOWED_CMODEL))
                     not in obj.rels_ext.content)

        # set to status code 2 = access allowed
        obj.rights.content.create_access_status()
        obj.rights.content.access_status.code = '2'
        obj._update_access_cmodel()
        self.assert_((obj.uriref, modelns.hasModel, URIRef(ACCESS_RESTRICTED_CMODEL))
                     not in obj.rels_ext.content)
        self.assert_((obj.uriref, modelns.hasModel, URIRef(ACCESS_ALLOWED_CMODEL))
                     in obj.rels_ext.content)

    def test_index_data(self):
        idx_data = self.arr.index_data()
        self.assertEqual('born-digital', idx_data['object_type'])
        self.assertEqual(self.arr.pid, idx_data['pid'])
        self.assertIn(self.arr.owner, idx_data['owner'])
        self.assertEquals(self.arr.collection.pid, idx_data['collection_id'])
        self.assertEquals(self.arr.collection.mods.content.source_id,
                          idx_data['collection_source_id'])

    # Test the update_ark_label method in keep.common.fedora.
    # Note that this test is a simplified version of
    # keep.common.fedora:ArkPidDigitalObject.test_update_ark_label.
    # The update_ark_label here is an overridden method that is more
    # specific, and is used on Arrangement objects.
    @patch('keep.arrangement.models.pidman')  # mock the pidman client (the API service)
    def test_update_ark_label(self, mockpidman):
        # create an ArrangementObject
        arrangement_object = ArrangementObject(Mock())
        # set a pid on the object so that it can internally generate a noid etc.
        arrangement_object.pid = 'test:1234'

        # Simulate when the object doesn't exist (or hasn't been saved);
        # by default it appears as if it doesn't exist.
        # update_ark_label should not be called on pidman, and there
        # shouldn't be any errors; use the mock to check whether the
        # method was called.
        arrangement_object.update_ark_label()
        self.assertFalse(mockpidman.get_ark.called)

        # mock when the object exists (returns True)
        # Note: need to set the Mock on the class and not the object
        # because exists is a property method
        with patch.object(ArrangementObject, 'exists', new=Mock(return_value=True)):
            arrangement_object.update_ark_label()
            self.assertFalse(mockpidman.get_ark.called)

        # set the label before the object exists so we don't trigger API calls
        arrangement_object.dc.content.title = 'testpid'
        with patch.object(ArrangementObject, 'exists', new=Mock(return_value=True)):
            mockpidman.get_ark.return_value = {'name': arrangement_object.dc.content.title}
            arrangement_object.update_ark_label()
            # assert that it is called with a noid too
            mockpidman.get_ark.assert_called_with(arrangement_object.noid)
            self.assertFalse(mockpidman.update_ark.called)

            # when the label is different from that in pidman
            mockpidman.get_ark.return_value = {'name': 'another pid'}
            arrangement_object.update_ark_label()
            mockpidman.get_ark.assert_called_with(arrangement_object.noid)
            mockpidman.update_ark.assert_called_with(
                noid=arrangement_object.noid,
                name=arrangement_object.dc.content.title)

    def test_set_premis_object(self):
        mockapi = Mock()
        arrangement_object = ArrangementObject(mockapi)
        arrangement_object.pid = 'test:1234'
        arrangement_object.mods.content.ark = 'ark:/1234/987'
        # return empty iterator for original data to checksum
        mockapi.getDatastreamDissemination.return_value = []
        with patch.object(arrangement_object, 'getDatastreamObject') as mockgetds:
            mockgetds.return_value.checksum = '123456789'
            mockgetds.return_value.mimetype = 'text/plain'
            arrangement_object.set_premis_object()

        self.assert_(arrangement_object.provenance.content.object)
        premis = arrangement_object.provenance.content
        # FIXME: placeholder tests for placeholder functionality,
        # should be updated to use ARK uri once that is implemented
        self.assertEqual('ark', premis.object.id_type)
        self.assertEqual(arrangement_object.mods.content.ark, premis.object.id)
        self.assertEqual('p:file', premis.object.type)
        self.assertEqual(0, premis.object.composition_level)
        self.assertEqual('MD5', premis.object.checksums[0].algorithm)
        self.assertEqual('123456789', premis.object.checksums[0].digest)
        # sha1 for an empty file
        empty_sha1 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709'
        self.assertEqual('SHA-1', premis.object.checksums[1].algorithm)
        self.assertEqual(empty_sha1, premis.object.checksums[1].digest)
        # object format should be original mimetype
        self.assertEqual('text/plain', premis.object.format.name)
        # generated premis should be valid
        self.assertTrue(premis.is_valid())

    def test_identifier_change_event(self):
        mockapi = Mock()
        mockapi.username = 'fedoraAdmin'
        arrangement_object = ArrangementObject(mockapi)
        arrangement_object.pid = 'test:1234'
        arrangement_object.mods.content.ark = 'ark:/1234/987'
        # set object premis so we can validate
        mockapi.getDatastreamDissemination.return_value = []
        with patch.object(arrangement_object, 'getDatastreamObject') as mockgetds:
            mockgetds.return_value.checksum = '123456789'
            mockgetds.return_value.mimetype = 'text/plain'
            arrangement_object.set_premis_object()

        arrangement_object.identifier_change_event('old-pid:1')
        premis = arrangement_object.provenance.content
        self.assertEqual(1, len(premis.events))
        event = premis.events[0]
        self.assertEqual('UUID', event.id_type)
        # id should be set; we don't care what it is exactly
        self.assert_(event.id)
        self.assertEqual('identifier assignment', event.type)
        self.assertEqual('program="keep"; version="%s"' % __version__, event.detail)
        self.assertEqual('Pass', event.outcome)
        msg = 'Persistent identifier reassigned from %s to %s' % \
            ('old-pid:1', arrangement_object.pid)
        self.assertEqual(msg, event.outcome_detail)
        self.assertEqual('fedora user', event.agent_type)
        self.assertEqual('fedoraAdmin', event.agent_id)
        # generated premis should be valid
        self.assertTrue(premis.is_valid())
def download_audio(request, pid, type, extension=None):
    '''Serve out an audio datastream for the fedora object specified by pid.
    Can be used to download the original (WAV) audio file or the access
    copy (MP3).

    :param pid: pid of the :class:`~keep.audio.models.AudioObject` instance
        from which the audio datastream should be returned
    :param type: which audio datastream to return - should be one of
        'original' or 'access'
    :param extension: optional filename extension for access copy to
        distinguish between different types of access copies (currently MP3 or M4A)

    The :class:`django.http.HttpResponse` returned will have a
    Content-Disposition set to prompt the user to download the file with a
    filename based on the object noid and an appropriate file extension for
    the type of audio requested.
    '''
    repo = Repository(request=request)
    # retrieve the object so we can use it to set the download filename
    obj = repo.get_object(pid, type=AudioObject)
    # user needs either *play* or *download* permissions
    # - could be any audio or researcher-accessible only, which additionally
    #   requires checking that the object is researcher-accessible
    # for now, use presence of 'HTTP_RANGE' in request to differentiate
    # jplayer requests from straight downloads
    # NOTE: this would not be too difficult for a savvy user to circumvent
    # (if they know what we are checking), but is intended mainly to prevent
    # unwanted access by staff and researchers in the reading room

    # if http range is present in request, check for play permissions
    # (also requires that request is for access copy, not original)
    if 'HTTP_RANGE' in request.META:
        if not (request.user.has_perm('audio.play_audio') and type == 'access') and \
                not (request.user.has_perm('audio.play_researcher_audio') and
                     bool(obj.researcher_access) and type == 'access'):
            return prompt_login_or_403(request)

    # otherwise, check for download permissions
    else:
        # user either needs download audio permissions OR
        # can download researcher audio and object is researcher-accessible
        if not request.user.has_perm('audio.download_audio') and \
                not (request.user.has_perm('audio.download_researcher_audio') and
                     bool(obj.researcher_access)):
            return prompt_login_or_403(request)

    # determine which datastream is requested & set datastream id & file extension
    if type == 'original':
        dsid = AudioObject.audio.id
        file_ext = 'wav'
    elif type == 'access':
        dsid = AudioObject.compressed_audio.id
        # make sure the requested file extension matches the datastream
        if (obj.compressed_audio.mimetype == 'audio/mp4' and extension != 'm4a') or \
                (obj.compressed_audio.mimetype == 'audio/mpeg' and extension != 'mp3'):
            raise Http404
        file_ext = extension
    else:
        # any other type is not supported
        raise Http404

    extra_headers = {
        'Content-Disposition': 'attachment; filename="%s.%s"' % (obj.noid, file_ext)
    }
    # use generic raw datastream view from eulfedora
    return raw_datastream(request, pid, dsid, repo=repo, headers=extra_headers)
class Command(BaseCommand):
    '''Read a CSV file and create (or add to) a SimpleCollection,
    associating the ArrangementObjects with the SimpleCollection and
    the master collection.'''

    def get_password_option(option, opt, value, parser):
        setattr(parser.values, option.dest, getpass())

    # set up additional options
    option_list = BaseCommand.option_list + (
        make_option('--noact', '-n',
                    action='store_true',
                    dest='no-act',
                    default=False,
                    help='Does not create PIDs or ingest anything into Fedora. '
                         'Only parses file and outputs results'),
        make_option('--add', '-a',
                    action='store',
                    dest='add',
                    help='adds to the SimpleCollection specified by pid, '
                         'does not create a new SimpleCollection'),
        make_option('--username', '-u',
                    dest='username',
                    action='store',
                    help='Username to connect to fedora'),
        make_option('--password',
                    dest='password',
                    action='callback', callback=get_password_option,
                    help='Prompt for password required when username used'),
    )

    args = '<CSV file> <master collection pid> <new simple collection name>'
    help = __doc__

    def _create_series_lookup(self):
        # series / subseries info
        self.series = {}
        # exist query params
        return_fields = ['eadid']
        search_fields = {'eadid': 'rushdie1000'}
        queryset = Series.objects.also(*return_fields).filter(**search_fields)
        for s in queryset:
            # series info
            self.series[s.title] = {}
            self.series[s.title]['series_info'] = {}
            self.series[s.title]['series_info']['id'] = s.id
            self.series[s.title]['series_info']['short_id'] = s.short_id
            self.series[s.title]['series_info']['base_ark'] = s.eadid.url
            self.series[s.title]['series_info']['uri'] = \
                'https://findingaids.library.emory.edu/documents/%s/%s' % \
                (s.eadid.value, s.short_id)
            # subseries info
            if s.subseries:
                self.series[s.title]['subseries_info'] = {}
                for sub in s.subseries:
                    self.series[s.title]['subseries_info'][sub.title] = {}
                    self.series[s.title]['subseries_info'][sub.title]['id'] = sub.id
                    self.series[s.title]['subseries_info'][sub.title]['short_id'] = sub.short_id
                    self.series[s.title]['subseries_info'][sub.title]['base_ark'] = s.eadid.url
                    self.series[s.title]['subseries_info'][sub.title]['uri'] = \
                        'https://findingaids.library.emory.edu/documents/%s/%s/%s' % \
                        (s.eadid.value, s.short_id, sub.short_id)

    def _create_arrangement(self, row):
        # account for unicode characters: preserve raw bytes for rawpath,
        # but decode to unicode for the other mappings
        rawpath = base64.encodestring(row['filename'])
        path = unicode(row['filename'], 'utf8')
        creator = unicode(row['creator'], 'utf8')

        # set values in filetech DS
        obj = self.repo.get_object(type=ArrangementObject)
        obj.label = path.rpartition('/')[2]
        obj.filetech.content.file.append(FileMasterTech_Base())
        obj.filetech.content.file[0].local_id = row['id']
        obj.filetech.content.file[0].md5 = row['checksum']
        obj.filetech.content.file[0].computer = row['computer']
        obj.filetech.content.file[0].path = path
        obj.filetech.content.file[0].rawpath = rawpath
        obj.filetech.content.file[0].attributes = row['attrib']
        obj.filetech.content.file[0].created = row['created']
        obj.filetech.content.file[0].modified = row['modified']
        obj.filetech.content.file[0].creator = creator

        # map DC title
        obj.dc.content.title = path.rpartition('/')[2]

        # map default verdict of 10 "Undetermined" in rights DS
        obj.rights.content.create_access_status()
        obj.rights.content.access_status.code = '10'

        # map series in MODS; RecordType used to look up series info
        rec_type = row['rec_type'].strip()
        if rec_type not in self.series:
            rec_type = None
        if rec_type is not None:
            obj.mods.content.create_series()
            obj.mods.content.series.title = rec_type
            obj.mods.content.series.uri = self.series[rec_type]['series_info']['uri']
            obj.mods.content.series.base_ark = self.series[rec_type]['series_info']['base_ark']
            obj.mods.content.series.full_id = self.series[rec_type]['series_info']['id']
            obj.mods.content.series.short_id = self.series[rec_type]['series_info']['short_id']
        else:
            if self.verbosity > self.v_none:
                self.stdout.write('Series %s not found\n' % row['rec_type'])

        # set association to master collection
        relation = (obj.uriref, relsextns.isMemberOf, self.master_obj.uriref)
        obj.rels_ext.content.add(relation)
        if self.verbosity > self.v_normal:
            self.stdout.write('Adding %s isMemberOf %s relation on ArrangementObject\n'
                              % (obj.label, self.master_obj.pid))

        # set state to inactive by default
        obj.state = 'I'
        return obj

    def handle(self, *args, **options):
        # collect arrangement pids here to delete later if SimpleCollection fails to save
        self.arrangement_pids = []

        self._create_series_lookup()

        # verbosity: 0 = none, 1 = normal, 2 = all
        self.v_none = 0
        self.v_normal = 1
        if 'verbosity' in options:
            self.verbosity = int(options['verbosity'])
        else:
            self.verbosity = self.v_normal

        # create the repo
        repo_args = {}
        if options.get('username') is not None:
            repo_args['username'] = options.get('username')
        if options.get('password') is not None:
            repo_args['password'] = options.get('password')
        self.repo = Repository(**repo_args)

        # check to make sure all args and options are present
        try:
            file = args[0]
        except IndexError:
            raise CommandError('No CSV file specified')
        try:
            self.master_pid = args[1]
        except IndexError:
            raise CommandError('No master collection pid specified')
        # if -a or --add is used, the new SimpleCollection name is ignored
        try:
            if not options['add']:
                self.simple_collection_name = args[2]
            else:
                self.simple_collection_pid = options['add']
        except IndexError:
            raise CommandError('An existing SimpleCollection pid must be specified '
                               'with the -a option or a new SimpleCollection name '
                               'must be specified as an argument')

        # if master collection does not exist, raise an exception
        self.master_obj = self.repo.get_object(type=CollectionObject, pid=self.master_pid)
        if not self.master_obj.exists:
            raise CommandError('Master Collection %s does not exist' % (self.master_pid))
        else:
            if self.verbosity > self.v_none:
                self.stdout.write('Using Master Collection: %s(%s)\n'
                                  % (self.master_obj.label, self.master_obj.pid))

        # get or create SimpleCollection object
        # TODO: not sure why a try block is needed to prevent a 404 here
        # when it isn't in other places
        try:
            if options['add']:
                simple_collection = self.repo.get_object(type=SimpleCollection,
                                                         pid=self.simple_collection_pid)
            else:
                simple_collection = self.repo.get_object(type=SimpleCollection)
                simple_collection.label = self.simple_collection_name
                simple_collection.dc.content.title = self.simple_collection_name
                simple_collection.mods.content.create_restrictions_on_access()
                simple_collection.mods.content.restrictions_on_access.text = 'Accessioned'
        except:
            raise CommandError('Pid %s does not exist' % self.simple_collection_pid)

        # try to read file into a dict and assign the field names
        try:
            reader = csv.DictReader(open(file, 'rb'),
                                    fieldnames=['id', 'checksum', 'filename',
                                                'rec_type', 'file_type', 'creator',
                                                'attrib', 'created', 'modified',
                                                'computer', 'size'])
            if self.verbosity > self.v_none:
                self.stdout.write('Reading CSV: %s\n' % (file))
        except IOError:
            raise CommandError('Could not read file %s' % file)

        # skip the header row in CSV file
        reader.next()

        # read each row
        csv_read = 0
        arrangement_saved = 0
        errors = 0
        for row in reader:
            try:
                csv_read += 1
                arrangement_object = self._create_arrangement(row)
                if not options['no-act']:
                    try:
                        arrangement_object.save()
                        arrangement_saved += 1
                        self.arrangement_pids.append(arrangement_object.pid)
                        if self.verbosity > self.v_none:
                            self.stdout.write('Saved ArrangementObject %s(%s)\n'
                                              % (arrangement_object.label,
                                                 arrangement_object.pid))
                    except Exception as e:
                        if self.verbosity > self.v_none:
                            self.stdout.write('Error saving ArrangementObject %s: %s\n'
                                              % (arrangement_object.label, e.message))
                        errors += 1
                else:
                    if self.verbosity > self.v_none:
                        self.stdout.write('TEST ArrangementObject %s\n'
                                          % (arrangement_object.label))
                    if self.verbosity > self.v_normal:
                        self.stdout.write('===RELS-EXT===\n')
                        for entry in arrangement_object.rels_ext.content:
                            self.stdout.write('%s\n' % list(entry))
                        self.stdout.write('===MODS===\n')
                        self.stdout.write('%s\n' % arrangement_object.mods.content.serialize())

                # add each ArrangementObject to the SimpleCollection
                relation = (simple_collection.uriref, relsextns.hasMember,
                            arrangement_object.uriref)
                simple_collection.rels_ext.content.add(relation)
                if self.verbosity > self.v_normal:
                    self.stdout.write('Adding hasMember %s relation on SimpleCollection\n'
                                      % (arrangement_object.pid))
            except Exception as e:
                self.stdout.write('Error in record id %s: %s\n' % (row['id'], e))
                errors += 1

        if not options['no-act']:
            try:
                simple_collection.save()
                self.stdout.write('Saved SimpleCollection %s(%s)\n'
                                  % (simple_collection.label, simple_collection.pid))
            except Exception as e:
                if self.verbosity > self.v_none:
                    self.stdout.write('Error saving SimpleCollection %s: %s\n'
                                      % (simple_collection.label, e.message))
                    self.stdout.write('Deleting Arrangement pids so they will not be orphans\n')
                errors += 1
                for pid in self.arrangement_pids:
                    self.repo.purge_object(pid)
                    if self.verbosity > self.v_none:
                        self.stdout.write('Deleting: %s\n' % (pid))
                    arrangement_saved -= 1
        else:
            if self.verbosity > self.v_none:
                self.stdout.write('TEST SimpleCollection %s\n' % (simple_collection.label))
            if self.verbosity > self.v_normal:
                self.stdout.write('===RELS-EXT===\n')
                for entry in simple_collection.rels_ext.content:
                    self.stdout.write('%s\n' % list(entry))
                self.stdout.write('===DC===\n')
                self.stdout.write('%s\n' % simple_collection.dc.content.serialize())
                self.stdout.write('===MODS===\n')
                self.stdout.write('%s\n' % simple_collection.mods.content.serialize())

        # print summary
        self.stdout.write('\n\nSUMMARY\n=======\n')
        self.stdout.write('SimpleCollection: %s(%s)\n'
                          % (simple_collection.label, simple_collection.pid))
        self.stdout.write('Master Collection Object: %s(%s)\n'
                          % (self.master_obj.label, self.master_obj.pid))
        self.stdout.write('%s Records read from CSV file\n' % (csv_read))
        self.stdout.write('%s Records created\n' % (arrangement_saved))
        self.stdout.write('%s Errors\n' % (errors))
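# Hedged example of driving the command programmatically; the command name
# 'migrate_rushdie_csv' is a placeholder -- substitute the actual module name
# under management/commands. The hyphenated 'no-act' dest has to be passed
# via keyword expansion.
from django.core.management import call_command

call_command('migrate_rushdie_csv',
             '/path/to/manifest.csv', 'emory:master-1', 'New SimpleCollection',
             verbosity=2, **{'no-act': True})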
def migrate_aff_diskimage(self, pid):
    creating_application = 'AccessData FTK Imager'
    application_version = 'v3.1.1 CLI'
    migration_event_detail = 'program="%s"; version="%s"' % \
        (creating_application, application_version)
    migration_event_outcome = 'AFF reformatted as E01 using command line ' + \
        'FTK program with settings: --e01 --compress 0 --frag 100T --quiet'

    # use the configured ingest staging area as the base tmp dir
    # for all temporary files
    staging_dir = getattr(settings, 'LARGE_FILE_STAGING_DIR', None)
    # create a tempdir within the large file staging area
    tmpdir = tempfile.mkdtemp(suffix='-aff-migration', dir=staging_dir)
    logger.debug('Using tmpdir %s', tmpdir)

    # retrieve the object to be migrated
    repo = Repository()
    original = repo.get_object(pid, type=DiskImage)

    # check object before migrating
    # - exists in fedora
    if not original.exists:
        raise Exception('%s not found in Fedora' % original.pid)
    # - is a disk image
    if not original.has_requisite_content_models:
        raise Exception('%s is not a DiskImage object' % original.pid)
    # - is an AFF disk image
    if original.provenance.content.object.format.name != 'AFF':
        raise Exception('%s DiskImage format is not AFF' % original.pid)
    # - has not already been migrated
    if original.migrated is not None:
        raise Exception('%s has already been migrated' % original.pid)

    # download the aff disk image to a tempfile
    aff_file = tempfile.NamedTemporaryFile(suffix='.aff',
                                           prefix='keep-%s_' % original.noid,
                                           dir=tmpdir, delete=False)
    logger.debug('Saving AFF as %s for conversion (datastream size: %s)'
                 % (aff_file.name, filesizeformat(original.content.size)))
    try:
        for chunk in original.content.get_chunked_content():
            aff_file.write(chunk)
    except Exception as err:
        raise Exception('Error downloading %s AFF for conversion' % original.pid)
    # close the file handle in case of weird interactions with ftkimager
    aff_file.close()
    aff_size = os.path.getsize(aff_file.name)
    logger.debug('Downloaded %s' % filesizeformat(aff_size))

    # run ftkimager to generate the E01 version
    logger.debug('Running ftkimager to generate E01')
    e01_file = tempfile.NamedTemporaryFile(suffix='.E01',
                                           prefix='keep-%s_' % original.noid,
                                           dir=tmpdir, delete=False)
    # close the file handle in case of weird interactions with ftkimager
    e01_file.close()
    # file handle to capture console output from ftkimager
    ftk_output = tempfile.NamedTemporaryFile(suffix='.txt',
                                             prefix='keep-%s-ftkimager_' % original.noid,
                                             dir=tmpdir)
    logger.debug('E01 temp file is %s' % e01_file.name)
    logger.debug('ftkimager output temp file is %s' % ftk_output.name)

    # ftkimager adds .E01 to the specified filename, so pass in filename without extension
    e01_file_basename, ext = os.path.splitext(e01_file.name)
    convert_command = ['ftkimager', aff_file.name, e01_file_basename,
                       '--e01', '--compress', '0', '--frag', '100T', '--quiet']
    # quiet simply suppresses progress output, which is not meaningful
    # in a captured text file
    logger.debug('conversion command is %s' % ' '.join(convert_command))
    return_val = subprocess.call(convert_command, stdout=ftk_output,
                                 stderr=subprocess.STDOUT)
    logger.debug('ftkimager return value is %s' % return_val)
    ftk_detail_output = '%s.txt' % e01_file.name

    e01_size = os.path.getsize(e01_file.name)
    if e01_size == 0:
        raise Exception('Generated E01 file is 0 size')
    logger.info('Generated E01 (%s) from %s AFF (%s)'
                % (filesizeformat(e01_size), original.pid, filesizeformat(aff_size)))

    # use ftkimager to verify aff and e01 and compare checksums
    aff_checksums = ftkimager_verify(aff_file.name)
    if not aff_checksums:
        raise Exception('Error running ftkimager verify on AFF for %s' % original.pid)
    e01_checksums = ftkimager_verify(e01_file.name)
    if not e01_checksums:
        raise Exception('Error running ftkimager verify on E01 for %s' % original.pid)
    logger.debug('AFF verify checksums: %s'
                 % ', '.join('%s: %s' % (k, v) for k, v in aff_checksums.iteritems()))
    logger.debug('E01 verify checksums: %s'
                 % ', '.join('%s: %s' % (k, v) for k, v in e01_checksums.iteritems()))
    if aff_checksums != e01_checksums:
        raise Exception('AFF and E01 ftkimager verify checksums do not match')

    # create a new diskimage object from the file
    # - calculate file uri for content location
    e01_file_uri = fedora_file_uri(e01_file.name)
    logger.debug('E01 fedora file URI is %s', e01_file_uri)

    # change permissions on tmpdir + files to ensure fedora can access them
    os.chmod(tmpdir, 0775)
    os.chmod(e01_file.name, 0666)
    os.chmod(ftk_output.name, 0666)
    os.chmod(ftk_detail_output, 0666)

    migrated = DiskImage.init_from_file(e01_file.name,
                                        initial_label=original.label,
                                        content_location=e01_file_uri)

    # add ftkimager text output & details as supplemental files
    # - console output captured from subprocess call
    dsobj = migrated.getDatastreamObject('supplement0',
                                         dsobj_type=FileDatastreamObject)
    dsobj.label = 'ftkimager_output.txt'
    dsobj.mimetype = 'text/plain'
    dsobj.checksum = md5sum(ftk_output.name)
    logger.debug('Adding ftkimager console output as supplemental datastream '
                 '%s label=%s mimetype=%s checksum=%s'
                 % (dsobj.id, dsobj.label, dsobj.mimetype, dsobj.checksum))
    dsobj.content = open(ftk_output.name).read()
    # - text file generated by ftkimager alongside the E01
    dsobj2 = migrated.getDatastreamObject('supplement1',
                                          dsobj_type=FileDatastreamObject)
    dsobj2.label = 'ftkimager_summary.txt'
    dsobj2.mimetype = 'text/plain'
    dsobj2.checksum = md5sum(ftk_detail_output)
    logger.debug('Adding ftkimager summary as supplemental datastream '
                 '%s label=%s mimetype=%s checksum=%s'
                 % (dsobj2.id, dsobj2.label, dsobj2.mimetype, dsobj2.checksum))
    dsobj2.content = open(ftk_detail_output).read()

    # set metadata based on original disk image
    # - associate with original
    migrated.original = original
    # copy over descriptive & rights metadata
    # - collection membership
    migrated.collection = original.collection
    # - mods title, covering dates, abstract
    migrated.mods.content.title = original.mods.content.title
    migrated.mods.content.abstract = original.mods.content.abstract
    migrated.mods.content.coveringdate_start = original.mods.content.coveringdate_start
    migrated.mods.content.coveringdate_end = original.mods.content.coveringdate_end
    # - entire rights datastream
    migrated.rights.content = original.rights.content

    ### update generated premis to describe the migration
    premis_ds = migrated.provenance.content
    premis_ds.object.composition_level = 0
    # these values are the same for all migrated AFFs
    premis_ds.object.create_creating_application()
    premis_ds.object.creating_application.name = creating_application
    premis_ds.object.creating_application.version = application_version
    premis_ds.object.creating_application.date = date.today()

    # add relationship to the original object
    rel = PremisRelationship(type='derivation')
    rel.subtype = 'has source'
    rel.related_object_type = 'ark'
    rel.related_object_id = original.mods.content.ark
    # relationship must also reference the migration event on the original,
    # which doesn't exist yet; generate a migration event id now to use for both
    migration_event_id = uuid.uuid1()
    rel.related_event_type = 'UUID'
    rel.related_event_id = migration_event_id
    premis_ds.object.relationships.append(rel)

    ## NOTE: Due to a Fedora bug with checksums and file uri ingest,
    ## content datastream checksum must be cleared out before ingest
    ## and manually checked after.

    # store datastream checksum that would be sent to fedora
    e01_checksum = migrated.content.checksum
    # clear it out so Fedora can ingest without erroring
    migrated.content.checksum = None

    # ingest
    try:
        migrated.save('Ingest migrated version of %s' % original.pid)
        logger.debug('Migrated object ingested as %s' % migrated.pid)
    except DuplicateContent as err:
        raise Exception('Duplicate content detected for %s: %s %s'
                        % (original.pid, err, ', '.join(err.pids)))
    # would probably be good to catch other fedora errors

    # remove temporary files
    for tmpfilename in [aff_file.name, e01_file.name, ftk_output.name, ftk_detail_output]:
        os.remove(tmpfilename)

    # reinitialize migrated object, just to avoid any issues
    # with accessing ark uri for use in original object premis
    migrated = repo.get_object(migrated.pid, type=DiskImage)
    # verify checksum
    if migrated.content.checksum != e01_checksum:
        raise Exception('Checksum mismatch detected on E01 for %s' % migrated.pid)

    # once migrated object has been ingested,
    # update original object with migration information
    # - add rels-ext reference to migrated object
    original.migrated = migrated
    # - update premis with migration event and relationship
    migration_event = PremisEvent()
    migration_event.id_type = 'UUID'
    migration_event.id = migration_event_id
    migration_event.type = 'migration'
    migration_event.date = datetime.now().isoformat()
    migration_event.detail = migration_event_detail
    migration_event.outcome = 'Pass'
    migration_event.outcome_detail = migration_event_outcome
    migration_event.agent_type = 'fedora user'
    migration_event.agent_id = repo.username
    # premis wants both source and outcome objects linked in the event
    link_source = PremisLinkingObject(id_type='ark')
    link_source.id = original.mods.content.ark
    link_source.role = 'source'
    link_outcome = PremisLinkingObject(id_type='ark')
    link_outcome.id = migrated.mods.content.ark
    link_outcome.role = 'outcome'
    migration_event.linked_objects.extend([link_source, link_outcome])
    original.provenance.content.events.append(migration_event)
    # add relation to migrated object in premis object
    rel = PremisRelationship(type='derivation')
    rel.subtype = 'is source of'
    rel.related_object_type = 'ark'
    rel.related_object_id = migrated.mods.content.ark
    rel.related_event_type = 'UUID'
    rel.related_event_id = migration_event.id
    original.provenance.content.object.relationships.append(rel)
    original.save()
    logger.debug('Original disk image updated with migration data')

    # remove aff migration temp dir and any remaining contents
    try:
        shutil.rmtree(tmpdir)
    except OSError:
        # tempdir removal could fail due to nfs files;
        # wait a few seconds and try again
        time.sleep(3)
        try:
            shutil.rmtree(tmpdir)
        except OSError as os_err:
            logger.warning('Failed to remove tmpdir %s : %s', tmpdir, os_err)

    logger.info('Migrated %s AFF to %s E01' % (original.pid, migrated.pid))
    return 'Migrated %s to %s' % (original.pid, migrated.pid)
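# Condensed view of the PREMIS bookkeeping above (illustrative, using only
# fields exercised by the function): one event id ties the migration event
# recorded on the original to the derivation relationship on the migrated copy.
event_id = uuid.uuid1()

rel = PremisRelationship(type='derivation')
rel.subtype = 'has source'
rel.related_event_type = 'UUID'
rel.related_event_id = event_id

event = PremisEvent()
event.id_type = 'UUID'
event.id = event_id
event.type = 'migration'
# rel goes on the migrated object; event (plus an 'is source of' mirror
# relationship) goes on the original, so either record can be followed
# back to the other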
def ingest_files(files, collection, comment, request):
    '''Ingest a dictionary of files as returned by
    :meth:`keep.files.forms.UploadForm.files_to_ingest`.
    Returns a list of dictionaries reporting per-file ingest success or failure.

    :param files: dictionary of files to be ingested
    :param collection: :class:`~keep.collection.models.CollectionObject`
        that newly ingested objects should be associated with
    :param comment: save message for fedora ingest
    :param request: :class:`~django.http.HttpRequest`, to access Fedora and
        ingest new objects as the logged-in user.
    '''
    # NOTE: using this structure for ease of display in django templates (e.g., regroup)
    results = []
    m = magic.Magic(mime=True)
    for filename, label in files.iteritems():
        file_info = {'label': label}

        # check if file is an allowed type
        # NOTE: for single-file upload, browser-set type is
        # available as UploadedFile.content_type - but since
        # browser mimetypes are unreliable, calculate anyway
        try:
            mimetype = m.from_file(filename)
        except IOError:
            raise Exception('Uploaded file is no longer available for ingest; please try again.')

        mimetype, separator, options = mimetype.partition(';')
        if mimetype not in allowed_upload_types(request.user):
            # store error for display on detailed result page
            file_info.update({'success': False,
                              'message': '''File type '%s' is not allowed''' % mimetype})
            # if not an allowed type, no further processing
            results.append(file_info)
            continue

        if collection is None:
            file_info.update({'success': False,
                              'message': 'Collection not selected'})
            results.append(file_info)
            continue

        # if there is an MD5 file (i.e., file was uploaded via ajax),
        # use the contents of that file as checksum
        if os.path.exists(filename + '.md5'):
            with open(filename + '.md5') as md5file:
                md5 = md5file.read()
        # otherwise, calculate the MD5 (single-file upload)
        else:
            md5 = md5sum(filename)

        # determine what type of object to initialize based on mimetype
        # (mimetype was vetted against allowed types above, so a matching
        # object type should be found)
        objtype = None
        for t in uploadable_objects:
            if mimetype in t.allowed_mimetypes:
                objtype = t
                break

        # initialize a new object from the file
        obj = objtype.init_from_file(filename, initial_label=label,
                                     request=request, checksum=md5,
                                     mimetype=mimetype)

        # set collection on ingest
        obj.collection = collection

        try:
            # NOTE: by sending a log message, we force Fedora to store an
            # audit trail entry for object creation, which doesn't happen otherwise
            obj.save(comment)
            file_info.update({'success': True, 'pid': obj.pid,
                              'url': obj.get_absolute_url(), 'checksum': md5})
            # if audio, needs an additional step:
            if objtype == AudioObject:
                # Start asynchronous task to convert audio for access
                # NOTE: not passing in user-uploaded file so that
                # celery can more easily be run on a separate server
                queue_access_copy(obj)

            # remove the file now that we have successfully ingested
            os.remove(filename)
            # NOTE: could remove MD5 file (if any) here, but MD5 files
            # should be small and will get cleaned up by the cron script

        # special case: detected as duplicate content
        except DuplicateContent as e:
            # mark as failed and generate message with links to records
            links = []
            repo = Repository(request=request)
            for pid in e.pids:
                # use fedora type-inferring logic with list of content models
                # pulled from solr results
                obj = repo.get_object(pid,
                                      type=repo.best_subtype_for_object(pid, e.pid_cmodels[pid]))
                # use appropriate object class to get the object url
                links.append('<a href="%s">%s</a>' % (obj.get_absolute_url(), pid))

            msg = mark_safe('%s: %s' % (unicode(e), '; '.join(links)))
            file_info.update({'success': False, 'message': msg})

        except Exception as e:
            logger.error('Error ingesting %s: %s' % (filename, e))
            logger.debug("Error details:\n" + traceback.format_exc())
            file_info['success'] = False

            # check for Fedora-specific errors
            if isinstance(e, RequestFailed):
                if 'Checksum Mismatch' in e.detail:
                    file_info['message'] = 'Ingest failed due to a checksum mismatch - ' + \
                        'file may have been corrupted or incompletely uploaded to Fedora'
                else:
                    file_info['message'] = 'Fedora error: ' + unicode(e)
            # non-fedora error
            else:
                file_info['message'] = 'Ingest failed: ' + unicode(e)

        finally:
            # no matter what happened, store results for reporting to user
            results.append(file_info)

    return results
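# A hypothetical sketch of how ingest_files might be driven from an upload
# view. The view name, form handling, and template path are illustrative
# assumptions, not taken from the application code.
def upload(request):
    form = UploadForm(request.POST, request.FILES)
    if form.is_valid():
        # files_to_ingest returns {filename: label}, the dictionary shape
        # ingest_files expects
        results = ingest_files(form.files_to_ingest(),
                               form.cleaned_data['collection'],
                               'initial repository ingest', request)
        return render(request, 'file/uploaded.html',
                      {'ingest_results': results})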
class ArrangementObjectTest(KeepTestCase):

    def setUp(self):
        self.repo = Repository()
        self.pids = []

        # create test collection
        coll = self.repo.get_object(type=CollectionObject)
        coll.pid = '%s:parent-1' % settings.FEDORA_PIDSPACE
        coll.mods.content.source_id = '12345'
        coll.save()
        self.pids.append(coll.pid)

        # create test arrangement object
        self.arr = self.repo.get_object(type=ArrangementObject)
        self.arr.pid = 'foo:1'
        self.arr.collection = coll

    def tearDown(self):
        for pid in self.pids:
            self.repo.purge_object(pid)

    @patch('keep.arrangement.models.solr_interface', spec=sunburnt.SolrInterface)
    def test_by_arrangement_id(self, mocksolr):
        # no match
        self.assertRaises(ObjectDoesNotExist,
                          ArrangementObject.by_arrangement_id, 42)
        solr = mocksolr.return_value
        solr.query.assert_called_with(arrangement_id=42,
                                      content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL)
        solr.query.return_value.field_limit.assert_called_with('pid')

        # too many matches
        solr.query.return_value.field_limit.return_value = [{'pid': 'pid:1'},
                                                            {'pid': 'pid:2'}]
        self.assertRaises(MultipleObjectsReturned,
                          ArrangementObject.by_arrangement_id, 42)

        # one match
        solr.query.return_value.field_limit.return_value = [{'pid': 'pid:1'}]
        ao = ArrangementObject.by_arrangement_id(42)
        self.assert_(isinstance(ao, ArrangementObject))

        # custom repo object
        mockrepo = Mock()
        ao = ArrangementObject.by_arrangement_id(42, mockrepo)
        mockrepo.get_object.assert_called_with('pid:1', type=ArrangementObject)

    def test_arrangement_status(self):
        obj = ArrangementObject(Mock())
        obj.arrangement_status = 'processed'
        self.assertEqual('A', obj.state)
        self.assertEqual('processed', obj.arrangement_status)

        obj.arrangement_status = 'accessioned'
        self.assertEqual('I', obj.state)
        self.assertEqual('accessioned', obj.arrangement_status)

        value_error = None
        try:
            obj.arrangement_status = 'bogus'
        except ValueError:
            value_error = True
        self.assertTrue(value_error,
                        'attempting to assign an unknown status should raise a ValueError')

    def test_update_access_cmodel(self):
        obj = ArrangementObject(Mock())
        # no status set - should be set to restricted
        obj._update_access_cmodel()
        self.assert_((obj.uriref, modelns.hasModel, URIRef(ACCESS_RESTRICTED_CMODEL))
                     in obj.rels_ext.content)
        self.assert_((obj.uriref, modelns.hasModel, URIRef(ACCESS_ALLOWED_CMODEL))
                     not in obj.rels_ext.content)

        # set to status code 2 = access allowed
        obj.rights.content.create_access_status()
        obj.rights.content.access_status.code = '2'
        obj._update_access_cmodel()
        self.assert_((obj.uriref, modelns.hasModel, URIRef(ACCESS_RESTRICTED_CMODEL))
                     not in obj.rels_ext.content)
        self.assert_((obj.uriref, modelns.hasModel, URIRef(ACCESS_ALLOWED_CMODEL))
                     in obj.rels_ext.content)

    def test_index_data(self):
        idx_data = self.arr.index_data()
        self.assertEqual('born-digital', idx_data['object_type'])
        self.assertEqual(self.arr.pid, idx_data['pid'])
        self.assertIn(self.arr.owner, idx_data['owner'])
        self.assertEquals(self.arr.collection.pid, idx_data['collection_id'])
        self.assertEquals(self.arr.collection.mods.content.source_id,
                          idx_data['collection_source_id'])

    # Test the update_ark_label method in keep.common.fedora
    # Note that this test is a simplified version of
    # keep.common.fedora:ArkPidDigitalObject.test_update_ark_label
    # The update_ark_label here is an overridden method that is more
    # specific, and is used on Arrangement objects

    @patch('keep.arrangement.models.pidman')  # mock the pidman client (the API service)
    def test_update_ark_label(self, mockpidman):
        # Create an ArrangementObject
        arrangement_object = ArrangementObject(Mock())
        # Set a pid on the object so that it can internally generate a noid etc.
        arrangement_object.pid = "test:1234"

        # Simulate when the object doesn't exist (or hasn't been saved).
        # By default it appears as if it doesn't exist
        arrangement_object.update_ark_label()
        # What we should expect is that update_ark_label is not called on pidman
        # Also there shouldn't be any errors
        # Use the mock assertFalse to check if a method is called or not
        self.assertFalse(mockpidman.get_ark.called)

        # Mock when the object exists (returns True)
        # Note: Need to set the Mock on the class and not the object because
        # this (exists) is a property method
        with patch.object(ArrangementObject, 'exists', new=Mock(return_value=True)):
            arrangement_object.update_ark_label()
            self.assertFalse(mockpidman.get_ark.called)

        # Set the label before the object "exists" so we don't trigger API calls
        arrangement_object.dc.content.title = "testpid"

        with patch.object(ArrangementObject, 'exists', new=Mock(return_value=True)):
            mockpidman.get_ark.return_value = {"name": arrangement_object.dc.content.title}
            arrangement_object.update_ark_label()
            # assert that it is called with a noid too
            mockpidman.get_ark.assert_called_with(arrangement_object.noid)
            self.assertFalse(mockpidman.update_ark.called)

            # When the label is different from that in Pidman
            mockpidman.get_ark.return_value = {"name": "another pid"}
            arrangement_object.update_ark_label()
            mockpidman.get_ark.assert_called_with(arrangement_object.noid)
            mockpidman.update_ark.assert_called_with(noid=arrangement_object.noid,
                                                     name=arrangement_object.dc.content.title)

    def test_set_premis_object(self):
        mockapi = Mock()
        arrangement_object = ArrangementObject(mockapi)
        arrangement_object.pid = "test:1234"
        arrangement_object.mods.content.ark = 'ark:/1234/987'

        # return empty iterator for original data to checksum
        mockapi.getDatastreamDissemination.return_value = []
        with patch.object(arrangement_object, 'getDatastreamObject') as mockgetds:
            mockgetds.return_value.checksum = '123456789'
            mockgetds.return_value.mimetype = 'text/plain'
            arrangement_object.set_premis_object()

        self.assert_(arrangement_object.provenance.content.object)
        premis = arrangement_object.provenance.content
        # FIXME: placeholder tests for placeholder functionality,
        # should be updated to use ARK uri once that is implemented
        self.assertEqual('ark', premis.object.id_type)
        self.assertEqual(arrangement_object.mods.content.ark, premis.object.id)
        self.assertEqual('p:file', premis.object.type)
        self.assertEqual(0, premis.object.composition_level)
        self.assertEqual('MD5', premis.object.checksums[0].algorithm)
        self.assertEqual('123456789', premis.object.checksums[0].digest)
        # sha1 for an empty file
        empty_sha1 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709'
        self.assertEqual('SHA-1', premis.object.checksums[1].algorithm)
        self.assertEqual(empty_sha1, premis.object.checksums[1].digest)
        # object format should be original mimetype
        self.assertEqual('text/plain', premis.object.format.name)

        # generated premis should be valid
        self.assertTrue(premis.is_valid())

    def test_identifier_change_event(self):
        mockapi = Mock()
        mockapi.username = '******'
        arrangement_object = ArrangementObject(mockapi)
        arrangement_object.pid = 'test:1234'
        arrangement_object.mods.content.ark = 'ark:/1234/987'
        # set object premis so we can validate
        mockapi.getDatastreamDissemination.return_value = []
        with patch.object(arrangement_object, 'getDatastreamObject') as mockgetds:
            mockgetds.return_value.checksum = '123456789'
            mockgetds.return_value.mimetype = 'text/plain'
            arrangement_object.set_premis_object()

        arrangement_object.identifier_change_event('old-pid:1')
        premis = arrangement_object.provenance.content
        self.assertEqual(1, len(premis.events))
        event = premis.events[0]
        self.assertEqual('UUID', event.id_type)
        # id should be set, we don't care what it is exactly
        self.assert_(event.id)
        self.assertEqual('identifier assignment', event.type)
        self.assertEqual('program="keep"; version="%s"' % __version__, event.detail)
        self.assertEqual('Pass', event.outcome)
        msg = 'Persistent identifier reassigned from %s to %s' % \
            ('old-pid:1', arrangement_object.pid)
        self.assertEqual(msg, event.outcome_detail)
        self.assertEqual('fedora user', event.agent_type)
        self.assertEqual('fedoraAdmin', event.agent_id)
        # generated premis should be valid
        self.assertTrue(premis.is_valid())
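# From the mocked calls exercised in test_by_arrangement_id above,
# by_arrangement_id presumably follows a find-one-via-solr pattern. This is
# a reconstruction for illustration only, inferred from the test, not the
# actual keep.arrangement.models source.
@staticmethod
def by_arrangement_id(id, repo=None):
    solr = solr_interface()
    # query solr by arrangement id, restricted to arrangement objects,
    # and only pull back pids
    matches = solr.query(arrangement_id=id,
                         content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL) \
                  .field_limit('pid')
    if len(matches) == 0:
        raise ObjectDoesNotExist('No ArrangementObject found for arrangement id %s' % id)
    if len(matches) > 1:
        raise MultipleObjectsReturned('Multiple ArrangementObjects found for arrangement id %s' % id)
    if repo is None:
        repo = Repository()
    return repo.get_object(matches[0]['pid'], type=ArrangementObject)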
def get_numbering(self, pid):
    if pid in settings.PID_ALIASES:
        pid = settings.PID_ALIASES[pid]
    repo = Repository()
    return repo.get_object(pid, type=CollectionObject)
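# Hypothetical illustration of the PID_ALIASES lookup used by get_numbering
# above; the alias names and pids below are made up for the example.
PID_ALIASES = {
    'marbl': 'emory:8mjb2',   # short alias -> full collection pid
    'eua': 'emory:x92bk',
}
# get_numbering('marbl') would then resolve the alias and return the
# CollectionObject for 'emory:8mjb2'; an unaliased pid is passed through
# unchanged.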
class EmailMessageTest(KeepTestCase):

    def setUp(self):
        self.repo = Repository()
        self.pids = []

        # test EmailMessage
        self.email = self.repo.get_object(type=EmailMessage)
        self.email.cerp.content.from_list = ['*****@*****.**']
        self.email.cerp.content.to_list = ['*****@*****.**']
        self.email.cerp.content.subject_list = ['Interesting Subject']

    def tearDown(self):
        for pid in self.pids:
            self.repo.purge_object(pid)

    def test_headers(self):
        h1 = cerp.Header()
        h1.name = "HEADER 1"
        h1.value = "value for header 1"
        h2 = cerp.Header()
        h2.name = "HEADER 2"
        h2.value = "value for header 2"
        self.email.cerp.content.headers.append(h1)
        self.email.cerp.content.headers.append(h2)
        self.assertEqual(self.email.headers['HEADER 1'], 'value for header 1')
        self.assertEqual(self.email.headers['HEADER 2'], 'value for header 2')

    def test_email_label(self):
        # no object label and one person in to field
        label = self.email.email_label()
        self.assertEqual('Email from [email protected] to [email protected] Interesting Subject',
                         label, 'should construct label when it does not exist')

        # more than one person in to list
        self.email.cerp.content.to_list.append('*****@*****.**')
        label = self.email.email_label()
        self.assertEqual('Email from [email protected] to [email protected] et al. Interesting Subject',
                         label,
                         'only show first to email address when there are more than one')

        # no subject
        self.email.cerp.content.subject_list = []
        self.assertEqual('Email from [email protected] to [email protected] et al.',
                         self.email.email_label(),
                         'display message without subject when no subject is present')

        # has a date
        date_header = cerp.Header()
        date_header.name = 'Date'
        date_header.value = 'Friday 13 200 13:00'
        self.email.cerp.content.headers.append(date_header)
        label = self.email.email_label()
        self.assertEqual('Email from [email protected] to [email protected] et al. on Friday 13 200 13:00',
                         label,
                         'include the date header in the label when it is present')

        # object label already exists
        self.email.label = "label we want to keep"
        label = self.email.email_label()
        self.assertEqual(self.email.label, label,
                         'label should be preserved when it exists')

    def test_index_data(self):
        # NOTE: logic for creating the label is in the label test
        # test to make sure label exists in index data
        data = self.email.index_data()
        self.assertIn('label', data.keys())

        # mime_data does not exist, so no checksum should be indexed
        self.assert_('content_md5' not in data,
                     'content_md5 should not be set when mime data does not exist')

        # patch mime data to test the checksum when mime data exists
        with patch.object(self.email, 'mime_data', Mock()) as mock_mime:
            mock_mime.exists = True
            mock_mime.checksum = 'test checksum value'
            data = self.email.index_data()
            self.assertEqual(self.email.mime_data.checksum, data['content_md5'])

    @patch('keep.arrangement.models.solr_interface', spec=sunburnt.SolrInterface)
    def test_by_checksum(self, mocksolr):
        # no match
        self.assertRaises(ObjectDoesNotExist, EmailMessage.by_checksum, 42)
        solr = mocksolr.return_value
        solr.query.assert_called_with(content_md5=42,
                                      content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL)
        solr.query.return_value.field_limit.assert_called_with('pid')

        # too many matches
        solr.query.return_value.field_limit.return_value = [{'pid': 'pid:1'},
                                                            {'pid': 'pid:2'}]
        self.assertRaises(MultipleObjectsReturned, EmailMessage.by_checksum, 42)

        # one match
        solr.query.return_value.field_limit.return_value = [{'pid': 'pid:1'}]
        em = EmailMessage.by_checksum(42)
        self.assert_(isinstance(em, EmailMessage))

        # custom repo object
        mockrepo = Mock()
        em = EmailMessage.by_checksum(42, mockrepo)
        mockrepo.get_object.assert_called_with('pid:1', type=EmailMessage)

    @patch('keep.arrangement.models.solr_interface', spec=sunburnt.SolrInterface)
    def test_by_message_id(self, mocksolr):
        # no match
        self.assertRaises(ObjectDoesNotExist, EmailMessage.by_message_id,
                          '<*****@*****.**>')
        solr = mocksolr.return_value
        solr.query.assert_called_with(arrangement_id='<*****@*****.**>',
                                      content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL)
        solr.query.return_value.field_limit.assert_called_with('pid')
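# The label behavior exercised in test_email_label suggests email_label
# builds its string roughly as follows. This is a sketch inferred from the
# test assertions, not the real EmailMessage model code.
def email_label(self):
    # an existing object label always wins
    if self.label:
        return self.label
    to = self.cerp.content.to_list
    label = 'Email from %s to %s' % (self.cerp.content.from_list[0], to[0])
    # only show the first recipient, marking that others were elided
    if len(to) > 1:
        label += ' et al.'
    # include the date header when present
    date = self.headers.get('Date')
    if date:
        label += ' on %s' % date
    # append the subject when present
    if self.cerp.content.subject_list:
        label += ' %s' % self.cerp.content.subject_list[0]
    return label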
def check_wav_mp3_duration(obj_pid=None, wav_file_path=None, mp3_file_path=None):
    '''Compare the durations of a wav file with an mp3 file (presumably an
    mp3 generated from the wav via :meth:`keep.audio.tasks.convert_wav_to_mp3`)
    to check that they are roughly the same length.

    :param obj_pid: The pid of a fedora object (expected to be an
        AudioObject) to get the wav and/or mp3 files from if they are not
        specified by path.
    :param wav_file_path: Path to the wav file to use for comparison; if not
        specified, it will be downloaded from the object in Fedora.
    :param mp3_file_path: Path to the mp3 file to use for comparison; if not
        specified, it will be downloaded from the object in Fedora. Note
        that this file must end in .mp3 for the duration to be calculated.

    :returns: True if the two durations differ by less than the allowed
        discrepancy (the AUDIO_ALLOWED_DURATION_DISCREPANCY setting,
        defaulting to 1 second)
    '''
    try:
        # initialize temporary files to None
        tmp_wav_path = None
        tmp_mp3_path = None

        # initialize connection to the repository
        repo = Repository()

        # using the ingest directory to simplify cleanup in case extra files hang around
        tempdir = settings.INGEST_STAGING_TEMP_DIR
        if not os.path.exists(tempdir):
            os.makedirs(tempdir)

        # if no wav file is specified, use the object
        if wav_file_path is None:
            # load the object
            obj = repo.get_object(obj_pid, type=AudioObject)

            # download the master audio file from the object in fedora
            # mkstemp returns file descriptor and full path to the temp file
            tmp_fd_wav, tmp_wav_path = tempfile.mkstemp(dir=tempdir, suffix=".wav")
            try:
                destination = os.fdopen(tmp_fd_wav, 'wb+')
            except Exception:
                os.close(tmp_fd_wav)
                raise

            try:
                destination.write(obj.audio.content.read())
            # just pass any exceptions up the chain
            finally:
                # NOTE: This automatically closes the open tmpfd via Python magic;
                # calling os.close(tmpfd) at this point will error.
                destination.close()

        # else use the passed-in wav file
        else:
            tmp_wav_path = wav_file_path

        # if no mp3 file is specified, use the object
        if mp3_file_path is None:
            # load the object
            obj = repo.get_object(obj_pid, type=AudioObject)

            # verify the compressed datastream exists; if not, return False
            # since there is nothing to compare against
            if not obj.compressed_audio.exists:
                return False

            # download the compressed audio file from the object in fedora
            # mkstemp returns file descriptor and full path to the temp file
            tmp_fd_mp3, tmp_mp3_path = tempfile.mkstemp(dir=tempdir, suffix=".mp3")
            try:
                destination = os.fdopen(tmp_fd_mp3, 'wb+')
            except Exception:
                os.close(tmp_fd_mp3)
                raise

            try:
                destination.write(obj.compressed_audio.content.read())
            # just pass any exceptions up the chain
            finally:
                # NOTE: This automatically closes the open tmpfd via Python magic;
                # calling os.close(tmpfd) at this point will error.
                destination.close()

        # else use the passed-in mp3 file
        else:
            tmp_mp3_path = mp3_file_path

        # get information on the mp3 file using mutagen
        mp3_tags = mutagen.File(tmp_mp3_path)
        if mp3_tags is None:
            raise Exception('Could not get MP3 tag information for MP3 file %s' % tmp_mp3_path)
        mp3_length = mp3_tags.info.length
        wav_length = wav_duration(tmp_wav_path)

        # Verify the wav file and the mp3 file have the same duration,
        # within the configured allowed discrepancy
        # - use a default value so this doesn't fail when not configured
        allowed_discrepancy = getattr(settings, 'AUDIO_ALLOWED_DURATION_DISCREPANCY', 1.0)
        return math.fabs(mp3_length - wav_length) < allowed_discrepancy

    # cleanup for everything
    finally:
        # only remove wav if file was not passed in (i.e., only remove the temporary file)
        if wav_file_path is None and tmp_wav_path is not None:
            if os.path.exists(tmp_wav_path):
                os.remove(tmp_wav_path)
        # only remove mp3 if file was not passed in (i.e., only remove the temporary file)
        if mp3_file_path is None and tmp_mp3_path is not None:
            if os.path.exists(tmp_mp3_path):
                os.remove(tmp_mp3_path)
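# A hypothetical usage sketch for check_wav_mp3_duration; the pid and file
# paths below are illustrative only.

# compare the stored wav master and mp3 access copy for a repository object:
if not check_wav_mp3_duration('keep:audio-1'):
    logger.warning('access copy duration does not match master')

# or compare two local files directly, skipping the Fedora downloads:
assert check_wav_mp3_duration(wav_file_path='/tmp/master.wav',
                              mp3_file_path='/tmp/access.mp3')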