Beispiel #1
0
    def handle(self, *args, **options):
        '''Repair ARKs for the requested objects and log a summary.

        Each positional argument is treated as a pid and is loaded first as
        a :class:`CollectionObject`, then as an :class:`AudioObject`; the
        first one that has its requisite content models is processed.  When
        no pids are given, every object with the collection content model
        is processed instead.

        :param args: pids of the objects to repair (optional)
        :param options: command options, stored on ``self.options``
        '''
        self.options = options
        self.repaired_count = 0
        self.unrepaired_count = 0

        repo = Repository()
        self.pidman = DjangoPidmanRestClient()

        not_found_msg = "Could not find Collection or Audio object for: %s"

        # populate list of objects to be processed
        objects = []
        for pid in args:
            match = None
            try:
                # try the pid as a collection first, then as an audio item
                for obj_type in (CollectionObject, AudioObject):
                    candidate = repo.get_object(pid=pid, type=obj_type)
                    if candidate.has_requisite_content_models:
                        match = candidate
                        break
            except Exception:
                # best-effort lookup: any error is reported as "not found" below
                match = None

            if match is None:
                # report pids that match neither type as well as lookup
                # errors, instead of silently skipping them
                self.log(message=not_found_msg % pid)
            else:
                objects.append(match)

        # with no pids specified, process every collection in the repository:
        # lookup is by COLLECTION_CONTENT_MODEL, returned as CollectionObject
        if not args:
            objects = repo.get_objects_with_cmodel(CollectionObject.COLLECTION_CONTENT_MODEL, type=CollectionObject)

        if not objects:
            self.log(message="No Collections were found.")

        for obj in objects:
            self.repair_ark(obj)

        self.log(message="\n\n%s ARKs repaired\n%s ARKs were not repaired" % (self.repaired_count, self.unrepaired_count), no_label=True)
Beispiel #2
0
 def englishdocs_collection():
     '''Build (without saving) a new :class:`CollectionObject` fixture for
     the English documents collection, dated 1509-1805 and assigned to the
     second entry returned by ``FedoraFixtures.archives()``.

     :returns: the initialized :class:`CollectionObject`
     '''
     title = 'English documents collection'
     repo = Repository()
     obj = repo.get_object(type=CollectionObject)
     obj.label = title

     content = obj.mods.content
     content.title = title
     content.source_id = '309'

     # parent archive is looked up by the uri of the fixture archive
     obj.collection = repo.get_object(FedoraFixtures.archives()[1].uri)

     # date range for the collection: start and end points
     content.create_origin_info()
     for year, point in ((1509, 'start'), (1805, 'end')):
         content.origin_info.created.append(
             mods.DateCreated(date=year, point=point))
     return obj
Beispiel #3
0
 def rushdie_collection():
     '''Build (without saving) a new :class:`CollectionObject` fixture for
     the Salman Rushdie Collection, dated 1947-2008, with a single name
     part and assigned to the second entry returned by
     ``FedoraFixtures.archives()``.

     :returns: the initialized :class:`CollectionObject`
     '''
     title = 'Salman Rushdie Collection'
     repo = Repository()
     obj = repo.get_object(type=CollectionObject)
     obj.label = title

     content = obj.mods.content
     content.title = title
     content.source_id = '1000'

     # parent archive is looked up by the uri of the fixture archive
     obj.collection = repo.get_object(FedoraFixtures.archives()[1].uri)

     # date range for the collection: start and end points
     content.create_origin_info()
     for year, point in ((1947, 'start'), (2008, 'end')):
         content.origin_info.created.append(
             mods.DateCreated(date=year, point=point))

     # creator name
     content.create_name()
     content.name.name_parts.append(mods.NamePart(text='Salman Rushdie'))
     return obj
Beispiel #4
0
 def esterbrook_collection():
     '''Build (without saving) a new :class:`CollectionObject` fixture for
     the Thomas Esterbrook letter books, dated 1855-1861, with a single
     name part and assigned to the third entry returned by
     ``FedoraFixtures.archives()``.

     :returns: the initialized :class:`CollectionObject`
     '''
     title = 'Thomas Esterbrook letter books'
     repo = Repository()
     obj = repo.get_object(type=CollectionObject)
     obj.label = title

     content = obj.mods.content
     content.title = title
     content.source_id = '123'

     # parent archive is looked up by the uri of the fixture archive
     obj.collection = repo.get_object(FedoraFixtures.archives()[2].uri)

     # date range for the collection: start and end points
     content.create_origin_info()
     for year, point in ((1855, 'start'), (1861, 'end')):
         content.origin_info.created.append(
             mods.DateCreated(date=year, point=point))

     # creator name
     content.create_name()
     content.name.name_parts.append(mods.NamePart(text='Thomas Esterbrook'))
     return obj
Beispiel #5
0
def view(request, pid):
    '''View a single :class:`~keep.video.models.Video`.

    The user must either have the general view video permission or, if
    they only have the researcher view permission, the object must be
    researcher accessible (based on rights codes).

    :param request: the current request; its user is checked for
        permissions and it is used to initialize the repository connection
    :param pid: pid of the video object to display
    :returns: the rendered ``video/view.html`` page, or the result of
        ``prompt_login_or_403`` when the user may not view the object
    :raises Http404: if the object does not have the requisite content
        models, or if checking them fails
    '''
    repo = Repository(request=request)
    obj = repo.get_object(pid=pid, type=Video)

    # user either needs view video permissions OR, if they can view
    # researcher video, the object must be researcher-accessible
    viewable = request.user.has_perm('video.view_video') or \
        (request.user.has_perm('video.view_researcher_video') and
         bool(obj.researcher_access))

    if not viewable:
        return prompt_login_or_403(request)

    # any error while checking content models is treated as "not a video";
    # catch Exception rather than using a bare except so that
    # SystemExit / KeyboardInterrupt are not swallowed
    try:
        is_video = obj.has_requisite_content_models
    except Exception:
        is_video = False

    if not is_video:
        raise Http404

    return render(request, 'video/view.html', {"resource": obj})
class Command(BaseCommand):
    '''Generate access copies for PIDs specified on the command line.'''
    help = __doc__

    def handle(self, *args, **options):
        '''Process every pid given as a positional argument.

        :param args: pids to process
        :param options: command options; ``verbosity`` is required
        '''
        # coerce to int so the numeric comparisons below behave the same
        # if the option is delivered as a string
        self.verbosity = int(options['verbosity'])
        self.repo = Repository()

        for pid in args:
            self.process_pid(pid)

    def process_pid(self, pid):
        '''Process a single PID by looking it up in the repository, figuring
        out what kind of processing it needs based on its object type, and
        doing that.

        Objects that do not exist are skipped; only :class:`AudioObject`
        instances are currently handled.

        :param pid: pid of the object to process
        '''

        obj = self.repo.get_object(pid=pid, type=self.repo.infer_object_subtype)
        if not obj.exists:
            if self.verbosity >= 1:
                print("No such PID; skipped: %s" % pid)
            # always skip missing objects; previously the return was nested
            # under the verbosity check, so quiet runs fell through
            return

        if isinstance(obj, AudioObject):
            if self.verbosity >= 2:
                print("Generating audio access copy: %s" % pid)
            queue_access_copy(obj)
        else:
            if self.verbosity >= 1:
                print("Unhandled  object type; skipped: %s" % pid)
Beispiel #7
0
 def archives(format=None):
     '''Return the archives configured in ``settings.PID_ALIASES``.

     :param format: when this is the ``dict`` type, return a list of
         dictionaries with ``title`` (alias) and ``pid`` keys instead of
         repository objects
     :returns: list of dictionaries, or a list of
         :class:`CollectionObject` instances; the object list is built
         once and cached on ``FedoraFixtures._archives``
     '''
     if format == dict:
         summaries = []
         for nick, pid in settings.PID_ALIASES.iteritems():
             summaries.append({'title': nick, 'pid': pid})
         return summaries

     # load repository objects only on first use
     if not hasattr(FedoraFixtures, '_archives'):
         repo = Repository()
         loaded = []
         for pid in settings.PID_ALIASES.itervalues():
             loaded.append(repo.get_object(pid, type=CollectionObject))
         FedoraFixtures._archives = loaded
     return FedoraFixtures._archives
Beispiel #8
0
def download(request, pid):
    '''Download disk image datastream contents.

    The response is served as an attachment named after the object noid,
    with the name of the latest format from the provenance metadata as
    the file extension.

    :param request: the current request
    :param pid: pid of the :class:`DiskImage` to download
    :returns: the result of ``raw_datastream`` for the content datastream
    '''
    repo = Repository(request=request)
    obj = repo.get_object(pid, type=DiskImage)

    noid = obj.noid
    format_name = obj.provenance.content.object.latest_format.name
    disposition = "attachment; filename=%s.%s" % (noid, format_name)

    return raw_datastream(
        request, pid, DiskImage.content.id,
        repo=repo, headers={'Content-Disposition': disposition})
Beispiel #9
0
 def simple_collection(label=None, status=None, pid=None):
     '''Build (without saving) a new :class:`SimpleCollection` fixture.

     :param label: optional object label
     :param status: optional text for the MODS restrictions on access
     :param pid: optional pid to assign to the object
     :returns: the initialized :class:`SimpleCollection`
     '''
     repo = Repository()
     collection = repo.get_object(type=SimpleCollection)
     if label is not None:
         collection.label = label

     # the restrictions on access element is always created,
     # but only populated when a status was given
     content = collection.mods.content
     content.create_restrictions_on_access()
     if status is not None:
         content.restrictions_on_access.text = status

     if pid is not None:
         collection.pid = pid
     return collection
Beispiel #10
0
 def simple_collection(label=None, status=None, pid=None):
     '''Create a new, unsaved :class:`SimpleCollection` for use in tests.

     Every argument is optional and is only applied when it is not None.

     :param label: label for the new object
     :param status: text to store in the MODS restrictions on access
     :param pid: pid to set on the new object
     :returns: the new :class:`SimpleCollection`
     '''
     repo = Repository()
     obj = repo.get_object(type=SimpleCollection)

     has_label = label is not None
     if has_label:
         obj.label = label

     # create the (possibly empty) restrictions on access element
     mods_content = obj.mods.content
     mods_content.create_restrictions_on_access()
     has_status = status is not None
     if has_status:
         mods_content.restrictions_on_access.text = status

     has_pid = pid is not None
     if has_pid:
         obj.pid = pid

     return obj
Beispiel #11
0
    def handle(self, *args, **options):
        '''Repair ARKs for the requested objects and log a summary.

        Each positional argument is treated as a pid and is loaded first as
        a :class:`CollectionObject`, then as an :class:`AudioObject`; the
        first one that has its requisite content models is processed.  When
        no pids are given, every object with the collection content model
        is processed instead.

        :param args: pids of the objects to repair (optional)
        :param options: command options, stored on ``self.options``
        '''
        self.options = options
        self.repaired_count = 0
        self.unrepaired_count = 0

        repo = Repository()
        self.pidman = DjangoPidmanRestClient()

        not_found_msg = "Could not find Collection or Audio object for: %s"

        # populate list of objects to be processed
        objects = []
        for pid in args:
            match = None
            try:
                # try the pid as a collection first, then as an audio item
                for obj_type in (CollectionObject, AudioObject):
                    candidate = repo.get_object(pid=pid, type=obj_type)
                    if candidate.has_requisite_content_models:
                        match = candidate
                        break
            except Exception:
                # best-effort lookup: any error is reported as "not found" below
                match = None

            if match is None:
                # report pids that match neither type as well as lookup
                # errors, instead of silently skipping them
                self.log(message=not_found_msg % pid)
            else:
                objects.append(match)

        # with no pids specified, process every collection in the repository:
        # lookup is by COLLECTION_CONTENT_MODEL, returned as CollectionObject
        if not args:
            objects = repo.get_objects_with_cmodel(
                CollectionObject.COLLECTION_CONTENT_MODEL,
                type=CollectionObject)

        if not objects:
            self.log(message="No Collections were found.")

        for obj in objects:
            self.repair_ark(obj)

        self.log(message="\n\n%s ARKs repaired\n%s ARKs were not repaired" %
                 (self.repaired_count, self.unrepaired_count),
                 no_label=True)
Beispiel #12
0
def _objects_by_type(type_uri, type=None):
    """
    Generator of repository objects whose RDF type is the specified
    type_uri, each initialized as an object of the specified type.

    :param type_uri: The uri of the type being searched
    :param type: The type of object that should be yielded
    """
    repo = Repository()

    # gather every matching pid before the first object is yielded
    matching_pids = list(repo.risearch.get_subjects(RDF.type, type_uri))

    for matching_pid in matching_pids:
        yield repo.get_object(pid=matching_pid, type=type)
Beispiel #13
0
def download(request, pid):
    '''Download disk image datastream contents.

    Sets a Content-Disposition header so the content is delivered as an
    attachment whose filename is the object noid plus the name of the
    latest format recorded in the provenance metadata.

    :param request: the current request
    :param pid: pid of the :class:`DiskImage` to download
    :returns: the result of ``raw_datastream`` for the content datastream
    '''
    repo = Repository(request=request)
    diskimg = repo.get_object(pid, type=DiskImage)

    name_parts = (diskimg.noid,
                  diskimg.provenance.content.object.latest_format.name)
    extra_headers = {}
    extra_headers['Content-Disposition'] = \
        "attachment; filename=%s.%s" % name_parts

    return raw_datastream(request, pid, DiskImage.content.id,
                          repo=repo, headers=extra_headers)
Beispiel #14
0
    def archives(format=None):
        '''List the archives configured in ``settings.PID_ALIASES``.

        :param format: pass the ``dict`` type to get a list of dictionaries
            with ``title`` (alias) and ``pid`` keys rather than objects
        :returns: list of dictionaries, or a list of
            :class:`CollectionObject` instances that is built on first use
            and cached on ``FedoraFixtures._archives``
        '''
        if format == dict:
            aliases = settings.PID_ALIASES.iteritems()
            return [dict(title=nick, pid=pid) for nick, pid in aliases]

        if hasattr(FedoraFixtures, '_archives'):
            return FedoraFixtures._archives

        # first call: load each aliased pid as a collection and cache them
        repo = Repository()
        collections = []
        for pid in settings.PID_ALIASES.itervalues():
            collections.append(repo.get_object(pid, type=CollectionObject))
        FedoraFixtures._archives = collections
        return FedoraFixtures._archives
Beispiel #15
0
def tasks(request, pid):
    '''Manage tasks associated with an :class:`~keep.audio.models.AudioObject`.
    Currently, the only supported functionality is to queue access
    copy conversion; this should be done by POSTing the type of task to
    be queued, i.e. **generate access copy**.

    Supported tasks:

        * **generate access copy** - queue access copy conversion for an audio
            item by pid.  Returns a status message as the body of a plain/text response

    Requests using any method other than POST get a 405 response; an
    unsupported task type gets a plain text response with status 500.

    :param request: the current request; the task type is read from the
        ``task`` POST parameter
    :param pid: the pid of the object for which tasks should be queued
    :raises Http404: if the object doesn't exist or isn't an audio item

    '''
    if request.method != 'POST':
        # previously non-POST requests fell through and the view returned
        # None; answer explicitly instead
        response = HttpResponse('Only POST requests are supported',
                                content_type='text/plain', status=405)
        response['Allow'] = 'POST'
        return response

    task_type = request.POST.get('task', None)

    # TODO May want to prevent queuing of more than one at a time or within a time period.
    # TODO For now javascript disables the link until the page is refreshed.

    # currently the only supported task is generating an access copy
    if task_type != 'generate access copy':
        return HttpResponse('Task "%s" is not supported' % task_type,
                            content_type='text/plain',
                            status=500)

    try:
        repo = Repository(request=request)
        obj = repo.get_object(pid, type=AudioObject)

        # if object doesn't exist or isn't an audio item, 404
        if not obj.exists or not obj.has_requisite_content_models:
            raise Http404

        queue_access_copy(obj)
        status = 'Successfully queued access copy conversion'

    except Http404:
        # let the 404 propagate rather than reporting it as a queueing error
        raise
    except Exception as err:
        # any other failure is logged and reported in the response body
        logger.error('Error queueing access copy conversion for %s : %s',
                     pid, err)
        status = 'Error queueing access copy conversion (%s)' % err

    return HttpResponse(status, content_type='text/plain')
Beispiel #16
0
def create_from_findingaid(request):
    '''Find or create a collection based on a submitted
    :class:`FindCollection` form.

    If an existing collection matches the submitted archive and collection
    id, redirect to the view page for the first match.  Otherwise, look up
    the finding aid for that collection, generate and save a new collection
    from it, and redirect to its edit page.  On an invalid form or any
    lookup/save failure, an error message is added and the user is
    redirected to the dashboard.

    :param request: the current request; form data is read from POST
    :returns: a see-other redirect response
    '''
    import logging

    form = FindCollection(request.POST)
    if not form.is_valid():
        messages.error(request, 'Form is not valid; please try again.')
        return HttpResponseSeeOtherRedirect(reverse('repo-admin:dashboard'))

    data = form.cleaned_data
    q = CollectionObject.item_collection_query()
    # submitted value is pid alias; lookup pid for solr query
    archive_id = settings.PID_ALIASES[data['archive']]
    q = q.query(archive_id=archive_id,
                source_id=data['collection'])

    # if collection is found, redirect to collection view with message
    # (count once and reuse it rather than repeating the call)
    found = q.count()
    if found:
        messages.info(request, 'Found %d collection%s for %s %s.' %
                      (found, 's' if found != 1 else '',
                       data['archive'].upper(), data['collection']))
        return HttpResponseSeeOtherRedirect(reverse('collection:view',
            kwargs={'pid': q[0]['pid']}))

    # otherwise, create the new record and redirect to new
    # collection edit page
    repo = Repository(request=request)
    coll_id = data['collection']
    try:
        archive = repo.get_object(archive_id, type=CollectionObject)
        fa = FindingAid.find_by_unitid(unicode(coll_id),
                                       archive.mods.content.title)
        coll = fa.generate_collection()
        coll.collection = archive
        coll.save()
        messages.info(request, 'Added %s for collection %s: %s'
                      % (coll, coll_id, coll.mods.content.title))

        return HttpResponseSeeOtherRedirect(
            reverse('collection:edit', kwargs={'pid': coll.pid}))

    except DoesNotExist:
        messages.error(request, 'No EAD found for %s in %s' %
                       (coll_id, data['archive'].upper()))
    except ReturnedMultiple:
        messages.error(request, 'Multiple EADs found for %s in %s' %
                       (coll_id, data['archive'].upper()))
    except RequestFailed as err:
        # record the failure in the log instead of printing to stdout
        logging.getLogger(__name__).error(
            'Failed to save new collection %s: %s', coll_id, err)
        messages.error(request, 'Failed to save new collection')

    return HttpResponseSeeOtherRedirect(reverse('repo-admin:dashboard'))
Beispiel #17
0
def tasks(request, pid):
    '''Manage tasks associated with an :class:`~keep.audio.models.AudioObject`.
    Currently, the only supported functionality is to queue access
    copy conversion; this should be done by POSTing the type of task to
    be queued, i.e. **generate access copy**.

    Supported tasks:

        * **generate access copy** - queue access copy conversion for an audio
            item by pid.  Returns a status message as the body of a plain/text response

    Requests using any method other than POST get a 405 response; an
    unsupported task type gets a plain text response with status 500.

    :param request: the current request; the task type is read from the
        ``task`` POST parameter
    :param pid: the pid of the object for which tasks should be queued
    :raises Http404: if the object doesn't exist or isn't an audio item

    '''
    if request.method != 'POST':
        # previously non-POST requests fell through and the view returned
        # None; answer explicitly instead
        response = HttpResponse('Only POST requests are supported',
            content_type='text/plain', status=405)
        response['Allow'] = 'POST'
        return response

    task_type = request.POST.get('task', None)

    # TODO May want to prevent queuing of more than one at a time or within a time period.
    # TODO For now javascript disables the link until the page is refreshed.

    # currently the only supported task is generating an access copy
    if task_type != 'generate access copy':
        return HttpResponse('Task "%s" is not supported' % task_type,
            content_type='text/plain', status=500)

    try:
        repo = Repository(request=request)
        obj = repo.get_object(pid, type=AudioObject)

        # if object doesn't exist or isn't an audio item, 404
        if not obj.exists or not obj.has_requisite_content_models:
            raise Http404

        queue_access_copy(obj)
        status = 'Successfully queued access copy conversion'

    except Http404:
        # let the 404 propagate rather than reporting it as a queueing error
        raise
    except Exception as err:
        # any other failure is logged and reported in the response body
        logger.error('Error queueing access copy conversion for %s : %s',
            pid, err)
        status = 'Error queueing access copy conversion (%s)' % err

    return HttpResponse(status, content_type='text/plain')
Beispiel #18
0
    def init_from_file(filename,
                       initial_label=None,
                       request=None,
                       checksum=None,
                       mimetype=None):
        '''Static method to create a new :class:`AudioObject` instance from
        a file.  Sets the object label and metadata title based on the initial
        label specified, or file basename.  Calculates and stores the duration
        based on the file. Also sets the following default metadata values:

            * mods:typeOfResource = "sound recording"
            * dt:codecQuality = "lossless"

        :param filename: full path to the audio file, as a string
        :param initial_label: optional initial label to use; if not specified,
            the base name of the specified file will be used
        :param request: :class:`django.http.HttpRequest` passed into a view method;
            must be passed in order to connect to Fedora as the currently-logged
            in user
        :param checksum: the checksum of the file being sent to fedora.
        :param mimetype: optional mimetype for the audio content datastream;
            only set on the datastream when specified
        :returns: :class:`AudioObject` initialized from the file
        '''
        if initial_label is None:
            initial_label = os.path.basename(filename)
        repo = Repository(request=request)
        obj = repo.get_object(type=AudioObject)
        # set initial object label from the base filename
        obj.label = initial_label
        obj.dc.content.title = obj.mods.content.title = obj.label
        # open in binary mode: the audio content must not go through
        # text-mode newline or encoding translation
        # FIXME: at what point does/should this get closed?
        obj.audio.content = open(filename, 'rb')
        # Set the file checksum, if set.
        obj.audio.checksum = checksum
        # set content datastream mimetype if passed in
        if mimetype is not None:
            obj.audio.mimetype = mimetype
        # Use the label minus a trailing ".wav" (mimetype indicates that);
        # only strip when the suffix is actually present, so labels without
        # the extension are not truncated by four characters
        audio_label = initial_label
        if audio_label.lower().endswith('.wav'):
            audio_label = audio_label[:-4]
        obj.audio.label = audio_label
        # set initial mods:typeOfResource - all AudioObjects default to sound recording
        obj.mods.content.resource_type = 'sound recording'
        # set codec quality to lossless in digital tech metadata
        # - default for AudioObjects, should only accept lossless audio for master file
        obj.digitaltech.content.codec_quality = 'lossless'
        # get wav duration and store in digital tech metadata
        obj.digitaltech.content.duration = '%d' % round(wav_duration(filename))

        return obj
Beispiel #19
0
    def disk_images(self):
        '''Write to stdout the pids of a sample of disk images: for each
        format name, the ten disk images with the smallest content size,
        leaving out everything in the restricted collections.  Progress
        messages go to stderr, depending on verbosity.
        '''
        self.stderr.write('Disk images')
        ### disk images
        # representative sample of aff and ad1
        # DO NOT include anything in these collections:
        # Trethewey (ghsdj), Rushdie (94k9k), Mackey (g1btw),
        # Clifton (94kf4), and Grennan (9k0st)
        restricted = ('trethewey', 'rushdie', 'mackey', 'clifton', 'grennan')

        solr = solr_interface()
        repo = Repository()
        q = solr.query(content_model=DiskImage.DISKIMAGE_CONTENT_MODEL)
        for collection_key in restricted:
            q = q.exclude(collection_id=self.collections[collection_key])
        q = q.field_limit('pid')

        verbose = self.verbosity >= self.v_normal
        if verbose:
            self.stderr.write(
                'Found %d disk images not in restricted collections' %
                q.count())

        # solr and fedora risearch cannot currently filter on format or
        # size, so load each object and group by format name here; the
        # smallest of each group are selected below
        diskimgs_by_type = defaultdict(list)
        for result in q:
            diskimg = repo.get_object(result['pid'], type=DiskImage)
            if diskimg.exists:
                fmt = diskimg.provenance.content.object.format.name
                diskimgs_by_type[fmt].append(diskimg)
            elif verbose:
                self.stderr.write('Referenced disk image %s does not exist or is inaccessible' \
                    % result['pid'])

        for fmt, diskimages in diskimgs_by_type.iteritems():
            if verbose:
                self.stderr.write('Selecting %s disk images' % fmt)
            # order by binary file size and keep the first 10 of each
            # type, so the smallest ones get synced
            smallest = sorted(diskimages,
                              key=lambda img: img.content.size)[:10]
            for d in smallest:
                self.stdout.write(d.pid)
Beispiel #20
0
def view(request, pid):
    '''Display a single :class:`~keep.collection.models.CollectionObject`
    along with a paginated list (30 per page) of the items in that
    collection that the current user is allowed to see.

    :param request: the current request; the optional ``page`` GET
        parameter selects the page of results
    :param pid: pid of the collection to display
    :raises Http404: if the pid doesn't exist or isn't a collection
    '''
    repo = Repository(request=request)
    obj = repo.get_object(pid, type=CollectionObject)
    # 404 unless the pid exists and is a collection
    if not (obj.exists and obj.has_requisite_content_models):
        raise Http404

    # find all items that belong to this collection, in sorted order
    q = obj.solr_items_query()
    for sort_field in ('date_created', 'date_issued', 'title_exact'):
        q = q.sort_by(sort_field)
    # restrict to what the logged-in user may see
    # (includes researcher-accessible content filter when appropriate)
    q = filter_by_perms(q, request.user)

    # a user who can only view researcher-accessible collections has no
    # permission to view this collection when no items were found
    user = request.user
    researcher_only = \
        not user.has_perm('collection.view_collection') and \
        user.has_perm('collection.view_researcher_collection')
    if researcher_only and q.count() == 0:
        return prompt_login_or_403(request)

    # paginate the solr result set
    paginator = Paginator(q, 30)
    requested_page = request.GET.get('page', '1')
    try:
        page = int(requested_page)
    except ValueError:
        # non-numeric page number: use the first page
        page = 1
    try:
        results = paginator.page(page)
    except (EmptyPage, InvalidPage):
        # out of range: use the last page
        results = paginator.page(paginator.num_pages)

    # url parameters for pagination links: everything except the page
    url_params = request.GET.copy()
    url_params.pop('page', None)

    context = {
        'collection': obj,
        'items': results,
        'url_params': urlencode(url_params),
    }
    return TemplateResponse(request, 'collection/view.html', context)
Beispiel #21
0
    def find_by_field(field, value, repo=None):
        '''
        Static method to look up exactly one :class:`EmailMessage` by an
        indexed value.  Searches Solr for records with the arrangement
        content model and the requested field and value; when there is a
        single match, returns an :class:`EmailMessage` initialized from
        the repository with the matching pid.

        Raises :class:`django.core.exceptions.ObjectDoesNotExist` if no
        matches are found in the Solr index; raises
        :class:`django.core.exceptions.MultipleObjectsReturned`
        if more than one match is found.

        :param field: solr field to search
        :param value: value to search on in the specified field

        :param repo: optional :class:`eulfedora.server.Repository`
            to use an existing connection with specific credentials;
            a new one is created when not specified

        :returns: :class:`EmailMessage`
        '''
        solr = solr_interface()
        q = solr.query(**{
            field: value,
            'content_model': ArrangementObject.ARRANGEMENT_CONTENT_MODEL
        }).field_limit('pid')

        # there must be one and only one match; the custom django
        # exceptions are borrowed for not found / too many matches
        found = len(q)
        if not found:
            raise ObjectDoesNotExist('No record found with %s %s' %
                                     (field, value))
        if found > 1:
            raise MultipleObjectsReturned('Found %d records with %s %s' % \
                                          (found, field, value))

        repo = Repository() if repo is None else repo
        return repo.get_object(q[0]['pid'], type=EmailMessage)
Beispiel #22
0
    def find_by_field(field, value, repo=None):
        '''
        Static method that finds one :class:`EmailMessage` by an indexed
        value.  The item is looked up in Solr (limited to records with
        the arrangement content model); if a single record matches the
        requested field and value, an :class:`EmailMessage` instance for
        that pid is initialized from the repository and returned.

        Raises :class:`django.core.exceptions.MultipleObjectsReturned`
        when several records match, and
        :class:`django.core.exceptions.ObjectDoesNotExist` when nothing
        in the Solr index matches.

        :param field: solr field to search
        :param value: value to search on in the specified field

        :param repo: optional :class:`eulfedora.server.Repository`
            to use an existing connection with specific credentials

        :returns: :class:`EmailMessage`
        '''
        solr = solr_interface()
        search_terms = {field: value}
        search_terms['content_model'] = \
            ArrangementObject.ARRANGEMENT_CONTENT_MODEL
        q = solr.query(**search_terms).field_limit('pid')

        # anything other than exactly one match is an error; django's
        # exceptions are reused for too many matches / not found
        match_count = len(q)
        if match_count > 1:
            raise MultipleObjectsReturned('Found %d records with %s %s' % \
                                          (match_count, field, value))
        elif not match_count:
            raise ObjectDoesNotExist('No record found with %s %s' % (field, value))

        # fall back to a new repository connection when none was passed in
        if repo is None:
            repo = Repository()

        pid = q[0]['pid']
        return repo.get_object(pid, type=EmailMessage)
    def disk_images(self):
        '''Select a sample of disk images and write their pids to stdout.

        Disk images outside the restricted collections are grouped by
        format name; the ten with the smallest content size in each group
        are output.  Status messages are written to stderr when verbosity
        is at least normal.
        '''
        self.stderr.write('Disk images')
        ### disk images
        # representative sample of aff and ad1
        # DO NOT include anything in these collections:
        # Trethewey (ghsdj), Rushdie (94k9k), Mackey (g1btw),
        # Clifton (94kf4), and Grennan (9k0st)

        solr = solr_interface()
        repo = Repository()
        q = solr.query(content_model=DiskImage.DISKIMAGE_CONTENT_MODEL)
        for name in ('trethewey', 'rushdie', 'mackey', 'clifton', 'grennan'):
            q = q.exclude(collection_id=self.collections[name])
        q = q.field_limit('pid')
        if self.verbosity >= self.v_normal:
            self.stderr.write('Found %d disk images not in restricted collections' % q.count())

        # neither solr nor fedora risearch can currently filter on format
        # or size, so each object is loaded and grouped by type here, then
        # sorted by size below to pick the smallest ones
        diskimgs_by_type = defaultdict(list)
        for result in q:
            pid = result['pid']
            diskimg = repo.get_object(pid, type=DiskImage)
            if diskimg.exists:
                fmt = diskimg.provenance.content.object.format.name
                diskimgs_by_type[fmt].append(diskimg)
                continue

            if self.verbosity >= self.v_normal:
                self.stderr.write('Referenced disk image %s does not exist or is inaccessible' \
                    % result['pid'])

        for fmt, diskimages in diskimgs_by_type.iteritems():
            if self.verbosity >= self.v_normal:
                self.stderr.write('Selecting %s disk images' % fmt)
            # order by binary file size so the smallest ones are synced
            diskimages.sort(key=lambda img: img.content.size)
            # output the first 10 of each type
            for d in diskimages[:10]:
                self.stdout.write(d.pid)
Beispiel #24
0
def playlist(request, pid):
    '''Return a JSON playlist of the audio items in a collection that
    have access copies and are visible to the current user.

    Each playlist entry has a ``title``, a ``free`` flag (always False)
    and a download url keyed by audio type (``m4a`` or ``mp3``).

    :param request: the current request
    :param pid: pid of the collection
    :raises Http404: if the pid doesn't exist or isn't a collection
    '''
    # FIXME: this needs last-modified so browser can cache!!!

    # NOTE: preliminary logic duplicated from view above
    repo = Repository(request=request)
    obj = repo.get_object(pid, type=CollectionObject)
    # 404 unless the pid exists and is a collection
    if not (obj.exists and obj.has_requisite_content_models):
        raise Http404

    # find all items that belong to this collection, in sorted order
    q = obj.solr_items_query()
    for sort_field in ('date_created', 'date_issued', 'title_exact'):
        q = q.sort_by(sort_field)
    # restrict to what the logged-in user may see
    # (includes researcher-accessible content filter when appropriate)
    q = filter_by_perms(q, request.user)

    # a user who can only view researcher-accessible collections has no
    # permission to view this collection when no items were found
    user = request.user
    researcher_only = \
        not user.has_perm('collection.view_collection') and \
        user.has_perm('collection.view_researcher_collection')
    if researcher_only and q.count() == 0:
        return prompt_login_or_403(request)

    playlist = []
    for result in q:
        # only audio items that have access copies are listed
        if result['object_type'] == 'audio' and result['has_access_copy']:
            entry = {
                'title': result['title'],
                'free': False  # explicitly mark as not downloadable
            }
            is_mp4 = result['access_copy_mimetype'] == 'audio/mp4'
            audio_type = 'm4a' if is_mp4 else 'mp3'
            url_args = {'pid': result['pid'], 'extension': audio_type}
            entry[audio_type] = reverse('audio:download-compressed-audio',
                                        kwargs=url_args)
            playlist.append(entry)

    return HttpResponse(json.dumps(playlist), content_type='application/json')
Beispiel #25
0
def simple_edit(request, pid=None):
    ''' Edit an existing Fedora
    :class:`~keep.collection.models.SimpleCollection`.  If a pid is
    specified, attempts to retrieve an existing object.

    On POST with a valid form, compares the submitted status with the
    current restrictions-on-access text of the object; if it differs,
    a batch status update is queued, otherwise the user is told the
    status is unchanged.  On any other method the form is initialized
    with the current status.

    :param request: current :class:`~django.http.HttpRequest`
    :param pid: pid of the SimpleCollection to edit
    :raises Http404: when Fedora reports a 404 for the object

    NOTE(review): no response is returned on the code paths visible
    here -- confirm the rendering/redirect step for this view.
    '''
    repo = Repository(request=request)

    try:
        obj = repo.get_object(pid=pid, type=SimpleCollection)

        if request.method == 'POST':
            form = SimpleCollectionEditForm(request.POST)
            if form.is_valid():
                status = form.cleaned_data['status']

                if status == obj.mods.content.restrictions_on_access.text:
                    # don't queue job if there is no change
                    messages.info(request, 'Status is unchanged')

                else:
                    # queue celery task to update items in this batch
                    queue_batch_status_update(obj, status)
                    # NOTE: % binds tighter than +, so the url is
                    # interpolated into the second string only
                    messages.info(
                        request,
                        'Batch status update has been queued; ' +
                        'please check later via <a href="%s">recent tasks</a> page' %
                        reverse('tasks:recent')
                    )

        else:
            # Just display the form
            form = SimpleCollectionEditForm(initial={'status': obj.mods.content.restrictions_on_access.text})

    except RequestFailed as e:
        # if there was a 404 accessing objects, raise http404
        # NOTE: this probably doesn't distinguish between object exists with
        # no MODS and object does not exist at all
        if e.code == 404:
            raise Http404
        # otherwise, re-raise and handle as a common fedora connection error
        else:
            raise
Beispiel #26
0
    def by_arrangement_id(id, repo=None):
        '''
        Static method to look up an :class:`ArrangementObject` by its
        local or arrangement id.  The id is searched for in Solr
        (restricted to the arrangement content model); when exactly
        one record matches, the corresponding object is initialized
        from the repository and returned.

        Borrows the Django exceptions for the failure cases: raises
        :class:`django.core.exceptions.ObjectDoesNotExist` if the
        Solr index has no match, and
        :class:`django.core.exceptions.MultipleObjectsReturned` if it
        has more than one.

        :param id: arrangement id or local id

        :param repo: optional :class:`eulfedora.server.Repository`
            to use an existing connection with specific credentials

        :returns: :class:`ArrangementObject`
        '''
        # only the pid is needed from the index
        matches = solr_interface().query(
            arrangement_id=id,
            content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL
        ).field_limit('pid')

        # require one and only one match
        match_count = len(matches)
        if not match_count:
            raise ObjectDoesNotExist(
                'No record found with arrangement id %s' % id)
        if match_count > 1:
            raise MultipleObjectsReturned(
                'Found %d records with arrangement id %s' % (match_count, id))

        # fall back to a default connection when none was supplied
        repository = Repository() if repo is None else repo
        return repository.get_object(matches[0]['pid'], type=ArrangementObject)
Beispiel #27
0
    def by_arrangement_id(id, repo=None):
        '''
        Static method to look up an :class:`ArrangementObject` by its
        local or arrangement id.  The id is searched for in Solr
        (restricted to the arrangement content model); when exactly
        one record matches, the corresponding object is initialized
        from the repository and returned.

        Borrows the Django exceptions for the failure cases: raises
        :class:`django.core.exceptions.ObjectDoesNotExist` if the
        Solr index has no match, and
        :class:`django.core.exceptions.MultipleObjectsReturned` if it
        has more than one.

        :param id: arrangement id or local id

        :param repo: optional :class:`eulfedora.server.Repository`
            to use an existing connection with specific credentials

        :returns: :class:`ArrangementObject`
        '''
        # only the pid is needed from the index
        matches = solr_interface().query(
            arrangement_id=id,
            content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL
        ).field_limit('pid')

        # require one and only one match
        match_count = len(matches)
        if not match_count:
            raise ObjectDoesNotExist(
                'No record found with arrangement id %s' % id)
        if match_count > 1:
            raise MultipleObjectsReturned(
                'Found %d records with arrangement id %s' % (match_count, id))

        # fall back to a default connection when none was supplied
        repository = Repository() if repo is None else repo
        return repository.get_object(matches[0]['pid'], type=ArrangementObject)
Beispiel #28
0
def view(request, pid):
    '''View a single :class:`~keep.audio.models.AudioObject`.
    User must either have general view audio permissions, or if they have
    view researcher audio, the object must be researcher accessible
    (based on rights codes).

    :param request: current :class:`~django.http.HttpRequest`
    :param pid: pid of the audio object to display
    :returns: rendered ``audio/view.html``, or the result of
        ``prompt_login_or_403`` when the user lacks permission
    :raises Http404: when the object does not have the required
        content models, or checking them fails
    '''
    repo = Repository(request=request)
    obj = repo.get_object(pid, type=AudioObject)
    # user either needs view audio permissions OR
    # if they can view researcher audio and object must be researcher-accessible
    if not request.user.has_perm('audio.view_audio') and \
       not (request.user.has_perm('audio.view_researcher_audio') and
            bool(obj.researcher_access)):
        return prompt_login_or_403(request)

    # any error while checking the content models is reported as not
    # found; catch Exception rather than using a bare except so that
    # KeyboardInterrupt / SystemExit are not swallowed
    try:
        is_audio = obj.has_requisite_content_models
    except Exception:
        raise Http404
    if not is_audio:
        raise Http404

    return TemplateResponse(request, 'audio/view.html', {'resource': obj})
Beispiel #29
0
def view(request, pid):
    '''View a single :class:`~keep.audio.models.AudioObject`.
    User must either have general view audio permissions, or if they have
    view researcher audio, the object must be researcher accessible
    (based on rights codes).

    :param request: current :class:`~django.http.HttpRequest`
    :param pid: pid of the audio object to display
    :returns: rendered ``audio/view.html``, or the result of
        ``prompt_login_or_403`` when the user lacks permission
    :raises Http404: when the object does not have the required
        content models, or checking them fails
    '''
    repo = Repository(request=request)
    obj = repo.get_object(pid, type=AudioObject)
    # user either needs view audio permissions OR
    # if they can view researcher audio and object must be researcher-accessible
    if not request.user.has_perm('audio.view_audio') and \
       not (request.user.has_perm('audio.view_researcher_audio') and
            bool(obj.researcher_access)):
        return prompt_login_or_403(request)

    # any error while checking the content models is reported as not
    # found; catch Exception rather than using a bare except so that
    # KeyboardInterrupt / SystemExit are not swallowed
    try:
        is_audio = obj.has_requisite_content_models
    except Exception:
        raise Http404
    if not is_audio:
        raise Http404

    return TemplateResponse(request, 'audio/view.html', {'resource': obj})
Beispiel #30
0
def largefile_ingest(request):
    '''Large-file ingest.  On GET, displays a form allowing user to
    select a BagIt that has been uploaded to the configured large-file
    ingest staging area for ingest and association with a collection.

    On POST with an invalid form, the bound form is redisplayed with
    its errors.  On POST with a valid form, the selected bag is
    ingested as a disk image or video (chosen from the bag's parent
    directory name), the outcome is reported through the messages
    framework, and the user is redirected to ``/admin``.

    :param request: current :class:`~django.http.HttpRequest`
    :returns: a template response for GET / invalid POST, or a
        redirect after a valid POST has been processed
    '''
    # ingest content from upload staging area

    context = {}
    template_name = 'file/largefile_ingest.html'
    form = None

    # on POST, process the form and ingest if valid
    if request.method == 'POST':
        form = LargeFileIngestForm(request.POST)

        # if form is not valid, add to context for redisplay with errors
        if not form.is_valid():
            context['form'] = form

        # otherwise, process the form
        else:
            repo = Repository(request=request)

            # Get collection & check for optional comment
            collection = repo.get_object(pid=form.cleaned_data['collection'],
                                         type=CollectionObject)
            # get user comment if any; default to a generic ingest comment
            comment = form.cleaned_data['comment'] or 'initial repository ingest'
            bag = form.cleaned_data['bag']

            # create dict with file info to add success/failure info
            file_info = {'label': os.path.basename(bag)}

            # assuming type of ingest from subdirectory: the second-to-last
            # component of the bag path
            # NOTE: this local shadows the builtin ``type`` for the rest
            # of the function
            type = bag.split('/')[-2]
            try:

                # NOTE(review): if type is neither 'diskimage' nor 'video',
                # obj is never assigned and the next use of obj fails; that
                # error ends up in the generic ``except Exception`` below
                if type == 'diskimage':
                    obj = DiskImage.init_from_bagit(bag, request)

                elif type == 'video':
                    obj = Video.init_from_bagit(bag, request)

                # set collection on ingest
                obj.collection = collection

                ## NOTE: Due to a bug in Fedora 3.4 with checksums and
                ## and file uri ingest, the content datastream checksum
                ## must be cleared before ingest; manually check it
                ## after ingest to confirm Fedora calculated what we expect.
                ## This work-around can be removed once we upgrade to Fedora 3.6

                # store datastream checksum that would be sent to fedora
                checksum = obj.content.checksum
                obj._content_checksum = checksum
                # clear it out so Fedora can ingest without erroring
                obj.content.checksum = None

                # file URIs also used for supplemental files; needs
                # to be handled the same way as content datastream
                # - look for any supplementN datastreams, store checksum, and remove
                supplemental_checksums = {}
                # checks supplement0 .. supplement19 at most
                for i in range(20):
                    try:
                        dsid = 'supplement%d' % i
                        dsobj = getattr(obj, dsid)
                        supplemental_checksums[dsid] = dsobj.checksum
                        dsobj.checksum = None
                    except AttributeError:
                        # stop iterating - we have found last supplemental file
                        break

                # same for access copy checksum on Video files
                if type == 'video':
                    access_checksum = obj.access_copy.checksum
                    obj.access_copy.checksum = None

                # look for objects of the same kind that already use this
                # label; any hit is treated as duplicate content
                pids_exists = []
                if type == 'video':
                    pids_exists = repo.find_objects(type=Video, label=obj.label)

                if type == 'diskimage':
                    pids_exists = repo.find_objects(type=DiskImage, label=obj.label)

                # count the matches that actually have a pid
                exists = 0
                for pid in pids_exists:
                    if pid.pid:
                        exists += 1

                if exists == 0:
                    obj.save(comment)
                else:
                    # NOTE(review): this is a plain ValueError, so presumably
                    # it is handled by the generic ``except Exception`` below
                    # rather than the DuplicateContent handler -- confirm
                    raise ValueError('Duplicate content detected.')


                # remove the ingested bag from large-file staging area
                shutil.rmtree(bag)

                # re-init to allow checking fedora-calculated checksums on
                # supplemental datastreams
                if type == 'diskimage':
                    obj = repo.get_object(obj.pid, type=DiskImage)
                elif type == 'video':
                    obj = repo.get_object(obj.pid, type=Video)

                # if save succeded (no exceptions), set summary info for display
                file_info.update({'type' : type, 'success': True,
                                  'pid': obj.pid, 'url': obj.get_absolute_url(),
                                  'checksum': obj.content.checksum})
                if type == 'video':
                    file_info['access_checksum'] = obj.access_copy.checksum

                # compare checksum generated by Fedora
                # (required because of file uri bug in fedora 3.4;
                #  this can be removed once we upgrade to fedora 3.6+)
                checksum_errors = []

                if obj.content.checksum != checksum:
                    checksum_errors.append('content')

                # NOTE: this loop rebinds ``checksum``; the content
                # comparison above has already been made at this point
                for dsid, checksum in supplemental_checksums.iteritems():
                    dsobj = obj.getDatastreamObject(dsid)
                    if dsobj.checksum != checksum:
                        checksum_errors.append(dsid)

                if type == 'video' and obj.access_copy.checksum != access_checksum:
                    checksum_errors.append('access_copy')

                if checksum_errors:
                    # NOTE: the trailing '' after the second literal is an
                    # empty string joined by implicit concatenation
                    message = 'Checksum mismatch%s detected on ' + \
                       '%s datastream%s; please contact a repository administrator.'''
                    # pluralize the message when more than one datastream
                    # failed the comparison
                    file_info['message'] = message % (
                        'es' if len(checksum_errors) > 1 else '',
                        ', '.join(checksum_errors),
                        's' if len(checksum_errors) > 1 else ''
                    )

            except bagit.BagValidationError as err:
                logger.error(err)
                file_info.update({'success': False, 'message': 'BagIt error: %s' % err})

            # special case: detected as duplicate content
            except DuplicateContent as e:
                # mark as failed and generate message with links to records
                # NOTE: pid url is duplicated logic from web upload view...
                links = []
                for pid in e.pids:
                    # use fedora type-inferring logic with list of content models
                    # pulled from solr results
                    obj = repo.get_object(pid,
                        type=repo.best_subtype_for_object(pid, e.pid_cmodels[pid]))
                    # use appropriate object class to get the object url
                    links.append('<a href="%s">%s</a>' % (
                        obj.get_absolute_url(), pid)
                    )
                msg = mark_safe('%s: %s' % (unicode(e), '; '.join(links)))
                file_info.update({
                    'success': False,
                    'message': msg
                })

            # anything else: log it and report the error text to the user
            except Exception as err:
                logger.error('Error: %s' % err)
                file_info.update({'success': False, 'message': '%s' % err})

            # report success/failure in the same format as web-upload ingest
            context['ingest_results'] = [file_info]
            # NOTE(review): a success-level message is added whether or not
            # file_info['success'] is True, and context is not used by the
            # redirect below -- confirm this is intended
            messages.success(request, 'Ingest results: %s' % file_info)
            return HttpResponseRedirect("/admin")

    # on GET display form to select item(s) for ingest
    # OR on completed valid form post
    # NOTE: a valid POST has already returned the redirect above, so only
    # GET satisfies this condition in the code as written
    files = large_file_uploads()
    if request.method == 'GET' or \
      form is not None and form.is_valid():
        if len(files):
            context['form'] = LargeFileIngestForm()
        else:
            # indicator that no files are available for ingest
            context['no_files'] = True

    return TemplateResponse(request, template_name, context)
Beispiel #31
0
    def generate_collection(self):
        '''Generate a :class:`CollectionObject` with fields pre-populated
        based on the contents of the current Finding Aid object.

        Populates the MODS title, main entry name (from the EAD
        origination, if any), date coverage, source id, access and use
        restrictions, and resource type.  The object is initialized
        via the repository but is not saved here.

        :returns: new, unsaved :class:`CollectionObject`
        '''
        repo = Repository()
        coll = repo.get_object(type=CollectionObject)
        # TODO: archive membership?

        # title - using 'short' form without unitdate, stripping any trailing whitespace & . or ,
        # TODO/FIXME: does NOT work for unittitles with nested tags, e.g. title - see pomerantz
        coll.mods.content.title = unicode(self.unittitle.short).rstrip().rstrip('.,')
        # main entry/name - origination, if any
        if self.archdesc.did.origination:
            name_text = unicode(self.archdesc.did.origination)
            # determine type of name
            colltype = self.archdesc.did.node.xpath('''local-name(e:origination/e:persname |
                e:origination/e:corpname  | e:origination/e:famname)''',
                namespaces=self.ROOT_NAMESPACES)
            # default to no type, so an origination without a recognized
            # name element does not leave name_type unbound
            name_type = None
            if colltype == 'persname':
                name_type = 'personal'
            elif colltype == 'famname':
                name_type = 'family'
                # family names consistently end with a period, which can be removed
                name_text = name_text.rstrip('.')
            elif colltype == 'corpname':
                name_type = 'corporate'

            # the name element is populated below regardless of type, so
            # always create it; only set the type when it could be determined
            coll.mods.content.create_name()
            if name_type is not None:
                coll.mods.content.name.type = name_type

            authority = self.archdesc.did.node.xpath('string(e:origination/*/@source)',
                namespaces=self.ROOT_NAMESPACES)
            # lcnaf in the EAD is equivalent to naf in MODS
            if authority == 'lcnaf':
                coll.mods.content.name.authority = 'naf'

            coll.mods.content.name.name_parts.append(mods.NamePart(text=name_text))

        # date coverage
        if self.coverage:
            date_encoding = {'encoding': 'w3cdtf'}
            coll.mods.content.create_origin_info()
            # date range: start/end, with the start flagged as key date
            if '/' in self.coverage:
                start, end = self.coverage.split('/')
                coll.mods.content.origin_info.created.append(mods.DateCreated(date=start,
                    point='start', key_date=True, **date_encoding))
                coll.mods.content.origin_info.created.append(mods.DateCreated(date=end,
                    point='end', **date_encoding))
            # single date
            else:
                coll.mods.content.origin_info.created.append(mods.DateCreated(date=self.coverage,
                    key_date=True, **date_encoding))

        # source id - numeric form of the manuscript/archive collection number
        coll.mods.content.source_id = self.archdesc.did.unitid.identifier

        # access restriction - one line per content node
        if self.archdesc.access_restriction:
            coll.mods.content.create_restrictions_on_access()
            coll.mods.content.restrictions_on_access.text = "\n".join([
                    unicode(c) for c in self.archdesc.access_restriction.content])

        # use & reproduction - one line per content node
        if self.archdesc.use_restriction:
            coll.mods.content.create_use_and_reproduction()
            coll.mods.content.use_and_reproduction.text = "\n".join([
                    unicode(c) for c in self.archdesc.use_restriction.content])

        # set initial mods:typeOfResource - not specified in EAD, but all
        # collections should be mixed material
        coll.mods.content.resource_type = 'mixed material'

        # EAD url - where does this go?
        # accessible at self.eadid.url

        return coll
Beispiel #32
0
def ingest_files(files, collection, comment, request):
    '''Ingest a dictionary of files as returned by
    :meth:`keep.files.forms.UploadForm.files_to_ingest`.
    Returns a list of dictionaries reporting per-file ingest success
    or failure.

    Each result has a ``label`` and ``success`` flag; successful
    ingests add ``pid``, ``url`` and ``checksum``, failures add a
    ``message``.

    :param files: dictionary of files to be ingested (file path -> label)
    :param collection: :class:`~keep.collection.models.CollectionObject` that
        newly ingested objects should be associated with
    :param comment: save message for fedora ingest
    :param request: :class:`~django.http.HttpRequest`, to access Fedora and
        ingest new objects as the logged-in user.
    :raises Exception: if an uploaded file can no longer be read when
        its mimetype is being determined
    '''

    # NOTE: using this structure for easy of display in django templates (e.g., regroup)
    results = []

    mime_detector = magic.Magic(mime=True)
    for filename, label in files.iteritems():

        file_info = {'label': label}

        # check if file is an allowed type

        # NOTE: for single-file upload, browser-set type is
        # available as UploadedFile.content_type - but since
        # browser mimetypes are unreliable, calculate anyway
        try:
            detected = mime_detector.from_file(filename)
        except IOError:
            raise Exception('Uploaded file is no longer available for ingest; please try again.')

        # drop any parameters after the mimetype itself (e.g. charset)
        mimetype = detected.partition(';')[0]
        if mimetype not in allowed_upload_types(request.user):
            # store error for display on detailed result page
            file_info.update({'success': False,
                              'message': '''File type '%s' is not allowed''' % mimetype})
            # if not an allowed type, no further processing
            results.append(file_info)
            continue

        if collection is None:
            file_info.update({'success': False,
                              'message': '''Collection not selected'''})
            results.append(file_info)
            continue

        # if there is an MD5 file (i.e., file was uploaded via ajax),
        # use the contents of that file as checksum
        if os.path.exists(filename + '.md5'):
            with open(filename + '.md5') as md5file:
                md5 = md5file.read()
        # otherwise, calculate the MD5 (single-file upload)
        else:
            md5 = md5sum(filename)

        # determine what type of object to initialize based on mimetype
        objtype = next((t for t in uploadable_objects
                        if mimetype in t.allowed_mimetypes), None)
        if objtype is None:
            # allowed type without a matching object class: report it as
            # a per-file failure instead of crashing the whole batch
            file_info.update({'success': False,
                              'message': '''File type '%s' is not supported for ingest''' % mimetype})
            results.append(file_info)
            continue

        # initialize a new object from the file
        obj = objtype.init_from_file(filename, initial_label=label,
                                     request=request, checksum=md5,
                                     mimetype=mimetype)

        # set collection on ingest
        obj.collection = collection

        try:
            # NOTE: by sending a log message, we force Fedora to store an
            # audit trail entry for object creation, which doesn't happen otherwise
            obj.save(comment)
            file_info.update({'success': True, 'pid': obj.pid,
                              'url': obj.get_absolute_url(),
                              'checksum': md5})

            # if audio, needs an additional step:
            if objtype == AudioObject:
                # Start asynchronous task to convert audio for access
                # NOTE: not passing in user-upload file so that
                # celery can more easily be run on a separate server
                queue_access_copy(obj)
                # remove the file now that we have sucessfully ingested
                os.remove(filename)

            # NOTE: could remove MD5 file (if any) here, but MD5 files
            # should be small and will get cleaned up by the cron script

        # special case: detected as duplicate content
        except DuplicateContent as e:
            # mark as failed and generate message with links to records
            links = []
            repo = Repository(request=request)
            for pid in e.pids:
                # use fedora type-inferring logic with list of content models
                # pulled from solr results
                obj = repo.get_object(pid,
                    type=repo.best_subtype_for_object(pid, e.pid_cmodels[pid]))
                # use appropriate object class to get the object url
                links.append('<a href="%s">%s</a>' % (
                    obj.get_absolute_url(), pid)
                )

            msg = mark_safe('%s: %s' % (unicode(e), '; '.join(links)))
            file_info.update({
                'success': False,
                'message': msg
            })

        except Exception as e:
            logger.error('Error ingesting %s: %s' % (filename, e))
            logger.debug("Error details:\n" + traceback.format_exc())
            file_info['success'] = False

            # check for Fedora-specific errors
            if isinstance(e, RequestFailed):
                if 'Checksum Mismatch' in e.detail:
                    file_info['message'] = 'Ingest failed due to a checksum mismatch - ' + \
                        'file may have been corrupted or incompletely uploaded to Fedora'
                else:
                    file_info['message'] = 'Fedora error: ' + unicode(e)

            # non-fedora error
            else:
                file_info['message'] = 'Ingest failed: ' + unicode(e)

        finally:
            # no matter what happened, store results for reporting to user
            results.append(file_info)

    return results
Beispiel #33
0
def manage_supplements(request, pid):
    '''Manage supplemental file datastreams associated with a
    :class:`~keep.file.models.DiskImage`.

    On POST, processes the submitted formset: labels are updated,
    uploaded files replace or add ``supplementN`` datastream content,
    and the object is saved; on success the user is redirected to the
    edit page.  On any other request method, or when the POST is
    invalid or saving fails, the formset is (re)displayed.

    :param request: current :class:`~django.http.HttpRequest`
    :param pid: pid of the disk image
    :raises Http404: if the object does not exist or is not a disk image
    '''
    repo = Repository(request=request)
    obj = repo.get_object(pid, type=DiskImage)
    if not obj.exists or not obj.has_requisite_content_models:
        raise Http404

    # generate initial data from any existing supplemental datastreams
    initial_data = []
    for s in obj.supplemental_content:
        initial_data.append({
            'dsid': s.id,
            'label': s.label,
            'file': DatastreamFile(obj.pid, s.id, s.label)
        })

    # on post, process the form and any updates/additions
    if request.method == 'POST':
        formset = SupplementalFileFormSet(request.POST,
                                          request.FILES,
                                          initial=initial_data)

        if formset.is_valid():
            m = magic.Magic(mime=True)

            # NOTE: because we currently don't support re-ordering
            # or deletion, simply counting to keep track of datastream ids
            s_id = 0
            modified = 0
            added = 0
            for file_info in formset.cleaned_data:
                # skip empty formset
                if not file_info:
                    continue

                if file_info.get('dsid', None):
                    ds = obj.getDatastreamObject(
                        file_info['dsid'], dsobj_type=FileDatastreamObject)
                else:
                    # no datastream id yet: this is a new supplemental file
                    added += 1
                    ds = obj.getDatastreamObject(
                        'supplement%d' % s_id, dsobj_type=FileDatastreamObject)

                # only set if changed so datastream isModified is accurate
                if file_info['label'] != ds.label:
                    ds.label = file_info['label']

                # if this is an uploaded file, replace content and calculate mimetype, checksum
                if isinstance(file_info['file'], UploadedFile):

                    filename = file_info['file'].temporary_file_path()
                    mimetype = m.from_file(filename)
                    # drop any parameters after the mimetype itself
                    mimetype, separator, options = mimetype.partition(';')
                    ds.mimetype = mimetype
                    ds.checksum = md5sum(filename)
                    ds.content = file_info['file']

                # only pre-existing datastreams count as updates
                if ds.exists and ds.isModified():
                    modified += 1

                s_id += 1

            try:
                obj.save('updating supplemental files')

                # summarize number of changes, if any
                if added or modified:
                    msg_add = 'added %d' % added if added else ''
                    msg_update = 'updated %d' % modified if modified else ''
                    msg = 'Successfully %s%s%s supplemental file%s' %  \
                        (msg_add, ' and ' if added and modified else '', msg_update,
                        's' if (added + modified) != 1 else '')
                    messages.success(request, msg)
                else:
                    # possible for the form to be valid but not make any changes
                    messages.info(request,
                                  'No changes made to supplemental content')

                return HttpResponseSeeOtherRedirect(
                    reverse('file:edit', args=[pid]))

            except Exception as e:
                logger.error('Error on supplemental file update: %s' % e)
                logger.debug("Error details:\n" + traceback.format_exc())

                messages.error(request, unicode(e))
                # for now, just redisplay the form with error message

    # on get (or any other non-POST method, e.g. HEAD), just display
    # the form; this keeps formset bound for the render below
    else:
        formset = SupplementalFileFormSet(initial=initial_data)

    return TemplateResponse(request, 'file/supplemental_content.html', {
        'obj': obj,
        'formset': formset
    })
Beispiel #34
0
def upload(request):
    '''Upload file(s) and ingest them into the repository as new objects.
    The accepted file types are those returned by
    ``allowed_upload_types`` for the current user.

    A POST is handled in one of two ways, depending on its content
    type.  A ``multipart/form-data`` POST is a regular form
    submission: the form is validated, and any files it lists are
    ingested into the selected collection.  Any other POST is handed
    to ``ajax_upload`` as an ajax file upload.

    On GET, or after a form POST has been processed, the upload
    template is rendered with a fresh form (plus per-file ingest
    results, if any).
    '''
    repo = Repository(request=request)

    # list of allowed file types, in a format suited for passing to javascript
    allowed_types_json = json.dumps(allowed_upload_types(request.user))
    ctx_dict = {'js_allowed_types': mark_safe(allowed_types_json)}

    if request.method == 'POST':
        content_type = request.META.get('CONTENT_TYPE',
                                        'application/octet-stream')
        # content type is technically case-insensitive; lower-case the
        # media type (without parameters) before comparing
        media_type = content_type.partition(';')[0].strip().lower()

        # POST but not form data - handle ajax file upload
        if media_type != 'multipart/form-data':
            return ajax_upload(request)

        # form has been posted: check for a single file upload
        form = UploadForm(request.POST, request.FILES)

        # If form is not valid (i.e., no collection specified, no
        # or mismatched files uploaded), bail out and redisplay
        # form with any error messages.
        if not form.is_valid():
            ctx_dict['form'] = form
            return TemplateResponse(request, 'file/upload.html', ctx_dict)

        # Form is valid. Get collection & check for optional comment
        cleaned = form.cleaned_data
        collection = repo.get_object(pid=cleaned['collection'],
                                     type=CollectionObject)
        # use the user comment if any; default to a generic ingest comment
        comment = cleaned['comment'] or 'initial repository ingest'
        # dictionary of file path -> filename, based on form data
        files_to_ingest = form.files_to_ingest()

        # process all files submitted for ingest (single or batch mode)
        # and add per-file ingest result status to template context
        if files_to_ingest:
            ctx_dict['ingest_results'] = ingest_files(
                files_to_ingest, collection, comment, request)
        # after processing files, fall through to display upload template

    # on GET or non-ajax POST, display the upload form
    ctx_dict['form'] = UploadForm()

    return TemplateResponse(request, 'file/upload.html', ctx_dict)
Beispiel #35
0
class EmailMessageTest(KeepTestCase):
    '''Tests for :class:`EmailMessage`: header access, generated labels,
    index data, and the Solr-backed ``by_checksum`` / ``by_message_id``
    lookups (with the Solr interface patched out).'''

    def setUp(self):
        self.repo = Repository()
        # pids registered here are purged from the repository in tearDown
        self.pids = []

        # test EmailMessage (not saved in setUp) with from/to/subject set
        self.email = self.repo.get_object(type=EmailMessage)
        self.email.cerp.content.from_list = ['*****@*****.**']
        self.email.cerp.content.to_list = ['*****@*****.**']
        self.email.cerp.content.subject_list = ['Interesting Subject']

    def tearDown(self):
        for pid in self.pids:
            self.repo.purge_object(pid)

    def test_headers(self):
        # headers added to the CERP content should be retrievable by name
        h1 = cerp.Header()
        h1.name = "HEADER 1"
        h1.value = "value for header 1"
        h2 = cerp.Header()
        h2.name = "HEADER 2"
        h2.value = "value for header 2"
        self.email.cerp.content.headers.append(h1)
        self.email.cerp.content.headers.append(h2)
        self.assertEqual(self.email.headers['HEADER 1'], 'value for header 1')
        self.assertEqual(self.email.headers['HEADER 2'], 'value for header 2')

    def test_email_label(self):
        # no object label and one person in to field
        label = self.email.email_label()
        self.assertEqual(
            'Email from [email protected] to [email protected] Interesting Subject',
            label, 'Should construct label when it does not exist')

        # more than one person in to list
        self.email.cerp.content.to_list.append('*****@*****.**')
        label = self.email.email_label()
        self.assertEqual(
            'Email from [email protected] to [email protected] et al. Interesting Subject',
            label,
            'only show first to email address when there are more than one')

        # no subject
        self.email.cerp.content.subject_list = []
        self.assertEqual(
            'Email from [email protected] to [email protected] et al.',
            self.email.email_label(),
            'Display message without subject when no subject is present')

        # has a date
        date_header = cerp.Header()
        date_header.name = 'Date'
        date_header.value = 'Friday 13 200 13:00'
        self.email.cerp.content.headers.append(date_header)
        label = self.email.email_label()
        self.assertEqual(
            'Email from [email protected] to [email protected] et al. on Friday 13 200 13:00',
            label,
            'Date header value should be included in label when present')

        # object label already exists
        self.email.label = "label we want to keep"
        label = self.email.email_label()
        self.assertEqual(self.email.label, label,
                         'label should be preserved when it exists')

    def test_index_data(self):
        # NOTE: logic for creating the label is in the label test

        # test to make sure label exists in index data
        data = self.email.index_data()
        self.assertIn('label', data)
        # mime_data does not exist, so no checksum should be indexed
        self.assertNotIn(
            'content_md5', data,
            'content_md5 should not be set when mime data does not exist')

        # patch mime data to test the exists / checksum path
        with patch.object(self.email, 'mime_data', Mock()) as mock_mime:
            mock_mime.exists = True
            mock_mime.checksum = 'test checksum value'

            data = self.email.index_data()
            self.assertEqual(self.email.mime_data.checksum,
                             data['content_md5'])

    @patch('keep.arrangement.models.solr_interface',
           spec=sunburnt.SolrInterface)
    def test_by_checksum(self, mocksolr):
        # no match
        self.assertRaises(ObjectDoesNotExist, EmailMessage.by_checksum, 42)
        solr = mocksolr.return_value
        solr.query.assert_called_with(
            content_md5=42,
            content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL)
        solr.query.return_value.field_limit.assert_called_with('pid')

        # too many matches
        solr.query.return_value.field_limit.return_value = [{
            'pid': 'pid:1'
        }, {
            'pid': 'pid:2'
        }]
        self.assertRaises(MultipleObjectsReturned, EmailMessage.by_checksum,
                          42)

        # one match
        solr.query.return_value.field_limit.return_value = [{'pid': 'pid:1'}]
        em = EmailMessage.by_checksum(42)
        self.assertIsInstance(em, EmailMessage)

        # custom repo object: lookup should go through the supplied repo
        mockrepo = Mock()
        em = EmailMessage.by_checksum(42, mockrepo)
        mockrepo.get_object.assert_called_with('pid:1', type=EmailMessage)

    @patch('keep.arrangement.models.solr_interface',
           spec=sunburnt.SolrInterface)
    def test_by_message_id(self, mocksolr):
        # no match
        self.assertRaises(ObjectDoesNotExist, EmailMessage.by_message_id,
                          '<*****@*****.**>')
        solr = mocksolr.return_value
        solr.query.assert_called_with(
            arrangement_id='<*****@*****.**>',
            content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL)
        solr.query.return_value.field_limit.assert_called_with('pid')
Beispiel #36
0
def edit(request, pid):
    '''Display and process the metadata edit form for a single
    :class:`~keep.file.models.DiskImage`.

    On GET, renders the edit form populated from the object.  On a
    valid POST, updates and saves the object, then redirects to the
    dashboard unless ``_save_continue`` was posted, in which case the
    form is displayed again.  On an invalid POST, redisplays the form
    with an error message.

    Raises :class:`Http404` when the object lacks the required content
    models, does not exist, or Fedora reports a 404.
    '''
    # FIXME: should be generic file (?) or possibly one of several supported files
    repo = Repository(request=request)
    obj = repo.get_object(pid, type=DiskImage)
    try:
        # object is not a disk image: not available at this url
        if not obj.has_requisite_content_models:
            raise Http404

        if request.method != 'POST':
            # GET - edit form pre-populated with content from the object
            form = DiskImageEditForm(instance=obj)
        else:
            # bind the form to the submitted data and the object
            form = DiskImageEditForm(request.POST, instance=obj)
            if not form.is_valid():  # includes schema validation
                # the failure may not be visible in the first screenful
                # of the form, so flag it explicitly
                messages.error(
                    request,
                    '''Your changes were not saved due to a validation error.
                    Please correct any required or invalid fields indicated below and save again.'''
                )
            else:
                # push form data into the foxml object
                form.update_instance()
                # user-supplied comment if any, otherwise a generic one
                comment = form.cleaned_data.get('comment') or "update metadata"

                obj.save(comment)
                edit_url = reverse('file:edit', args=[pid])
                messages.success(
                    request,
                    'Successfully updated <a href="%s">%s</a>' % (edit_url, pid))
                # save & continue functionality - same as collection edit
                if '_save_continue' not in request.POST:
                    return HttpResponseSeeOtherRedirect(
                        reverse('repo-admin:dashboard'))
                # otherwise the edit form is shown again below

        class AdminOpts(object):
            app_label = 'file'
            model_name = 'application'

        # options for generating admin link to edit/add file application db info
        context = {
            'obj': obj,
            'form': form,
            'admin_fileapp': AdminOpts()
        }
        return TemplateResponse(request, 'file/edit.html', context)

    except PermissionDenied:
        # Fedora may return a PermissionDenied error when accessing a datastream
        # where the datastream does not exist, object does not exist, or user
        # does not have permission to access the datastream

        # a missing object is reported as a 404
        if not obj.exists:
            raise Http404
        # for now, assuming that if object exists and has correct content models,
        # it will have all the datastreams required for this view

        return HttpResponseForbidden('Permission Denied to access %s' % pid,
                                     content_type='text/plain')

    except RequestFailed as rf:
        # a genuine 404 from fedora is propagated as-is
        if rf.code == 404:
            raise Http404

        msg = ('There was an error contacting the digital repository. '
               'This prevented us from accessing audio data. If this '
               'problem persists, please alert the repository '
               'administrator.')
        return HttpResponse(msg, content_type='text/plain', status=500)
Beispiel #37
0
def batch_set_status(pid, status):
    '''Set the Fedora state of every member of a
    :class:`SimpleCollection` and then record the status on the
    collection itself.

    :param pid: pid of the SimpleCollection to process
    :param status: status label; must be ``'Processed'`` or
        ``'Accessioned'``
    :returns: a success message with the number of member items updated
    :raises Exception: if the status is unknown, if any member item
        could not be updated, or if saving the collection fails
    '''
    repo = Repository()
    batch = repo.get_object(pid, type=SimpleCollection)
    # keep track of totals for success and failure
    success = 0
    error = 0

    # translate form status codes to fedora state code
    # TODO: shift this logic to arrangement object for re-use ?
    codes = {'Processed': 'A', 'Accessioned': 'I'}

    # target state for every object in the collection
    if status not in codes:
        err_msg = 'Status %s unknown' % status
        logger.error(err_msg)
        raise Exception(err_msg)
    state = codes[status]

    # find all pids associated with this object
    member_pids = list(
        batch.rels_ext.content.objects(batch.uriref, relsextns.hasMember))

    # use a distinct loop name so the collection pid parameter is not shadowed
    for member_pid in member_pids:
        try:
            # pass in api from batch object to retain user credentials
            obj = ArrangementObject(batch.api, member_pid)
            obj.state = state
            obj.save('Marking as %s via SimpleCollection %s' %
                     (status, batch.pid))
            success += 1
        except Exception as e:
            # best effort: record the failure and keep going with the rest
            logger.error('Failed to update %s : %s' % (member_pid, e))
            error += 1

    info = {
        'success': success,
        'error': error,
        'success_plural': '' if success == 1 else 's',
        'error_plural': '' if error == 1 else 's',
        'status': status
    }

    summary_msg = "Successfully updated %(success)s item%(success_plural)s; error updating %(error)s" % info

    # if not all objects were updated correctly, exit with error
    if error > 0:
        raise Exception(summary_msg)

    # FIXME: this is based on the current form logic, but could leave
    # some member items stranded in a different status than the parent object

    batch.mods.content.create_restrictions_on_access()
    batch.mods.content.restrictions_on_access.text = status  # Change collection status
    try:
        batch.save(
            'Marking as %(status)s; updated %(success)s member item%(success_plural)s'
            % info)

    except Exception as e:
        # report the collection's own pid; the member loop variable may be
        # unset (empty collection) and would name the wrong object anyway
        save_err = "Error updating SimpleCollection %s - %s" % (batch.pid, e)
        logger.error(save_err)
        raise Exception('%s; %s' % (save_err, summary_msg))

    # success
    return 'Successfully updated %(success)s item%(success_plural)s' % info
Beispiel #38
0
class TestMigrateRushdie(TestCase):
    '''Tests for the ``migrate_rushdie`` management command helpers,
    run against fixture objects created in the repository in setUp.'''

    # MARBL-MACTECH datastream fixture content
    MM_FIXTURE = '''<macfs:document xmlns:macfs="info:fedora/emory-control:Rushdie-MacFsData-1.0">
  <macfs:md5>ffcf48e5df673fc7de985e1b859eeeec</macfs:md5>
  <macfs:file>
    <macfs:computer>Performa 5400</macfs:computer>
    <macfs:path>/Hard Disk/MIDNIGHT&apos;S CHILDREN/MISC. MATERIAL/x - the roles</macfs:path>
    <macfs:rawpath>L0hhcmQgRGlzay9NSUROSUdIVCdTIENISUxEUkVOL01JU0MuIE1BVEVSSUFML3ggLSB0aGUgcm9sZXM=</macfs:rawpath>
    <macfs:attributes>avbstclInmedz</macfs:attributes>
    <macfs:created>1997-01-19T19:29:32</macfs:created>
    <macfs:modified>1997-01-19T19:29:32</macfs:modified>
    <macfs:type>TEXT</macfs:type>
    <macfs:creator>ttxt</macfs:creator>
  </macfs:file>
</macfs:document>'''

    # MARBL-ANALYSIS datastream fixture content
    MA_FIXTURE = '''<marbl:analysis xmlns:marbl="info:fedora/emory-control:Rushdie-MarblAnalysis-1.0">
  <marbl:series>Writings by Rushdie</marbl:series>
  <marbl:subseries>Fiction</marbl:subseries>
  <marbl:verdict>As is</marbl:verdict>
</marbl:analysis>'''

    # series / subseries lookup passed to _convert_ds; shared by all tests,
    # so tests must not modify it in place
    SERIES_FIXTURE = {
        'Writings by Rushdie': {
            'series_info': {
                'base_ark': 'http://testpid.library.emory.edu/ark:/25593/80mvk',
                'id': 'rushdie1000_series2',
                'short_id': 'series2',
                'uri': 'https://findingaids.library.emory.edu/documents/rushdie1000/series2',
            },
            'subseries_info': {
                'Fiction': {
                    'base_ark': 'http://testpid.library.emory.edu/ark:/25593/80mvk',
                    'id': 'rushdie1000_subseries2.1',
                    'short_id': 'subseries2.1',
                    'uri': 'https://findingaids.library.emory.edu/documents/rushdie1000/series2/subseries2.1',
                },
            },
        },
    }

    def setUp(self):
        self.repo = Repository()
        # pids of every object saved here; purged in tearDown
        self.pids = []

        # Create a simple Collection
        self.sc = self.repo.get_object(type=SimpleCollection)
        self.sc.label = "SimpleCollection For Test"
        self.sc.save()
        self.pids.append(self.sc.pid)

        # Create a Master Collection
        self.mc = self.repo.get_object(type=CollectionObject)
        self.mc.label = "MasterCollection For Test"
        self.mc.save()
        self.pids.append(self.mc.pid)

        # Create a DigitalObject carrying the two legacy datastreams
        self.digObj = self.repo.get_object(type=RushdieArrangementFile)
        self.digObj.label = "Object For Test"
        self.digObj.save()
        self.pids.append(self.digObj.pid)
        self.digObj.api.addDatastream(
            self.digObj.pid, "MARBL-MACTECH", "MARBL-MACTECH",
            mimeType="application/xml", content=self.MM_FIXTURE)
        self.digObj.api.addDatastream(
            self.digObj.pid, "MARBL-ANALYSIS", "MARBL-ANALYSIS",
            mimeType="application/xml", content=self.MA_FIXTURE)
        # Remove Arrangement model so it can be added later
        relation = (self.digObj.uriref, modelns.hasModel,
                    "info:fedora/emory-control:Arrangement-1.0")
        self.digObj.rels_ext.content.remove(relation)
        self.digObj.save()

        # Setup Command with the attributes the helper methods read
        self.cmd = migrate_rushdie.Command()
        self.cmd.verbosity = 1
        self.cmd.v_normal = 1
        self.cmd.v_none = 0
        self.cmd.simple_collection = self.sc
        self.cmd.stdout = sys.stdout
        self.cmd.CONTENT_MODELS = CONTENT_MODELS
        self.cmd.repo = self.repo

    def tearDown(self):
        for pid in self.pids:
            self.repo.purge_object(pid)

    def test__add_to_simple_collection(self):
        self.cmd._add_to_simple_collection(self.digObj)
        self.assertTrue(
            (self.sc.uriref, relsextns.hasMember, self.digObj.uriref)
            in self.sc.rels_ext.content,
            "%s shold be a member of the Simplecollection" % self.digObj.pid)

    def test__get_unique_objects(self):
        # duplicate pids are processed only once
        objs = self.cmd._get_unique_objects([self.digObj.pid, self.digObj.pid])
        self.assertEqual(len(objs), 1, "No dup pids should be processed")

    def test__convert_ds(self):
        obj = self.cmd._convert_ds(self.digObj, self.mc, self.SERIES_FIXTURE, False)
        # Check all fields are moved over correctly
        series_entry = self.SERIES_FIXTURE["Writings by Rushdie"]
        series_info = series_entry["series_info"]
        subseries_info = series_entry["subseries_info"]["Fiction"]

        # filetech
        filetech = obj.filetech.content.file[0]
        self.assertEqual(filetech.md5, "ffcf48e5df673fc7de985e1b859eeeec")
        self.assertEqual(filetech.computer, "Performa 5400")
        self.assertEqual(filetech.path, "/Hard Disk/MIDNIGHT'S CHILDREN/MISC. MATERIAL/x - the roles")
        self.assertEqual(filetech.rawpath, "L0hhcmQgRGlzay9NSUROSUdIVCdTIENISUxEUkVOL01JU0MuIE1BVEVSSUFML3ggLSB0aGUgcm9sZXM=")
        self.assertEqual(filetech.attributes, "avbstclInmedz")
        self.assertEqual(filetech.created, "1997-01-19T19:29:32")
        self.assertEqual(filetech.modified, "1997-01-19T19:29:32")
        self.assertEqual(filetech.type, "TEXT")
        self.assertEqual(filetech.creator, "ttxt")
        # MODS: subseries is the immediate series, with the series nested in it
        self.assertEqual(obj.mods.content.series.title, "Fiction")
        self.assertEqual(obj.mods.content.series.uri, subseries_info["uri"])
        self.assertEqual(obj.mods.content.series.base_ark, subseries_info["base_ark"])
        self.assertEqual(obj.mods.content.series.full_id, subseries_info["id"])
        self.assertEqual(obj.mods.content.series.short_id, subseries_info["short_id"])
        self.assertEqual(obj.mods.content.series.series.title, "Writings by Rushdie")
        self.assertEqual(obj.mods.content.series.series.uri, series_info["uri"])
        self.assertEqual(obj.mods.content.series.series.base_ark, series_info["base_ark"])
        self.assertEqual(obj.mods.content.series.series.full_id, series_info["id"])
        self.assertEqual(obj.mods.content.series.series.short_id, series_info["short_id"])
        # Rights
        self.assertEqual(obj.rights.content.access_status.code, "2")
        # RELS-EXT
        self.assertTrue((obj.uriref, relsextns.isMemberOf, self.mc.uriref) in obj.rels_ext.content, "Object should have isMember relation to master collection")
        self.assertTrue((obj.uriref, modelns.hasModel, URIRef("info:fedora/emory-control:ArrangementAccessAllowed-1.0")) in obj.rels_ext.content, "Object should have Allowed Content Model")
        # Label and DS
        self.assertEqual(obj.label, "x - the roles", "Label should be set to last part of path")
        self.assertEqual(obj.owner, "thekeep-project", "owner should be set to 'thekeep-project'")
        self.assertEqual(obj.dc.content.title, "x - the roles", "DC title should be set to last part of path")
        # DataStreams
        # have to reload obj from repository to get DS update
        obj = self.repo.get_object(pid=obj.pid, type=ArrangementObject)
        self.assertFalse("MARBL-MACTECH" in obj.ds_list, "MARBL-MACTECH should have been removed")
        self.assertFalse("MARBL-ANALYSIS" in obj.ds_list, "MARBL-ANALYSIS should have been removed")

    def test_missing_series_info(self):
        import copy

        # Remove subseries info from lookup.  Work on a deep copy: a shallow
        # copy shares the nested dicts, so deleting from it would strip
        # subseries_info from the class-level fixture used by other tests.
        series = copy.deepcopy(self.SERIES_FIXTURE)
        del series["Writings by Rushdie"]["subseries_info"]
        obj = self.cmd._convert_ds(self.digObj, self.mc, series, False)

        self.assertEqual(obj.mods.content.series.title, "Fiction")
        self.assertEqual(obj.mods.content.series.series.title, "Writings by Rushdie")
Beispiel #39
0
def upload(request):
    '''Display the file upload form and ingest uploaded file(s) into
    the selected collection.

    Requests are handled according to method and content type:

    * GET: render the empty upload form.
    * POST with ``multipart/form-data``: validate the posted
      :class:`UploadForm`; if invalid, redisplay it with errors,
      otherwise ingest the files it lists and render the upload
      template with per-file results and a fresh form.
    * any other POST: hand the request off to :func:`ajax_upload`.
    '''
    repo = Repository(request=request)

    # list of allowed file types, in a format suited for passing to javascript
    allowed_types_json = json.dumps(allowed_upload_types(request.user))
    ctx_dict = {'js_allowed_types': mark_safe(allowed_types_json)}

    if request.method == 'POST':
        content_type = request.META.get('CONTENT_TYPE', 'application/octet-stream')
        # drop any parameters after ';'; content type is technically
        # case-insensitive, so lower-case before comparing
        media_type = content_type.partition(';')[0].strip().lower()

        # POST but not form data - handle ajax file upload
        if media_type != 'multipart/form-data':
            return ajax_upload(request)

        # form has been posted: validate, then process & ingest files
        form = UploadForm(request.POST, request.FILES)

        # Invalid form (i.e., no collection specified, no or mismatched
        # files uploaded): redisplay it with any error messages.
        if not form.is_valid():
            ctx_dict['form'] = form
            return TemplateResponse(request, 'file/upload.html', ctx_dict)

        cleaned = form.cleaned_data
        collection = repo.get_object(pid=cleaned['collection'],
                                     type=CollectionObject)
        # user comment if any; otherwise a generic ingest comment
        comment = cleaned['comment'] or 'initial repository ingest'
        # dictionary of file path -> filename, based on form data
        files_to_ingest = form.files_to_ingest()

        # process all files submitted for ingest (single or batch mode)
        if files_to_ingest:
            # per-file ingest result status for the template
            ctx_dict['ingest_results'] = ingest_files(
                files_to_ingest, collection, comment, request)
        # after processing, continue on to display the upload template

    # on GET or completed form POST, display a fresh upload form
    ctx_dict['form'] = UploadForm()
    return TemplateResponse(request, 'file/upload.html', ctx_dict)
Beispiel #40
0
def batch_set_status(pid, status):
    '''Set the Fedora state of every member of a
    :class:`SimpleCollection` and then record the status on the
    collection itself.

    :param pid: pid of the SimpleCollection to process
    :param status: status label; must be ``'Processed'`` or
        ``'Accessioned'``
    :returns: a success message with the number of member items updated
    :raises Exception: if the status is unknown, if any member item
        could not be updated, or if saving the collection fails
    '''
    repo = Repository()
    batch = repo.get_object(pid, type=SimpleCollection)
    # keep track of totals for success and failure
    success = 0
    error = 0

    # translate form status codes to fedora state code
    # TODO: shift this logic to arrangement object for re-use ?
    codes = {'Processed': 'A', 'Accessioned': 'I'}

    # target state for every object in the collection
    if status not in codes:
        err_msg = 'Status %s unknown' % status
        logger.error(err_msg)
        raise Exception(err_msg)
    state = codes[status]

    # find all pids associated with this object
    member_pids = list(batch.rels_ext.content.objects(batch.uriref, relsextns.hasMember))

    # use a distinct loop name so the collection pid parameter is not shadowed
    for member_pid in member_pids:
        try:
            # pass in api from batch object to retain user credentials
            obj = ArrangementObject(batch.api, member_pid)
            obj.state = state
            obj.save('Marking as %s via SimpleCollection %s'
                     % (status, batch.pid))
            success += 1
        except Exception as e:
            # best effort: record the failure and keep going with the rest
            logger.error('Failed to update %s : %s' % (member_pid, e))
            error += 1

    info = {
        'success': success,
        'error': error,
        'success_plural': '' if success == 1 else 's',
        'error_plural': '' if error == 1 else 's',
        'status': status
    }

    summary_msg = "Successfully updated %(success)s item%(success_plural)s; error updating %(error)s" % info

    # if not all objects were updated correctly, exit with error
    if error > 0:
        raise Exception(summary_msg)

    # FIXME: this is based on the current form logic, but could leave
    # some member items stranded in a different status than the parent object

    batch.mods.content.create_restrictions_on_access()
    batch.mods.content.restrictions_on_access.text = status  # Change collection status
    try:
        batch.save('Marking as %(status)s; updated %(success)s member item%(success_plural)s'
                   % info)

    except Exception as e:
        # report the collection's own pid; the member loop variable may be
        # unset (empty collection) and would name the wrong object anyway
        save_err = "Error updating SimpleCollection %s - %s" % (batch.pid, e)
        logger.error(save_err)
        raise Exception('%s; %s' % (save_err, summary_msg))

    # success
    return 'Successfully updated %(success)s item%(success_plural)s' % info
Beispiel #41
0
    def index_data(self):
        '''Extend the default
        :meth:`eulfedora.models.DigitalObject.index_data` method to
        include additional fields specific to Keep Arrangement
        objects.  Includes collection information, arrangement id and
        content checksum, access status, and any simple collections
        this object is a member of.

        :returns: the index data dictionary from the parent class with
            the additional fields set where values are available
        '''
        # NOTE: we don't want to rely on other objects being indexed in Solr,
        # so index data should not use Solr to find any related object info

        repo = Repository()  # FIXME: use relation from current object instead

        # FIXME: is it worth splitting out descriptive index data here?
        data = super(ArrangementObject, self).index_data()

        if self.has_model(boda.EmailMessage.EMAIL_MESSAGE_CMODEL) or \
          self.has_model(boda.Mailbox.MAILBOX_CONTENT_MODEL):
            data['object_type'] = 'email'
        # elif self.has_model(boda.RushdieFile.RUSHDIE_FILE_CMODEL):
        # data['object_type'] = 'file'
        else:
            # generic fallback
            data['object_type'] = 'born-digital'

        # Collection Info: the deprecated relation takes precedence when set
        if self._deprecated_collection:
            collection = self._deprecated_collection
        elif self.collection:
            collection = self.collection
        else:
            collection = None

        if collection and collection.exists:

            # collection_source_id
            if collection.mods.content.source_id is not None:  # allowed to be 0
                data['collection_source_id'] = \
                    collection.mods.content.source_id
            data['collection_id'] = collection.pid
            try:
                # pull collection label directly from fedora
                data['collection_label'] = collection.label

                # FIXME: this shouldn't be indexed here; are we actually
                # using it anywhere?
                # if collection.collection:
                #     data['archive_id'] = collection.collection.uri
                #     data['archive_label'] = collection.collection.label

            except RequestFailed as rf:
                logger.error(
                    'Error accessing collection or archive object in Fedora: %s'
                    % rf)

        # Arrangement unique id and content checksum from the first file entry
        try:
            if self.filetech.content.file:
                first_file = self.filetech.content.file[0]
                if first_file.local_id:
                    data["arrangement_id"] = first_file.local_id
                if first_file.md5:
                    data['content_md5'] = first_file.md5
        except Exception as e:
            # both values must be supplied as a tuple; formatting with the
            # pid alone raises TypeError and hides the original error
            logger.error(
                "Error getting arrangement id or content MD5 for %s: %s" %
                (self.pid, e))

        # rights access status code
        if self.rights.content.access_status:
            data['access_code'] = self.rights.content.access_status.code
            # normally this should be picked up via dc:rights, but arrangement
            # objects don't seem to have DC fields populated
            # NOTE: migrated items don't seem to have rights text set
            if self.rights.content.access_status.text:
                data['rights'] = self.rights.content.access_status.text

        # get simple collections that have an association with this object;
        # initialize the result lists before the lookup so they are defined
        # even when the repository request fails
        sc_ids = []
        sc_labels = []
        try:
            simple_collections = repo.risearch.get_subjects(
                relsext.hasMember, self.uriref)
            simple_collections = list(simple_collections)

            for sc in simple_collections:
                obj = repo.get_object(pid=sc, type=repo.infer_object_subtype)
                if isinstance(obj, SimpleCollection):
                    sc_ids.append("info:fedora/%s" % obj.pid)
                    sc_labels.append(obj.label)
        except RequestFailed as rf:
            logger.error('Error accessing simpleCollection in Fedora: %s' % rf)

        if sc_ids:
            data["simpleCollection_id"] = sc_ids
        if sc_labels:
            data["simpleCollection_label"] = sc_labels

        return data
Beispiel #42
0
class TestMigrateRushdie(TestCase):
    '''Tests for helper methods on the ``migrate_rushdie`` management command.

    Each test runs against objects created in the repository by
    :meth:`setUp`; everything created there is purged in :meth:`tearDown`.
    '''

    # Content for the MARBL-MACTECH datastream added to the test object
    MM_FIXTURE = '''<macfs:document xmlns:macfs="info:fedora/emory-control:Rushdie-MacFsData-1.0">
  <macfs:md5>ffcf48e5df673fc7de985e1b859eeeec</macfs:md5>
  <macfs:file>
    <macfs:computer>Performa 5400</macfs:computer>
    <macfs:path>/Hard Disk/MIDNIGHT&apos;S CHILDREN/MISC. MATERIAL/x - the roles</macfs:path>
    <macfs:rawpath>L0hhcmQgRGlzay9NSUROSUdIVCdTIENISUxEUkVOL01JU0MuIE1BVEVSSUFML3ggLSB0aGUgcm9sZXM=</macfs:rawpath>
    <macfs:attributes>avbstclInmedz</macfs:attributes>
    <macfs:created>1997-01-19T19:29:32</macfs:created>
    <macfs:modified>1997-01-19T19:29:32</macfs:modified>
    <macfs:type>TEXT</macfs:type>
    <macfs:creator>ttxt</macfs:creator>
  </macfs:file>
</macfs:document>'''

    # Content for the MARBL-ANALYSIS datastream added to the test object
    MA_FIXTURE = '''<marbl:analysis xmlns:marbl="info:fedora/emory-control:Rushdie-MarblAnalysis-1.0">
  <marbl:series>Writings by Rushdie</marbl:series>
  <marbl:subseries>Fiction</marbl:subseries>
  <marbl:verdict>As is</marbl:verdict>
</marbl:analysis>'''

    # Series / subseries lookup passed to Command._convert_ds.
    # NOTE: this dict is shared by every test in the class; tests must
    # work on a deep copy rather than modifying it in place.
    SERIES_FIXTURE = {
        'Writings by Rushdie': {
            'series_info': {
                'base_ark':
                'http://testpid.library.emory.edu/ark:/25593/80mvk',
                'id':
                'rushdie1000_series2',
                'short_id':
                'series2',
                'uri':
                'https://findingaids.library.emory.edu/documents/rushdie1000/series2'
            },
            'subseries_info': {
                'Fiction': {
                    'base_ark':
                    'http://testpid.library.emory.edu/ark:/25593/80mvk',
                    'id':
                    'rushdie1000_subseries2.1',
                    'short_id':
                    'subseries2.1',
                    'uri':
                    'https://findingaids.library.emory.edu/documents/rushdie1000/series2/subseries2.1'
                }
            }
        }
    }

    def setUp(self):
        '''Create the repository objects and the configured command
        instance used by the tests.'''
        self.repo = Repository()
        # pids of every object created here, so tearDown can purge them
        self.pids = []

        # Create a simple Collection
        self.sc = self.repo.get_object(type=SimpleCollection)
        self.sc.label = "SimpleCollection For Test"
        self.sc.save()
        self.pids.append(self.sc.pid)

        # Create a Master Collection
        self.mc = self.repo.get_object(type=CollectionObject)
        self.mc.label = "MasterCollection For Test"
        self.mc.save()
        self.pids.append(self.mc.pid)

        # Create a DigitalObject
        self.digObj = self.repo.get_object(type=RushdieArrangementFile)
        self.digObj.label = "Object For Test"
        self.digObj.save()
        self.pids.append(self.digObj.pid)
        # attach the two legacy datastreams that _convert_ds reads
        self.digObj.api.addDatastream(self.digObj.pid,
                                      "MARBL-MACTECH",
                                      "MARBL-MACTECH",
                                      mimeType="application/xml",
                                      content=self.MM_FIXTURE)
        self.digObj.api.addDatastream(self.digObj.pid,
                                      "MARBL-ANALYSIS",
                                      "MARBL-ANALYSIS",
                                      mimeType="application/xml",
                                      content=self.MA_FIXTURE)
        # Remove Arrangement model so it can be added later
        relation = (self.digObj.uriref, modelns.hasModel,
                    "info:fedora/emory-control:Arrangement-1.0")
        self.digObj.rels_ext.content.remove(relation)
        self.digObj.save()

        # Setup Command: set the attributes directly instead of running
        # the command's normal option handling
        self.cmd = migrate_rushdie.Command()
        self.cmd.verbosity = 1
        self.cmd.v_normal = 1
        self.cmd.v_none = 0
        self.cmd.simple_collection = self.sc
        self.cmd.stdout = sys.stdout
        self.cmd.CONTENT_MODELS = CONTENT_MODELS
        self.cmd.repo = self.repo

    def tearDown(self):
        '''Purge every object that setUp created in the repository.'''
        for pid in self.pids:
            self.repo.purge_object(pid)

    def test__add_to_simple_collection(self):
        '''The object is recorded as a member of the simple collection.'''
        self.cmd._add_to_simple_collection(self.digObj)
        self.assertTrue(
            (self.sc.uriref, relsextns.hasMember, self.digObj.uriref)
            in self.sc.rels_ext.content,
            "%s shold be a member of the Simplecollection" % self.digObj.pid)

    def test__get_unique_objects(self):
        '''Duplicate pids are processed only once.'''
        objs = self.cmd._get_unique_objects([self.digObj.pid, self.digObj.pid])
        self.assertEqual(len(objs), 1, "No dup pids should be processed")

    def test__convert_ds(self):
        '''All fields from the legacy datastreams are moved over correctly.'''
        obj = self.cmd._convert_ds(self.digObj, self.mc, self.SERIES_FIXTURE,
                                   False)

        # expected series / subseries values from the lookup fixture
        series_info = self.SERIES_FIXTURE["Writings by Rushdie"]["series_info"]
        subseries_info = self.SERIES_FIXTURE["Writings by Rushdie"][
            "subseries_info"]["Fiction"]

        # filetech
        techfile = obj.filetech.content.file[0]
        self.assertEqual(techfile.md5, "ffcf48e5df673fc7de985e1b859eeeec")
        self.assertEqual(techfile.computer, "Performa 5400")
        self.assertEqual(
            techfile.path,
            "/Hard Disk/MIDNIGHT'S CHILDREN/MISC. MATERIAL/x - the roles")
        self.assertEqual(
            techfile.rawpath,
            "L0hhcmQgRGlzay9NSUROSUdIVCdTIENISUxEUkVOL01JU0MuIE1BVEVSSUFML3ggLSB0aGUgcm9sZXM="
        )
        self.assertEqual(techfile.attributes, "avbstclInmedz")
        self.assertEqual(techfile.created, "1997-01-19T19:29:32")
        self.assertEqual(techfile.modified, "1997-01-19T19:29:32")
        self.assertEqual(techfile.type, "TEXT")
        self.assertEqual(techfile.creator, "ttxt")

        # MODS: the innermost series entry is the subseries ...
        subseries = obj.mods.content.series
        self.assertEqual(subseries.title, "Fiction")
        self.assertEqual(subseries.uri, subseries_info["uri"])
        self.assertEqual(subseries.base_ark, subseries_info["base_ark"])
        self.assertEqual(subseries.full_id, subseries_info["id"])
        self.assertEqual(subseries.short_id, subseries_info["short_id"])
        # ... and its nested series is the parent series
        series = obj.mods.content.series.series
        self.assertEqual(series.title, "Writings by Rushdie")
        self.assertEqual(series.uri, series_info["uri"])
        self.assertEqual(series.base_ark, series_info["base_ark"])
        self.assertEqual(series.full_id, series_info["id"])
        self.assertEqual(series.short_id, series_info["short_id"])

        # Rights
        self.assertEqual(obj.rights.content.access_status.code, "2")

        # RELS-EXT
        self.assertTrue(
            (obj.uriref, relsextns.isMemberOf, self.mc.uriref)
            in obj.rels_ext.content,
            "Object should have isMember relation to master collection")
        self.assertTrue(
            (obj.uriref, modelns.hasModel,
             URIRef("info:fedora/emory-control:ArrangementAccessAllowed-1.0"))
            in obj.rels_ext.content,
            "Object should have Allowed Content Model")

        # Label and DS
        self.assertEqual(obj.label, "x - the roles",
                         "Label should be set to last part of path")
        self.assertEqual(obj.owner, "thekeep-project",
                         "owner should be set to 'thekeep-project'")
        self.assertEqual(obj.dc.content.title, "x - the roles",
                         "DC title should be set to last part of path")

        # DataStreams
        # have to reload obj from repository to get DS update
        obj = self.repo.get_object(pid=obj.pid, type=ArrangementObject)
        self.assertFalse("MARBL-MACTECH" in obj.ds_list,
                         "MARBL-MACTECH should have been removed")
        self.assertFalse("MARBL-ANALYSIS" in obj.ds_list,
                         "MARBL-ANALYSIS should have been removed")

    def test_missing_series_info(self):
        '''Series titles are still set when the lookup has no subseries info.'''
        # local import: this test is the only user of the copy module here
        import copy

        # Remove subseries info from the lookup.  Work on a deep copy: a
        # shallow copy shares the nested dicts, so deleting from it would
        # also strip the key from the class-level fixture used by other tests.
        series = copy.deepcopy(self.SERIES_FIXTURE)
        del series["Writings by Rushdie"]["subseries_info"]
        obj = self.cmd._convert_ds(self.digObj, self.mc, series, False)

        self.assertEqual(obj.mods.content.series.title, "Fiction")
        self.assertEqual(obj.mods.content.series.series.title,
                         "Writings by Rushdie")
Beispiel #43
0
def download_video(request, pid, type, extension=None):
    '''Serve out a video datastream for the fedora object specified by pid.
    Can be used to download the original file or the access copy.

    :param request: current HTTP request; the user's permissions and the
        presence of ``HTTP_RANGE`` in ``request.META`` decide which
        permission check is applied
    :param pid: pid of the :class:`~keep.video.models.Video` instance
        from which the video datastream should be returned
    :param type: which video datastream to return - should be one of 'original'
        or 'access'
    :param extension: optional filename extension for access copy to
        distinguish between different types of access copies (accepted
        for the url pattern; not consulted by the logic below)
    :raises Http404: if ``type`` is neither 'original' nor 'access'

    The :class:`django.http.HttpResponse` returned will have a Content-Disposition
    set to prompt the user to download the file with a filename based on the
    object noid and an appropriate file extension for the type of video requested.
    If the permission check fails, the result of ``prompt_login_or_403`` is
    returned instead.
    '''
    repo = Repository(request=request)
    # retrieve the object so we can use it to set the download filename
    obj = repo.get_object(pid, type=Video)

    # user needs either *play* or *download* permissions
    # - could be any video or researcher-accessible only, which additionally
    #   requires checking object is researcher-accessible
    # for now, use presence of 'HTTP_RANGE' in request to differentiate
    # jplayer requests from straight downloads
    # NOTE: this would not be too difficult for a savvy user to circumvent
    # (if they know what we are checking), but is intended mainly to prevent
    # unwanted access by staff and researchers in the reading room

    # if http range is present in request, check for play permissions
    # (also requires that request is for access copy, not original)
    if 'HTTP_RANGE' in request.META:
        # NOTE: the parentheses around the permission alternatives matter;
        # `and` binds tighter than `or`, so without them the access-copy
        # requirement would only apply to the first permission and
        # researchers could stream the original.
        playable = type == 'access' and (
            request.user.has_perm('video.play_video') or
            (request.user.has_perm('video.play_researcher_video') and
             bool(obj.researcher_access)))

        if not playable:
            return prompt_login_or_403(request)

    # otherwise, check for download permissions
    else:
        # user either needs download video permissions OR
        # researcher download permission on a researcher-accessible object
        downloadable = request.user.has_perm('video.download_video') or \
            (request.user.has_perm('video.download_researcher_video') and
             bool(obj.researcher_access))

        if not downloadable:
            return prompt_login_or_403(request)

    # determine which datastream is requested & set datastream id & file extension
    if type == 'original':
        dsid = Video.content.id
        # set file extension based on the datastream content type,
        # with a fallback for generic binary (should not happen in production)
        file_ext = Video.allowed_master_mimetypes.get(obj.content.mimetype, 'bin')
    elif type == 'access':
        dsid = Video.access_copy.id
        # set file extension based on the datastream content
        file_ext = Video.allowed_access_mimetypes[obj.access_copy.mimetype]
    else:
        # any other type is not supported
        raise Http404
    extra_headers = {
        'Content-Disposition': 'attachment; filename="%s.%s"' % (obj.noid, file_ext)
    }

    # use generic raw datastream view from eulfedora
    return raw_datastream(request, pid, dsid, repo=repo,
        headers=extra_headers)
    # errors accessing Fedora will fall through to default 500 error handling
    # errors accessing Fedora will fall through to default 500 error handling
Beispiel #44
0
def edit(request, pid):
    '''Display and process the metadata edit form for a single
    :class:`~keep.file.models.DiskImage`.

    On GET the form is shown pre-populated from the object.  On a valid
    POST the object is updated and saved, then the user is redirected to
    the dashboard unless ``_save_continue`` was posted, in which case the
    form is displayed again.  On an invalid POST the form is redisplayed
    with an error message.

    :param request: current HTTP request
    :param pid: pid of the object to edit
    :raises Http404: if the object lacks the required content models, does
        not exist, or Fedora reports a 404
    '''
    # FIXME: should be generic file (?) or possibly one of several supported files
    repo = Repository(request=request)
    obj = repo.get_object(pid, type=DiskImage)
    try:
        # not actually a disk image: object is not available at this url
        if not obj.has_requisite_content_models:
            raise Http404

        if request.method != 'POST':
            # GET - form for editing, pre-populated with content from the object
            form = DiskImageEditForm(instance=obj)
        else:
            # data has been submitted: bind the form to request data and object
            form = DiskImageEditForm(request.POST, instance=obj)
            if not form.is_valid():     # includes schema validation
                # the error may not be obvious or visible in the first
                # screenful of the form, so flag the failed save explicitly
                messages.error(request,
                    '''Your changes were not saved due to a validation error.
                    Please correct any required or invalid fields indicated below and save again.''')
            else:
                # push the form data into the foxml object
                form.update_instance()
                # use the submitted comment when non-empty, else a default
                comment = form.cleaned_data.get('comment') or "update metadata"

                obj.save(comment)
                edit_url = reverse('file:edit', args=[pid])
                messages.success(
                    request,
                    'Successfully updated <a href="%s">%s</a>' % (edit_url, pid))
                # save & continue functionality - same as collection edit;
                # when requested, fall through and display the form again
                if '_save_continue' not in request.POST:
                    return HttpResponseSeeOtherRedirect(reverse('repo-admin:dashboard'))

        class AdminOpts(object):
            app_label = 'file'
            model_name = 'application'

        # options for generating admin link to edit/add file application db info
        template_context = {
            'obj': obj,
            'form': form,
            'admin_fileapp': AdminOpts(),
        }
        return TemplateResponse(request, 'file/edit.html', template_context)

    except PermissionDenied:
        # Fedora may report PermissionDenied when the datastream is missing,
        # the object is missing, or the user really lacks permission;
        # a missing object becomes a 404
        if not obj.exists:
            raise Http404
        # for now, assuming that if object exists and has correct content models,
        # it will have all the datastreams required for this view

        return HttpResponseForbidden('Permission Denied to access %s' % pid,
                                     content_type='text/plain')

    except RequestFailed as rf:
        # propagate a genuine 404 from fedora
        if rf.code == 404:
            raise Http404

        msg = ('There was an error contacting the digital repository. '
               'This prevented us from accessing audio data. If this '
               'problem persists, please alert the repository '
               'administrator.')
        return HttpResponse(msg, content_type='text/plain', status=500)
Beispiel #45
0
def largefile_ingest(request):
    '''Large-file ingest.  On GET, displays a form allowing user to
    select a BagIt that has been uploaded to the configured large-file
    ingest staging area for ingest and association with a collection.

    On POST with a valid form, the selected bag is ingested as a
    :class:`DiskImage` or :class:`Video` (chosen by the name of the bag's
    parent directory), the bag is removed from the staging area, and the
    outcome is reported in ``context['ingest_results']``.  Ingest failures
    are reported in that result rather than raised.

    :param request: current HTTP request
    :returns: :class:`TemplateResponse` for ``file/largefile_ingest.html``
    '''
    # ingest content from upload staging area

    context = {}
    template_name = 'file/largefile_ingest.html'
    form = None

    # on POST, process the form and ingest if valid
    if request.method == 'POST':
        form = LargeFileIngestForm(request.POST)

        # if form is not valid, add to context for redisplay with errors
        if not form.is_valid():
            context['form'] = form

        # otherwise, process the form
        else:
            repo = Repository(request=request)

            # Get collection & check for optional comment
            collection = repo.get_object(pid=form.cleaned_data['collection'],
                                         type=CollectionObject)
            # get user comment if any; default to a generic ingest comment
            comment = form.cleaned_data[
                'comment'] or 'initial repository ingest'
            bag = form.cleaned_data['bag']

            # create dict with file info to add success/failure info
            file_info = {'label': os.path.basename(bag)}

            # assuming type of ingest from subdirectory
            ingest_type = bag.split('/')[-2]
            try:

                if ingest_type == 'diskimage':
                    obj = DiskImage.init_from_bagit(bag, request)
                elif ingest_type == 'video':
                    obj = Video.init_from_bagit(bag, request)
                else:
                    # fail with a meaningful message instead of tripping
                    # over an unbound `obj` below; reported to the user
                    # by the generic exception handler
                    raise ValueError('Unsupported ingest type "%s"' %
                                     ingest_type)

                # set collection on ingest
                obj.collection = collection

                ## NOTE: Due to a bug in Fedora 3.4 with checksums and
                ## and file uri ingest, the content datastream checksum
                ## must be cleared before ingest; manually check it
                ## after ingest to confirm Fedora calculated what we expect.
                ## This work-around can be removed once we upgrade to Fedora 3.6

                # store datastream checksum that would be sent to fedora
                checksum = obj.content.checksum
                obj._content_checksum = checksum
                # clear it out so Fedora can ingest without erroring
                obj.content.checksum = None

                # file URIs also used for supplemental files; needs
                # to be handled the same way as content datastream
                # - look for any supplementN datastreams, store checksum, and remove
                supplemental_checksums = {}
                for i in range(20):
                    try:
                        dsid = 'supplement%d' % i
                        dsobj = getattr(obj, dsid)
                        supplemental_checksums[dsid] = dsobj.checksum
                        dsobj.checksum = None
                    except AttributeError:
                        # stop iterating - we have found last supplemental file
                        break

                # same for access copy checksum on Video files
                if ingest_type == 'video':
                    access_checksum = obj.access_copy.checksum
                    obj.access_copy.checksum = None

                obj.save(comment)

                # remove the ingested bag from large-file staging area
                shutil.rmtree(bag)

                # re-init to allow checking fedora-calculated checksums on
                # supplemental datastreams
                if ingest_type == 'diskimage':
                    obj = repo.get_object(obj.pid, type=DiskImage)
                elif ingest_type == 'video':
                    obj = repo.get_object(obj.pid, type=Video)

                # if save succeeded (no exceptions), set summary info for display
                file_info.update({
                    'type': ingest_type,
                    'success': True,
                    'pid': obj.pid,
                    'url': obj.get_absolute_url(),
                    'checksum': obj.content.checksum
                })
                if ingest_type == 'video':
                    file_info['access_checksum'] = obj.access_copy.checksum

                # compare checksum generated by Fedora
                # (required because of file uri bug in fedora 3.4;
                #  this can be removed once we upgrade to fedora 3.6+)
                checksum_errors = []

                if obj.content.checksum != checksum:
                    checksum_errors.append('content')

                # distinct name so the content checksum stored above
                # is not overwritten by the loop variable
                for dsid, expected_checksum in supplemental_checksums.items():
                    dsobj = obj.getDatastreamObject(dsid)
                    if dsobj.checksum != expected_checksum:
                        checksum_errors.append(dsid)

                if ingest_type == 'video' and \
                        obj.access_copy.checksum != access_checksum:
                    checksum_errors.append('access_copy')

                if checksum_errors:
                    message = 'Checksum mismatch%s detected on ' + \
                       '%s datastream%s; please contact a repository administrator.'
                    file_info['message'] = message % (
                        'es' if len(checksum_errors) > 1 else '',
                        ', '.join(checksum_errors),
                        's' if len(checksum_errors) > 1 else '')

            except bagit.BagValidationError as err:
                logger.error(err)
                file_info.update({
                    'success': False,
                    'message': 'BagIt error: %s' % err
                })

            # special case: detected as duplicate content
            except DuplicateContent as e:
                # mark as failed and generate message with links to records
                # NOTE: pid url is duplicated logic from web upload view...
                links = []
                for pid in e.pids:
                    # use fedora type-inferring logic with list of content models
                    # pulled from solr results
                    obj = repo.get_object(pid,
                                          type=repo.best_subtype_for_object(
                                              pid, e.pid_cmodels[pid]))
                    # use appropriate object class to get the object url
                    links.append('<a href="%s">%s</a>' %
                                 (obj.get_absolute_url(), pid))
                msg = mark_safe('%s: %s' % (unicode(e), '; '.join(links)))
                file_info.update({'success': False, 'message': msg})

            except Exception as err:
                logger.error('Error: %s' % err)
                file_info.update({'success': False, 'message': '%s' % err})

            # report success/failure in the same format as web-upload ingest
            context['ingest_results'] = [file_info]

    # on GET display form to select item(s) for ingest
    # OR on completed valid form post
    files = large_file_uploads()
    if request.method == 'GET' or \
      form is not None and form.is_valid():
        if len(files):
            context['form'] = LargeFileIngestForm()
        else:
            # indicator that no files are available for ingest
            context['no_files'] = True

    return TemplateResponse(request, template_name, context)
Beispiel #46
0
def manage_supplements(request, pid):
    '''Manage supplemental file datastreams associated with a
    :class:`~keep.file.models.DiskImage`.

    On POST with a valid formset, labels and uploaded content are applied
    to the matching ``supplementN`` datastreams, the object is saved, and
    the user is redirected to the edit view.  For any other request (or
    an invalid / failed POST) the formset is displayed.

    :param request: current HTTP request
    :param pid: pid of the disk image object
    :raises Http404: if the object does not exist or does not have the
        required content models
    '''
    repo = Repository(request=request)
    obj = repo.get_object(pid, type=DiskImage)
    if not obj.exists or not obj.has_requisite_content_models:
        raise Http404

    # generate initial data from any existing supplemental datastreams
    initial_data = [
        {'dsid': s.id, 'label': s.label,
         'file': DatastreamFile(obj.pid, s.id, s.label)}
        for s in obj.supplemental_content
    ]

    if request.method == 'POST':
        # on post, bind the formset to the submitted data and files
        formset = SupplementalFileFormSet(request.POST, request.FILES,
            initial=initial_data)
    else:
        # any other method (GET, HEAD, ...) just displays the form; this
        # guarantees `formset` is always bound when the template is rendered
        formset = SupplementalFileFormSet(initial=initial_data)

    # process the form and any updates/additions
    if request.method == 'POST' and formset.is_valid():
        m = magic.Magic(mime=True)

        # NOTE: because we currently don't support re-ordering
        # or deletion, simply counting to keep track of datastream ids
        s_id = 0
        modified = 0
        added = 0
        for file_info in formset.cleaned_data:
            # skip empty formset
            if not file_info:
                continue

            if file_info.get('dsid', None):
                ds = obj.getDatastreamObject(file_info['dsid'],
                    dsobj_type=FileDatastreamObject)
            else:
                added += 1
                ds = obj.getDatastreamObject('supplement%d' % s_id,
                    dsobj_type=FileDatastreamObject)

            # only set if changed so datastream isModified is accurate
            if file_info['label'] != ds.label:
                ds.label = file_info['label']

            # if this is an uploaded file, replace content and calculate mimetype, checksum
            if isinstance(file_info['file'], UploadedFile):
                filename = file_info['file'].temporary_file_path()
                # keep only the mimetype itself, dropping anything after ';'
                ds.mimetype = m.from_file(filename).partition(';')[0]
                ds.checksum = md5sum(filename)
                ds.content = file_info['file']

            if ds.exists and ds.isModified():
                modified += 1

            s_id += 1

        try:
            obj.save('updating supplemental files')

            # summarize number of changes, if any
            if added or modified:
                msg_add = 'added %d' % added if added else ''
                msg_update = 'updated %d' % modified if modified else ''
                msg = 'Successfully %s%s%s supplemental file%s' %  \
                    (msg_add, ' and ' if added and modified else '', msg_update,
                    's' if (added + modified) != 1 else '')
                messages.success(request, msg)
            else:
                # possible for the form to be valid but not make any changes
                messages.info(request, 'No changes made to supplemental content')

            return HttpResponseSeeOtherRedirect(reverse('file:edit', args=[pid]))

        except Exception as e:
            logger.error('Error on supplemental file update: %s' % e)
            logger.debug("Error details:\n" + traceback.format_exc())

            messages.error(request, unicode(e))
            # for now, just redisplay the form with error message

    return TemplateResponse(request, 'file/supplemental_content.html',
        {'obj': obj, 'formset': formset})
Beispiel #47
0
class ArrangementObjectTest(KeepTestCase):
    def setUp(self):
        '''Save a parent collection in the repository and build an
        arrangement object (not saved here) that belongs to it.'''
        self.repo = Repository()
        # pids of saved objects, purged again in tearDown
        self.pids = []

        # parent collection for the arrangement object
        parent = self.repo.get_object(type=CollectionObject)
        parent.pid = '%s:parent-1' % settings.FEDORA_PIDSPACE
        parent.mods.content.source_id = '12345'
        parent.save()
        self.pids.append(parent.pid)

        # arrangement object under test, linked to the parent collection
        self.arr = self.repo.get_object(type=ArrangementObject)
        self.arr.pid = 'foo:1'
        self.arr.collection = parent

    def tearDown(self):
        '''Purge every object recorded in ``self.pids`` from the repository.'''
        for saved_pid in self.pids:
            self.repo.purge_object(saved_pid)

    @patch('keep.arrangement.models.solr_interface',
           spec=sunburnt.SolrInterface)
    def test_by_arrangement_id(self, mocksolr):
        '''``ArrangementObject.by_arrangement_id`` queries Solr by
        arrangement id and content model, and raises or returns an object
        depending on how many results come back.

        :param mocksolr: mock injected by the ``patch`` decorator in place
            of ``keep.arrangement.models.solr_interface``
        '''
        # no match
        self.assertRaises(ObjectDoesNotExist,
                          ArrangementObject.by_arrangement_id, 42)
        solr = mocksolr.return_value
        solr.query.assert_called_with(
            arrangement_id=42,
            content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL)
        solr.query.return_value.field_limit.assert_called_with('pid')

        # too many matches
        solr.query.return_value.field_limit.return_value = [{
            'pid': 'pid:1'
        }, {
            'pid': 'pid:2'
        }]
        self.assertRaises(MultipleObjectsReturned,
                          ArrangementObject.by_arrangement_id, 42)

        # one match
        solr.query.return_value.field_limit.return_value = [{'pid': 'pid:1'}]
        ao = ArrangementObject.by_arrangement_id(42)
        # assertIsInstance replaces the deprecated assert_ alias and
        # reports the actual type on failure
        self.assertIsInstance(ao, ArrangementObject)

        # custom repo object: the lookup should go through the repo passed in
        mockrepo = Mock()
        ao = ArrangementObject.by_arrangement_id(42, mockrepo)
        mockrepo.get_object.assert_called_with('pid:1', type=ArrangementObject)

    def test_arrangement_status(self):
        '''Setting ``arrangement_status`` updates the object state, and an
        unknown status value raises a ValueError.'''
        arrangement = ArrangementObject(Mock())

        # each known status and the object state it is expected to produce
        expected_states = (('processed', 'A'), ('accessioned', 'I'))
        for status, state in expected_states:
            arrangement.arrangement_status = status
            self.assertEqual(state, arrangement.state)
            self.assertEqual(status, arrangement.arrangement_status)

        # flag stays None unless the assignment raises ValueError
        got_value_error = None
        try:
            arrangement.arrangement_status = 'bogus'
        except ValueError:
            got_value_error = True

        self.assertTrue(
            got_value_error,
            'attempting to assign an unknown status should raise a ValueError')

    def test_update_access_cmodel(self):
        '''``_update_access_cmodel`` sets the restricted content model when
        no access status is present, and swaps to the allowed content
        model once the access status code is '2'.'''
        obj = ArrangementObject(Mock())
        restricted = (obj.uriref, modelns.hasModel,
                      URIRef(ACCESS_RESTRICTED_CMODEL))
        allowed = (obj.uriref, modelns.hasModel,
                   URIRef(ACCESS_ALLOWED_CMODEL))

        # no status set - should be set to restricted
        obj._update_access_cmodel()

        # assertIn / assertNotIn replace the deprecated assert_ alias
        self.assertIn(restricted, obj.rels_ext.content)
        self.assertNotIn(allowed, obj.rels_ext.content)

        # set to status code 2 = access allowed
        obj.rights.content.create_access_status()
        obj.rights.content.access_status.code = '2'

        obj._update_access_cmodel()

        self.assertNotIn(restricted, obj.rels_ext.content)
        self.assertIn(allowed, obj.rels_ext.content)

    def test_index_data(self):
        '''``index_data`` includes the object type, pid, owner, and the
        id and source id of the parent collection.'''
        idx_data = self.arr.index_data()
        self.assertEqual('born-digital', idx_data['object_type'])
        self.assertEqual(self.arr.pid, idx_data['pid'])
        self.assertIn(self.arr.owner, idx_data['owner'])
        # assertEqual replaces the deprecated assertEquals alias
        self.assertEqual(self.arr.collection.pid, idx_data['collection_id'])
        self.assertEqual(self.arr.collection.mods.content.source_id,
                         idx_data['collection_source_id'])

    # Test the update_ark_label method in keep.common.fedora.
    # Note that this test is a simplified version of keep.common.fedora:ArkPidDigitalObject.test_update_ark_label.
    # The update_ark_label here is an overridden method that is more specific, and is used on Arrangement objects.
    @patch('keep.arrangement.models.pidman'
           )  # mock the pidman client (the API service)
    def test_update_ark_label(self, mockpidman):
        '''``update_ark_label`` only contacts pidman when appropriate, and
        only updates the ARK name when it differs from the DC title.

        :param mockpidman: mock injected by the ``patch`` decorator in
            place of ``keep.arrangement.models.pidman``
        '''

        def object_exists():
            # Fresh patcher that swaps the class-level ``exists`` attribute
            # for a (truthy) Mock; patched on the class rather than the
            # instance because ``exists`` is a property.
            return patch.object(ArrangementObject,
                                'exists',
                                new=Mock(return_value=True))

        arr_obj = ArrangementObject(Mock())
        # a pid is needed so the object can derive a noid
        arr_obj.pid = "test:1234"

        # Without patching ``exists`` (presumably reads as not existing):
        # no error, and pidman must not be queried
        arr_obj.update_ark_label()
        self.assertFalse(mockpidman.get_ark.called)

        # ``exists`` patched, DC title not yet set: still no pidman lookup
        with object_exists():
            arr_obj.update_ark_label()
            self.assertFalse(mockpidman.get_ark.called)

        # Assign the title outside the patch so setting it does not
        # trigger API calls
        arr_obj.dc.content.title = "testpid"
        with object_exists():
            # pidman already has the same name: looked up, not updated
            mockpidman.get_ark.return_value = {
                "name": arr_obj.dc.content.title
            }
            arr_obj.update_ark_label()
            mockpidman.get_ark.assert_called_with(arr_obj.noid)
            self.assertFalse(mockpidman.update_ark.called)

            # pidman has a different name: looked up and updated to the title
            mockpidman.get_ark.return_value = {"name": "another pid"}
            arr_obj.update_ark_label()
            mockpidman.get_ark.assert_called_with(arr_obj.noid)
            mockpidman.update_ark.assert_called_with(
                noid=arr_obj.noid,
                name=arr_obj.dc.content.title)

    def test_set_premis_object(self):
        # set_premis_object should populate the PREMIS object section of the
        # provenance datastream (identifier, type, checksums, format) and the
        # result should be valid PREMIS.
        mockapi = Mock()
        arrangement_object = ArrangementObject(mockapi)
        arrangement_object.pid = "test:1234"
        arrangement_object.mods.content.ark = 'ark:/1234/987'

        # return empty iterator for original data to checksum
        mockapi.getDatastreamDissemination.return_value = []
        with patch.object(arrangement_object,
                          'getDatastreamObject') as mockgetds:
            mockgetds.return_value.checksum = '123456789'
            mockgetds.return_value.mimetype = 'text/plain'
            arrangement_object.set_premis_object()

        # assertTrue instead of the deprecated assert_ alias
        self.assertTrue(arrangement_object.provenance.content.object)
        premis = arrangement_object.provenance.content
        # FIXME: placeholder tests for placeholder functionality,
        # should be updated to use ARK uri once that is implemented
        self.assertEqual('ark', premis.object.id_type)
        self.assertEqual(arrangement_object.mods.content.ark, premis.object.id)
        self.assertEqual('p:file', premis.object.type)
        self.assertEqual(0, premis.object.composition_level)
        # first checksum: the MD5 reported by the (mocked) datastream object
        self.assertEqual('MD5', premis.object.checksums[0].algorithm)
        self.assertEqual('123456789', premis.object.checksums[0].digest)
        # second checksum: sha1 for an empty file, since the mocked
        # dissemination returns no content
        empty_sha1 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709'
        self.assertEqual('SHA-1', premis.object.checksums[1].algorithm)
        self.assertEqual(empty_sha1, premis.object.checksums[1].digest)
        # object format should be original mimetype
        self.assertEqual('text/plain', premis.object.format.name)

        # generated premis should be valid
        self.assertTrue(premis.is_valid())

    def test_identifier_change_event(self):
        # identifier_change_event should add a single PREMIS "identifier
        # assignment" event recording the old and new pids, and the resulting
        # provenance datastream should be valid PREMIS.
        mockapi = Mock()
        # NOTE(review): the username must agree with the agent id asserted
        # below; this assumes the event takes its agent id from the API
        # username -- confirm against identifier_change_event.
        mockapi.username = 'fedoraAdmin'
        arrangement_object = ArrangementObject(mockapi)
        arrangement_object.pid = 'test:1234'
        arrangement_object.mods.content.ark = 'ark:/1234/987'

        # set object premis so we can validate
        mockapi.getDatastreamDissemination.return_value = []
        with patch.object(arrangement_object,
                          'getDatastreamObject') as mockgetds:
            mockgetds.return_value.checksum = '123456789'
            mockgetds.return_value.mimetype = 'text/plain'
            arrangement_object.set_premis_object()

        arrangement_object.identifier_change_event('old-pid:1')
        premis = arrangement_object.provenance.content
        self.assertEqual(1, len(premis.events))
        event = premis.events[0]
        self.assertEqual('UUID', event.id_type)
        # id should be set, we don't care what it is exactly
        # (assertTrue instead of the deprecated assert_ alias)
        self.assertTrue(event.id)
        self.assertEqual('identifier assignment', event.type)
        self.assertEqual('program="keep"; version="%s"' % __version__,
                         event.detail)
        self.assertEqual('Pass', event.outcome)
        msg = 'Persistent identifier reassigned from %s to %s' % \
            ('old-pid:1', arrangement_object.pid)
        self.assertEqual(msg, event.outcome_detail)
        self.assertEqual('fedora user', event.agent_type)
        self.assertEqual('fedoraAdmin', event.agent_id)

        # generated premis should be valid
        self.assertTrue(premis.is_valid())
Beispiel #48
0
def download_audio(request, pid, type, extension=None):
    '''Return an audio datastream of a fedora object as a file download.
    Serves either the original (WAV) audio or the compressed access copy.

    :param pid: pid of the :class:`~keep.audio.models.AudioObject` whose
        audio datastream should be served
    :param type: which audio datastream to serve; must be 'original'
        or 'access' (anything else results in a 404)
    :param extension: optional filename extension for the access copy, to
        distinguish between kinds of access copies (currently MP3 or M4A);
        a 404 is raised when it does not match the datastream mimetype

    The response carries a Content-Disposition header prompting the user to
    save the file under a name built from the object noid and the file
    extension for the requested kind of audio.
    '''
    repo = Repository(request=request)
    # the object itself is needed to build the download filename
    obj = repo.get_object(pid, type=AudioObject)
    user = request.user

    # The user needs either *play* or *download* permission. Each comes in
    # two flavors: any audio, or researcher audio only, where the latter
    # additionally requires the object to be researcher-accessible.
    # For now, presence of 'HTTP_RANGE' in the request is used to tell
    # jplayer requests apart from straight downloads.
    # NOTE: a savvy user who knows what is being checked could circumvent
    # this; it is mainly intended to prevent unwanted access by staff and
    # researchers in the reading room.
    if 'HTTP_RANGE' in request.META:
        # play permissions only ever apply to the access copy
        is_access_copy = (type == 'access')
        may_play = \
            (user.has_perm('audio.play_audio') and is_access_copy) or \
            (user.has_perm('audio.play_researcher_audio') and
             bool(obj.researcher_access) and is_access_copy)
        if not may_play:
            return prompt_login_or_403(request)
    else:
        # general download permission, OR researcher download permission
        # on a researcher-accessible object
        may_download = \
            user.has_perm('audio.download_audio') or \
            (user.has_perm('audio.download_researcher_audio') and
             bool(obj.researcher_access))
        if not may_download:
            return prompt_login_or_403(request)

    # work out which datastream is requested, and the file extension to use
    if type == 'original':
        dsid = AudioObject.audio.id
        file_ext = 'wav'
    elif type == 'access':
        dsid = AudioObject.compressed_audio.id
        # for the known access-copy mimetypes, the requested extension
        # has to be the one that goes with the datastream
        required_extension = {'audio/mp4': 'm4a', 'audio/mpeg': 'mp3'}
        mimetype = obj.compressed_audio.mimetype
        if mimetype in required_extension and \
           extension != required_extension[mimetype]:
            raise Http404
        file_ext = extension
    else:
        # no other datastream types are supported
        raise Http404

    disposition = 'attachment; filename="%s.%s"' % (obj.noid, file_ext)
    extra_headers = {'Content-Disposition': disposition}
    # hand off to the generic raw datastream view from eulfedora
    return raw_datastream(request, pid, dsid, repo=repo, headers=extra_headers)
Beispiel #49
0
class Command(BaseCommand):
    '''Read CSV file and creates (or adds to) a Simple Collection and associated ArrangementObjects
    with the SimpleCollection and the Master collection'''

    # NOTE: the class docstring above is reused verbatim as the command help
    # text (see ``help = __doc__`` below), so rewording it changes the
    # user-visible help output.

    def get_password_option(option, opt, value, parser):
        # optparse callback for --password. It is referenced as a plain
        # function while the class body is being executed, hence no ``self``.
        # Prompts via getpass and stores the result on the parsed values.
        setattr(parser.values, option.dest, getpass())

    # Set up additional options
    option_list = BaseCommand.option_list + (
        make_option(
            '--noact',
            '-n',
            action='store_true',
            dest='no-act',
            default=False,
            help=
            'Does not create PIDs or ingest anything into Fedora. Only parses file and outputs results'
        ),
        make_option(
            '--add',
            '-a',
            action='store',
            dest='add',
            help=
            'adds to the SimpleCollection specified by pid, does not create a new SimpleCollection'
        ),
        make_option('--username',
                    '-u',
                    dest='username',
                    action='store',
                    help='''Username to connect to fedora'''),
        make_option(
            '--password',
            dest='password',
            action='callback',
            callback=get_password_option,
            help='''Prompt for password required when username used'''),
    )

    args = '<CSV file> <master collection pid> <new simple collection name>'
    help = __doc__

    def _create_series_lookup(self):
        '''Build ``self.series``, a dictionary keyed on series title.

        Each value holds a ``series_info`` dictionary (id, short_id, base_ark,
        uri) and, when the series has subseries, a ``subseries_info``
        dictionary keyed on subseries title with the same four fields.
        '''
        # series / subseries info
        self.series = {}

        # exist query params
        return_fields = ['eadid']
        search_fields = {'eadid': 'rushdie1000'}

        queryset = Series.objects.also(*return_fields).filter(**search_fields)
        for s in queryset:
            entry = {
                'series_info': {
                    'id': s.id,
                    'short_id': s.short_id,
                    'base_ark': s.eadid.url,
                    'uri': "https://findingaids.library.emory.edu/documents/%s/%s" %
                           (s.eadid.value, s.short_id),
                }
            }
            # subseries info, only present when there are subseries
            if s.subseries:
                subseries_info = {}
                for sub in s.subseries:
                    subseries_info[sub.title] = {
                        'id': sub.id,
                        'short_id': sub.short_id,
                        # base ark comes from the parent series' eadid
                        'base_ark': s.eadid.url,
                        'uri': "https://findingaids.library.emory.edu/documents/%s/%s/%s" %
                               (s.eadid.value, s.short_id, sub.short_id),
                    }
                entry['subseries_info'] = subseries_info
            self.series[s.title] = entry

    def _create_arrangement(self, row):
        '''Build (but do not save) an ArrangementObject from one CSV row.

        :param row: dictionary for a single CSV record, keyed on the field
            names passed to the CSV reader in :meth:`handle`
        :returns: the new, unsaved ArrangementObject, in inactive state and
            related to the master collection via isMemberOf
        '''
        # Account for unicode characters:
        # preserve the raw bytes of the path (base64 encoded) as rawpath,
        # and decode to unicode for the other mappings
        rawpath = base64.encodestring(row["filename"])
        path = unicode(row["filename"], 'utf8')
        creator = unicode(row["creator"], 'utf8')
        # last path segment is used for both the label and the DC title
        filename = path.rpartition('/')[2]

        # set values in filetech DS
        obj = self.repo.get_object(type=ArrangementObject)
        obj.label = filename
        obj.filetech.content.file.append(FileMasterTech_Base())
        file_tech = obj.filetech.content.file[0]
        file_tech.local_id = row['id']
        file_tech.md5 = row['checksum']
        file_tech.computer = row['computer']
        file_tech.path = path
        file_tech.rawpath = rawpath
        file_tech.attributes = row['attrib']
        file_tech.created = row['created']
        file_tech.modified = row['modified']
        file_tech.creator = creator

        # map DC title
        obj.dc.content.title = filename

        # map default verdict of 10 "Undetermined" in rights DS
        obj.rights.content.create_access_status()
        obj.rights.content.access_status.code = "10"

        # map series in MODS; the (stripped) record type is used to look up
        # series info gathered by _create_series_lookup
        rec_type = row["rec_type"].strip()
        if rec_type in self.series:
            series_info = self.series[rec_type]["series_info"]
            obj.mods.content.create_series()
            mods_series = obj.mods.content.series
            mods_series.title = rec_type
            mods_series.uri = series_info["uri"]
            mods_series.base_ark = series_info["base_ark"]
            mods_series.full_id = series_info["id"]
            mods_series.short_id = series_info["short_id"]
        elif self.verbosity > self.v_none:
            self.stdout.write("Series %s not found\n" % row["rec_type"])

        # set association to master collection
        relation = (obj.uriref, relsextns.isMemberOf, self.master_obj.uriref)
        obj.rels_ext.content.add(relation)
        if self.verbosity > self.v_normal:
            self.stdout.write(
                "Adding %s isMemberOf %s relation on ArrangementObject\n" %
                (obj.label, self.master_obj.pid))

        # set state to inactive by default
        obj.state = "I"
        return obj

    def handle(self, *args, **options):
        '''Run the command: read the CSV file, create an ArrangementObject
        per record, and relate them all to a new or existing SimpleCollection.

        Positional arguments are the CSV file, the master collection pid, and
        (unless ``--add`` is given) the name for the new SimpleCollection.

        :raises CommandError: when a required argument is missing, the master
            collection does not exist, the SimpleCollection named by
            ``--add`` cannot be loaded, or the CSV file cannot be opened
        '''
        # collect arrangement pids here to delete later if SimpleCollection fails to save
        self.arrangement_pids = []
        self._create_series_lookup()

        # 0 = none, 1 = normal, 2 = all
        self.v_none = 0
        self.v_normal = 1

        if 'verbosity' in options:
            self.verbosity = int(options['verbosity'])
        else:
            self.verbosity = self.v_normal

        # Create the repo, with explicit credentials only when supplied
        repo_args = {}
        if options.get('username') is not None:
            repo_args['username'] = options.get('username')
        if options.get('password') is not None:
            repo_args['password'] = options.get('password')
        self.repo = Repository(**repo_args)

        # Check to make sure all args and options are present
        try:
            csv_path = args[0]
        except IndexError:
            raise CommandError("No CSV file specified")

        try:
            self.master_pid = args[1]
        except IndexError:
            raise CommandError("No master collection pid specified")

        # if -a or --add is used the new SimpleCollection name is ignored
        add_pid = options.get("add")
        no_act = options.get('no-act')
        try:
            if not add_pid:
                self.simple_collection_name = args[2]
            else:
                self.simple_collection_pid = add_pid
        except IndexError:
            raise CommandError(
                "An existing SimpleCollection pid must be specified with the "
                "-a option or a new SimpleCollection name must be specified "
                "as an argument")

        # If Master collection does not exist then raise an exception
        self.master_obj = self.repo.get_object(type=CollectionObject,
                                               pid=self.master_pid)

        if not self.master_obj.exists:
            raise CommandError("Master Collection %s does not exist" %
                               (self.master_pid))
        elif self.verbosity > self.v_none:
            self.stdout.write("Using Master Collection: %s(%s)\n" %
                              (self.master_obj.label, self.master_obj.pid))

        # Get or create SimpleCollection object
        if add_pid:
            # Only the lookup of an existing pid is guarded: the error
            # message refers to simple_collection_pid, which is set only
            # when --add is used.
            try:
                simple_collection = self.repo.get_object(
                    type=SimpleCollection, pid=self.simple_collection_pid)
            except Exception:
                raise CommandError("Pid %s does not exist" %
                                   self.simple_collection_pid)
        else:
            simple_collection = self.repo.get_object(type=SimpleCollection)
            simple_collection.label = self.simple_collection_name
            simple_collection.dc.content.title = self.simple_collection_name
            simple_collection.mods.content.create_restrictions_on_access()
            simple_collection.mods.content.restrictions_on_access.text = "Accessioned"

        # try to open the file; it is read into dicts with assigned field names
        try:
            csv_file = open(csv_path, 'rb')
        except IOError:
            raise CommandError("Could not read file %s" % csv_path)

        csv_read = 0
        arrangement_saved = 0
        errors = 0
        # the with block guarantees the CSV file handle is closed
        with csv_file:
            reader = csv.DictReader(csv_file,
                                    fieldnames=[
                                        "id", "checksum", "filename",
                                        "rec_type", "file_type", "creator",
                                        "attrib", "created", "modified",
                                        "computer", "size"
                                    ])
            if self.verbosity > self.v_none:
                self.stdout.write("Reading CSV: %s\n" % (csv_path))

            # skip the header row in CSV file (tolerating an empty file)
            next(reader, None)

            # read each record
            for row in reader:
                try:
                    csv_read += 1
                    arrangement_object = self._create_arrangement(row)

                    if not no_act:
                        try:
                            arrangement_object.save()
                            arrangement_saved += 1
                            self.arrangement_pids.append(
                                arrangement_object.pid)
                            if self.verbosity > self.v_none:
                                self.stdout.write(
                                    "Saved ArrangementObject %s(%s)\n" %
                                    (arrangement_object.label,
                                     arrangement_object.pid))
                        except Exception as e:
                            if self.verbosity > self.v_none:
                                self.stdout.write(
                                    "Error saving ArrangementObject %s: %s\n" %
                                    (arrangement_object.label, e))
                            errors += 1
                    elif self.verbosity > self.v_none:
                        self.stdout.write("TEST ArrangementObject %s\n" %
                                          (arrangement_object.label))

                    if self.verbosity > self.v_normal:
                        self.stdout.write("===RELS-EXT===\n")
                        for entry in arrangement_object.rels_ext.content:
                            self.stdout.write("%s\n" % list(entry))
                        self.stdout.write("===MODS===\n")
                        self.stdout.write(
                            "%s\n" %
                            arrangement_object.mods.content.serialize())

                    # Add each ArrangementObject to the SimpleCollection
                    # NOTE(review): the relation is also added when the save
                    # above failed; confirm whether that is intended.
                    relation = (simple_collection.uriref,
                                relsextns.hasMember,
                                arrangement_object.uriref)
                    simple_collection.rels_ext.content.add(relation)
                    if self.verbosity > self.v_normal:
                        self.stdout.write(
                            "Adding hasMember %s relation on SimpleCollection\n" %
                            (arrangement_object.pid))
                except Exception as e:
                    self.stdout.write("Error in record id %s: %s\n" %
                                      (row["id"], e))
                    errors += 1

        if not no_act:
            try:
                simple_collection.save()
                self.stdout.write(
                    "Saved SimpleCollection %s(%s)\n" %
                    (simple_collection.label, simple_collection.pid))
            except Exception as e:
                if self.verbosity > self.v_none:
                    self.stdout.write(
                        "Error saving SimpleCollection %s: %s\n" %
                        (simple_collection.label, e))
                    self.stdout.write(
                        "Deleting Arrangement pids so they will not be Orphans\n"
                    )
                errors += 1
                # purge everything saved so far and adjust the saved count
                for pid in self.arrangement_pids:
                    self.repo.purge_object(pid)
                    if self.verbosity > self.v_none:
                        self.stdout.write("Deleting: %s\n" % (pid))
                    arrangement_saved -= 1
        elif self.verbosity > self.v_none:
            self.stdout.write("TEST SimpleCollection %s\n" %
                              (simple_collection.label))

        if self.verbosity > self.v_normal:
            self.stdout.write("===RELS-EXT===\n")
            for entry in simple_collection.rels_ext.content:
                self.stdout.write("%s\n" % list(entry))
            self.stdout.write("===DC===\n")
            self.stdout.write("%s\n" %
                              simple_collection.dc.content.serialize())
            self.stdout.write("===MODS===\n")
            self.stdout.write("%s\n" %
                              simple_collection.mods.content.serialize())

        # print Summary
        self.stdout.write("\n\nSUMMARY\n=======\n")
        self.stdout.write("SimpleCollection: %s(%s)\n" %
                          (simple_collection.label, simple_collection.pid))
        self.stdout.write("Master Collection Object: %s(%s)\n" %
                          (self.master_obj.label, self.master_obj.pid))
        self.stdout.write("%s Records read from CSV file\n" % (csv_read))
        self.stdout.write("%s Records created\n" % (arrangement_saved))
        self.stdout.write("%s Errors\n" % (errors))
Beispiel #50
0
def migrate_aff_diskimage(self, pid):
    '''Migrate an AFF disk image object to a new E01 disk image object.

    Downloads the AFF content to a temporary directory, converts it to E01
    with the command-line ``ftkimager`` program, checks that ftkimager's
    verify checksums for the two files match, ingests the E01 as a new
    DiskImage carrying over metadata from the original, and finally updates
    the original with a reference to the migrated object and a PREMIS
    migration event.

    :param pid: pid of the AFF DiskImage object to be migrated
    :returns: a short message naming the original and migrated pids
    :raises Exception: if the object is not found, is not an AFF DiskImage,
        has already been migrated, or if download, conversion, verification,
        or the post-ingest checksum check fails
    '''
    creating_application = 'AccessData FTK Imager'
    application_version = 'v3.1.1 CLI'
    migration_event_detail = 'program="%s"; version="%s"' % \
        (creating_application, application_version)
    migration_event_outcome = 'AFF reformatted as E01 using command line ' + \
        'FTK program with settings: --e01 --compress 0 --frag 100T --quiet'

    # use the configured large file staging area (if any) as the base
    # directory, and create a tempdir inside it for all temporary files
    staging_dir = getattr(settings, 'LARGE_FILE_STAGING_DIR', None)
    tmpdir = tempfile.mkdtemp(suffix='-aff-migration', dir=staging_dir)
    logger.debug('Using tmpdir %s', tmpdir)

    # Retrieve the object to be migrated
    repo = Repository()
    original = repo.get_object(pid, type=DiskImage)

    # check object before migrating
    # - exists in fedora
    if not original.exists:
        raise Exception('%s not found in Fedora' % original.pid)
    # - is a disk image
    if not original.has_requisite_content_models:
        raise Exception('%s is not a DiskImage object' % original.pid)
    # - is an AFF disk image
    if original.provenance.content.object.format.name != 'AFF':
        raise Exception('%s DiskImage format is not AFF' % original.pid)
    # - has not already been migrated
    if original.migrated is not None:
        raise Exception('%s has already been migrated' % original.pid)

    # download the aff disk image to a tempfile
    aff_file = tempfile.NamedTemporaryFile(suffix='.aff',
                                           prefix='keep-%s_' % original.noid,
                                           dir=tmpdir,
                                           delete=False)
    logger.debug('Saving AFF as %s for conversion (datastream size: %s)' \
        % (aff_file.name, filesizeformat(original.content.size)))
    try:
        for chunk in original.content.get_chunked_content():
            aff_file.write(chunk)
    except Exception:
        raise Exception('Error downloading %s AFF for conversion' %
                        original.pid)

    # close the file handle in case of weird interactions with ftkimager
    aff_file.close()
    aff_size = os.path.getsize(aff_file.name)
    logger.debug('Downloaded %s' % filesizeformat(aff_size))

    # run ftkimager to generate the E01 version
    logger.debug('Running ftkimager to generate E01')
    e01_file = tempfile.NamedTemporaryFile(suffix='.E01',
                                           prefix='keep-%s_' % original.noid,
                                           dir=tmpdir,
                                           delete=False)
    # close the file handle in case of weird interactions with ftkimager
    e01_file.close()
    # file handle to capture console output from ftkimager
    ftk_output = tempfile.NamedTemporaryFile(suffix='.txt',
                                             prefix='keep-%s-ftkimager_' %
                                             original.noid,
                                             dir=tmpdir)
    logger.debug('E01 temp file is %s' % e01_file.name)
    logger.debug('ftkimager output temp file is %s' % ftk_output.name)
    # ftkimager adds .E01 to the specified filename, so pass in filename without
    e01_file_basename, ext = os.path.splitext(e01_file.name)

    convert_command = [
        'ftkimager', aff_file.name, e01_file_basename, '--e01', '--compress',
        '0', '--frag', '100T', '--quiet'
    ]
    # quiet simply suppresses progress output, which is not meaningful
    # in a captured text file
    logger.debug('conversion command is %s' % ' '.join(convert_command))
    return_val = subprocess.call(convert_command,
                                 stdout=ftk_output,
                                 stderr=subprocess.STDOUT)
    logger.debug('ftkimager return value is %s' % return_val)
    # summary text file expected alongside the E01 (used as a supplement below)
    ftk_detail_output = '%s.txt' % e01_file.name

    # the return value is only logged; an empty E01 is what signals failure
    e01_size = os.path.getsize(e01_file.name)
    if e01_size == 0:
        raise Exception('Generated E01 file is 0 size')

    logger.info('Generated E01 (%s) from %s AFF (%s)' % \
        (filesizeformat(e01_size), original.pid, filesizeformat(aff_size)))

    # use ftkimager to verify aff and e01 and compare checksums
    aff_checksums = ftkimager_verify(aff_file.name)
    if not aff_checksums:
        raise Exception('Error running ftkimager verify on AFF for %s' %
                        original.pid)
    e01_checksums = ftkimager_verify(e01_file.name)
    if not e01_checksums:
        raise Exception('Error running ftkimager verify on E01 for %s' %
                        original.pid)

    logger.debug('AFF verify checksums: %s' % \
        ', '.join('%s: %s' % (k, v) for k, v in aff_checksums.items()))
    logger.debug('E01 verify checksums: %s' % \
        ', '.join('%s: %s' % (k, v) for k, v in e01_checksums.items()))
    if aff_checksums != e01_checksums:
        raise Exception('AFF and E01 ftkimager verify checksums do not match')

    # create a new diskimage object from the file
    # - calculate file uri for content location
    e01_file_uri = fedora_file_uri(e01_file.name)
    logger.debug('E01 fedora file URI is %s', e01_file_uri)

    # change permissions on tmpdir + files to ensure fedora can access them
    # (0o... octal literals are valid on Python 2.6+ as well as Python 3)
    os.chmod(tmpdir, 0o775)
    os.chmod(e01_file.name, 0o666)
    os.chmod(ftk_output.name, 0o666)
    os.chmod(ftk_detail_output, 0o666)

    migrated = DiskImage.init_from_file(e01_file.name,
                                        initial_label=original.label,
                                        content_location=e01_file_uri)

    # add ftkimager text output & details as supplemental files
    # - console output captured from subprocess call
    dsobj = migrated.getDatastreamObject('supplement0',
                                         dsobj_type=FileDatastreamObject)
    dsobj.label = 'ftkimager_output.txt'
    dsobj.mimetype = 'text/plain'
    dsobj.checksum = md5sum(ftk_output.name)
    logger.debug('Adding ftkimager console output as supplemental dastream %s label=%s mimetype=%s checksum=%s' % \
                (dsobj.id, dsobj.label, dsobj.mimetype, dsobj.checksum))
    with open(ftk_output.name) as ftk_console:
        dsobj.content = ftk_console.read()
    # - text file generated by ftkimager alongside the E01
    dsobj2 = migrated.getDatastreamObject('supplement1',
                                          dsobj_type=FileDatastreamObject)
    dsobj2.label = 'ftkimager_summary.txt'
    dsobj2.mimetype = 'text/plain'
    dsobj2.checksum = md5sum(ftk_detail_output)
    logger.debug('Adding ftkimager summary as supplemental dastream %s label=%s mimetype=%s checksum=%s' % \
                (dsobj2.id, dsobj2.label, dsobj2.mimetype, dsobj2.checksum))
    with open(ftk_detail_output) as ftk_summary:
        dsobj2.content = ftk_summary.read()

    # set metadata based on original disk image
    # - associate with original
    migrated.original = original
    # copy over descriptive & rights metadata
    # - collection membership
    migrated.collection = original.collection
    # - mods title, covering dates, abstract
    migrated.mods.content.title = original.mods.content.title
    migrated.mods.content.abstract = original.mods.content.abstract
    migrated.mods.content.coveringdate_start = original.mods.content.coveringdate_start
    migrated.mods.content.coveringdate_end = original.mods.content.coveringdate_end
    # - entire rights datastream
    migrated.rights.content = original.rights.content

    ### Update generated premis to describe migration.
    premis_ds = migrated.provenance.content
    premis_ds.object.composition_level = 0
    # these values are the same for all migrated AFFs
    premis_ds.object.create_creating_application()
    premis_ds.object.creating_application.name = creating_application
    premis_ds.object.creating_application.version = application_version
    premis_ds.object.creating_application.date = date.today()

    # add relationship to the original object
    rel = PremisRelationship(type='derivation')
    rel.subtype = 'has source'
    rel.related_object_type = 'ark'
    rel.related_object_id = original.mods.content.ark
    # relationship must also reference the migration event on the
    # original, which doesn't exist yet.  Generate a migration event
    # id now to use for both
    migration_event_id = uuid.uuid1()
    rel.related_event_type = 'UUID'
    rel.related_event_id = migration_event_id
    premis_ds.object.relationships.append(rel)

    ## NOTE: Due to a Fedora bug with checksums and file uri ingest,
    ## content datastream checksum must be cleared out before ingest
    ## and manually checked after.

    # store datastream checksum that would be sent to fedora
    e01_checksum = migrated.content.checksum
    # clear it out so Fedora can ingest without erroring
    migrated.content.checksum = None

    # ingest
    try:
        migrated.save('Ingest migrated version of %s' % original.pid)
        logger.debug('Migrated object ingested as %s' % migrated.pid)
    except DuplicateContent as err:
        # interpolate the details into the message rather than passing
        # them as extra exception arguments
        raise Exception('Duplicate content detected for %s: %s %s' %
                        (original.pid, err, ', '.join(err.pids)))
    # would probably be good to catch other fedora errors

    # remove temporary files
    for tmpfilename in [
            aff_file.name, e01_file.name, ftk_output.name, ftk_detail_output
    ]:
        os.remove(tmpfilename)

    # reinitialize migrated object, just to avoid any issues
    # with accessing ark uri for use in original object premis
    migrated = repo.get_object(migrated.pid, type=DiskImage)
    # verify checksum
    if migrated.content.checksum != e01_checksum:
        raise Exception('Checksum mismatch detected on E01 for %s' %
                        migrated.pid)

    # once migrated object has been ingested,
    # update original object with migration information
    # - add rels-ext reference to migrated object
    original.migrated = migrated
    # - update premis with migration event and relationship
    migration_event = PremisEvent()
    migration_event.id_type = 'UUID'
    migration_event.id = migration_event_id
    migration_event.type = 'migration'
    migration_event.date = datetime.now().isoformat()
    migration_event.detail = migration_event_detail
    migration_event.outcome = 'Pass'
    migration_event.outcome_detail = migration_event_outcome
    migration_event.agent_type = 'fedora user'
    migration_event.agent_id = repo.username
    # premis wants both source and outcome objects linked in the event
    link_source = PremisLinkingObject(id_type='ark')
    link_source.id = original.mods.content.ark
    link_source.role = 'source'
    link_outcome = PremisLinkingObject(id_type='ark')
    link_outcome.id = migrated.mods.content.ark
    link_outcome.role = 'outcome'
    migration_event.linked_objects.extend([link_source, link_outcome])
    original.provenance.content.events.append(migration_event)
    # add relation to migrated object in to premis object
    rel = PremisRelationship(type='derivation')
    rel.subtype = 'is source of'
    rel.related_object_type = 'ark'
    rel.related_object_id = migrated.mods.content.ark
    rel.related_event_type = 'UUID'
    rel.related_event_id = migration_event.id
    original.provenance.content.object.relationships.append(rel)
    original.save()
    logger.debug('Original disk image updated with migration data')

    # remove aff migration temp dir and any remaining contents
    try:
        shutil.rmtree(tmpdir)
    except OSError:
        # tempdir removal could fail due to nfs files
        # wait a few seconds and try again
        time.sleep(3)
        try:
            shutil.rmtree(tmpdir)
        except OSError as os_err:
            logger.warning('Failed to remove tmpdir %s : %s', tmpdir, os_err)

    logger.info('Migrated %s AFF to %s E01' % (original.pid, migrated.pid))
    return 'Migrated %s to %s' % (original.pid, migrated.pid)
Beispiel #51
0
def ingest_files(files, collection, comment, request):
    '''Ingest a dictionary of files as returned by
    :meth:`keep.files.forms.UploadForm.files_to_ingest`.

    Each file is checked against the upload types allowed for the
    requesting user, matched to the first class in ``uploadable_objects``
    that accepts its mimetype, initialized from the file and saved
    to Fedora with ``comment`` as the log message.

    :param files: dictionary of files to be ingested; keys are file paths
        on disk and values are the labels to use for the new objects
    :param collection: :class:`~keep.collection.models.CollectionObject` that
        newly ingested objects should be associated with; when ``None``,
        every file of an allowed type is reported as failed
    :param comment: save message for fedora ingest
    :param request: :class:`~django.http.HttpRequest`, to access Fedora and
        ingest new objects as the logged-in user.
    :returns: list of dictionaries, one per file, each with ``label`` and
        ``success`` keys; successful entries also have ``pid``, ``url``
        and ``checksum``, failed entries have a ``message``
    :raises Exception: if an uploaded file can no longer be read from disk
    '''

    # NOTE: using a list of dicts for ease of display in django templates
    # (e.g., regroup)
    results = []

    m = magic.Magic(mime=True)
    for filename, label in files.iteritems():

        file_info = {'label': label}

        # check if file is an allowed type

        # NOTE: for single-file upload, browser-set type is
        # available as UploadedFile.content_type - but since
        # browser mimetypes are unreliable, calculate anyway
        try:
            mimetype = m.from_file(filename)
        except IOError:
            raise Exception(
                'Uploaded file is no longer available for ingest; please try again.'
            )

        # keep only the portion of the detected type before the first ';'
        mimetype = mimetype.partition(';')[0]
        if mimetype not in allowed_upload_types(request.user):
            # store error for display on detailed result page
            file_info.update({
                'success': False,
                'message': '''File type '%s' is not allowed''' % mimetype
            })
            # if not an allowed type, no further processing
            results.append(file_info)
            continue

        if collection is None:
            file_info.update({
                'success': False,
                'message': '''Collection not selected'''
            })
            results.append(file_info)
            continue

        # if there is an MD5 file (i.e., file was uploaded via ajax),
        # use the contents of that file as checksum
        if os.path.exists(filename + '.md5'):
            with open(filename + '.md5') as md5file:
                md5 = md5file.read()
        # otherwise, calculate the MD5 (single-file upload)
        else:
            md5 = md5sum(filename)

        # determine what type of object to initialize based on mimetype:
        # the first uploadable class that lists this mimetype wins
        objtype = next((t for t in uploadable_objects
                        if mimetype in t.allowed_mimetypes), None)
        if objtype is None:
            # an allowed type that no uploadable object claims would
            # otherwise fail with an AttributeError and abort the whole
            # batch; report it as a failure for this file only
            file_info.update({
                'success': False,
                'message': "No uploadable object type supports file type '%s'"
                           % mimetype
            })
            results.append(file_info)
            continue

        # initialize a new object from the file
        obj = objtype.init_from_file(filename,
                                     initial_label=label,
                                     request=request,
                                     checksum=md5,
                                     mimetype=mimetype)

        # set collection on ingest
        obj.collection = collection

        try:
            # NOTE: by sending a log message, we force Fedora to store an
            # audit trail entry for object creation, which doesn't happen otherwise
            obj.save(comment)
            file_info.update({
                'success': True,
                'pid': obj.pid,
                'url': obj.get_absolute_url(),
                'checksum': md5
            })

            # if audio, needs an additional step:
            if objtype == AudioObject:
                # Start asynchronous task to convert audio for access
                # NOTE: not passing in user-upload file so that
                # celery can more easily be run on a separate server
                queue_access_copy(obj)
                # remove the file now that we have successfully ingested
                # NOTE(review): as written, the uploaded file is only
                # removed here for audio objects - confirm that other
                # object types are cleaned up elsewhere
                os.remove(filename)

            # NOTE: could remove MD5 file (if any) here, but MD5 files
            # should be small and will get cleaned up by the cron script

        # special case: detected as duplicate content
        except DuplicateContent as e:
            # mark as failed and generate message with links to records
            links = []
            repo = Repository(request=request)
            for pid in e.pids:
                # use fedora type-inferring logic with list of content models
                # pulled from solr results
                duplicate = repo.get_object(
                    pid,
                    type=repo.best_subtype_for_object(pid, e.pid_cmodels[pid]))
                # use appropriate object class to get the object url
                links.append('<a href="%s">%s</a>' %
                             (duplicate.get_absolute_url(), pid))

            msg = mark_safe('%s: %s' % (unicode(e), '; '.join(links)))
            file_info.update({'success': False, 'message': msg})

        except Exception as e:
            logger.error('Error ingesting %s: %s', filename, e)
            logger.debug("Error details:\n" + traceback.format_exc())
            file_info['success'] = False

            # check for Fedora-specific errors
            if isinstance(e, RequestFailed):
                if 'Checksum Mismatch' in e.detail:
                    file_info['message'] = 'Ingest failed due to a checksum mismatch - ' + \
                        'file may have been corrupted or incompletely uploaded to Fedora'
                else:
                    file_info['message'] = 'Fedora error: ' + unicode(e)

            # non-fedora error
            else:
                file_info['message'] = 'Ingest failed: ' + unicode(e)

        finally:
            # no matter what happened, store results for reporting to user
            results.append(file_info)

    return results
Beispiel #52
0
class ArrangementObjectTest(KeepTestCase):
    '''Tests for :class:`ArrangementObject`: lookup by arrangement id,
    arrangement status, access content models, index data, ARK label
    updates and PREMIS object/event generation.'''

    def setUp(self):
        self.repo = Repository()
        # pids of objects saved to the repository, purged in tearDown
        self.pids = []

        # create test collection
        coll = self.repo.get_object(type=CollectionObject)
        coll.pid = '%s:parent-1' % settings.FEDORA_PIDSPACE
        coll.mods.content.source_id = '12345'
        coll.save()
        self.pids.append(coll.pid)

        # create test arrangement object (not saved to the repository)
        self.arr = self.repo.get_object(type=ArrangementObject)
        self.arr.pid = 'foo:1'
        self.arr.collection = coll

    def tearDown(self):
        for pid in self.pids:
            self.repo.purge_object(pid)

    @patch('keep.arrangement.models.solr_interface', spec=sunburnt.SolrInterface)
    def test_by_arrangement_id(self, mocksolr):
        # no match
        self.assertRaises(ObjectDoesNotExist, ArrangementObject.by_arrangement_id,
                          42)
        solr = mocksolr.return_value
        solr.query.assert_called_with(arrangement_id=42,
                                      content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL)
        solr.query.return_value.field_limit.assert_called_with('pid')

        # too many matches
        solr.query.return_value.field_limit.return_value = [{'pid': 'pid:1'},
                                                            {'pid': 'pid:2'}]
        self.assertRaises(MultipleObjectsReturned, ArrangementObject.by_arrangement_id,
                          42)

        # one match
        solr.query.return_value.field_limit.return_value = [{'pid': 'pid:1'}]
        ao = ArrangementObject.by_arrangement_id(42)
        self.assertIsInstance(ao, ArrangementObject)

        # custom repo object
        mockrepo = Mock()
        ao = ArrangementObject.by_arrangement_id(42, mockrepo)
        mockrepo.get_object.assert_called_with('pid:1', type=ArrangementObject)

    def test_arrangement_status(self):
        obj = ArrangementObject(Mock())
        obj.arrangement_status = 'processed'
        self.assertEqual('A', obj.state)
        self.assertEqual('processed', obj.arrangement_status)

        obj.arrangement_status = 'accessioned'
        self.assertEqual('I', obj.state)
        self.assertEqual('accessioned', obj.arrangement_status)

        # attempting to assign an unknown status should raise a ValueError
        with self.assertRaises(ValueError):
            obj.arrangement_status = 'bogus'

    def test_update_access_cmodel(self):
        obj = ArrangementObject(Mock())
        restricted = (obj.uriref, modelns.hasModel,
                      URIRef(ACCESS_RESTRICTED_CMODEL))
        allowed = (obj.uriref, modelns.hasModel,
                   URIRef(ACCESS_ALLOWED_CMODEL))

        # no status set - should be set to restricted
        obj._update_access_cmodel()

        self.assertIn(restricted, obj.rels_ext.content)
        self.assertNotIn(allowed, obj.rels_ext.content)

        # set to status code 2 = access allowed
        obj.rights.content.create_access_status()
        obj.rights.content.access_status.code = '2'

        obj._update_access_cmodel()

        self.assertNotIn(restricted, obj.rels_ext.content)
        self.assertIn(allowed, obj.rels_ext.content)

    def test_index_data(self):
        idx_data = self.arr.index_data()
        self.assertEqual('born-digital', idx_data['object_type'])
        self.assertEqual(self.arr.pid, idx_data['pid'])
        self.assertIn(self.arr.owner, idx_data['owner'])
        self.assertEqual(self.arr.collection.pid, idx_data['collection_id'])
        self.assertEqual(self.arr.collection.mods.content.source_id,
                         idx_data['collection_source_id'])

    # Test the ArrangementObject override of update_ark_label.
    # Note that this is a simplified version of the corresponding
    # keep.common.fedora ArkPidDigitalObject test; the override here
    # is more specific and is used on Arrangement objects.
    @patch('keep.arrangement.models.pidman')  # mock the pidman client (the API service)
    def test_update_ark_label(self, mockpidman):

        # Create an ArrangementObject
        arrangement_object = ArrangementObject(Mock())

        # Set a pid on the object so that it could internally generate a noid etc.
        arrangement_object.pid = "test:1234"

        # Simulate when the object doesn't exist (or hasn't been saved)
        # By default it appears as if it doesn't exist
        arrangement_object.update_ark_label()

        # What we should expect is that pidman is not queried at all,
        # and that no errors are raised
        self.assertFalse(mockpidman.get_ark.called)

        # Mock when the object exists (returns True)
        # Note: Need to set the Mock on the class and not the object because
        # this (exists) is a property method
        with patch.object(ArrangementObject, 'exists', new=Mock(return_value=True)):
            arrangement_object.update_ark_label()
            self.assertFalse(mockpidman.get_ark.called)

        # Set the label before the object exists so we don't trigger API calls
        arrangement_object.dc.content.title = "testpid"
        with patch.object(ArrangementObject, 'exists', new=Mock(return_value=True)):
            # When the label matches the name in pidman: lookup only, no update
            mockpidman.get_ark.return_value = {"name": arrangement_object.dc.content.title}
            arrangement_object.update_ark_label()
            # assert that it is called with a noid too
            mockpidman.get_ark.assert_called_with(arrangement_object.noid)
            self.assertFalse(mockpidman.update_ark.called)

            # When the label is different from that in Pidman
            mockpidman.get_ark.return_value = {"name": "another pid"}
            arrangement_object.update_ark_label()
            # assert that it is called with a noid too
            mockpidman.get_ark.assert_called_with(arrangement_object.noid)
            mockpidman.update_ark.assert_called_with(noid=arrangement_object.noid,
                                                     name=arrangement_object.dc.content.title)

    def test_set_premis_object(self):
        mockapi = Mock()
        arrangement_object = ArrangementObject(mockapi)
        arrangement_object.pid = "test:1234"
        arrangement_object.mods.content.ark = 'ark:/1234/987'

        # return empty iterator for original data to checksum
        mockapi.getDatastreamDissemination.return_value = []
        with patch.object(arrangement_object, 'getDatastreamObject') as mockgetds:
            mockgetds.return_value.checksum = '123456789'
            mockgetds.return_value.mimetype = 'text/plain'
            arrangement_object.set_premis_object()

        self.assertTrue(arrangement_object.provenance.content.object)
        premis = arrangement_object.provenance.content
        # FIXME: placeholder tests for placeholder functionality,
        # should be updated to use ARK uri once that is implemented
        self.assertEqual('ark', premis.object.id_type)
        self.assertEqual(arrangement_object.mods.content.ark, premis.object.id)
        self.assertEqual('p:file', premis.object.type)
        self.assertEqual(0, premis.object.composition_level)
        self.assertEqual('MD5', premis.object.checksums[0].algorithm)
        self.assertEqual('123456789',
                         premis.object.checksums[0].digest)
        # sha1 for an empty file
        empty_sha1 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709'
        self.assertEqual('SHA-1', premis.object.checksums[1].algorithm)
        self.assertEqual(empty_sha1,
                         premis.object.checksums[1].digest)
        # object format should be original mimetype
        self.assertEqual('text/plain', premis.object.format.name)

        # generated premis should be valid
        self.assertTrue(premis.is_valid())

    def test_identifier_change_event(self):
        mockapi = Mock()
        # must match the agent id asserted at the end of this test
        mockapi.username = 'fedoraAdmin'
        arrangement_object = ArrangementObject(mockapi)
        arrangement_object.pid = 'test:1234'
        arrangement_object.mods.content.ark = 'ark:/1234/987'

        # set object premis so we can validate
        mockapi.getDatastreamDissemination.return_value = []
        with patch.object(arrangement_object, 'getDatastreamObject') as mockgetds:
            mockgetds.return_value.checksum = '123456789'
            mockgetds.return_value.mimetype = 'text/plain'
            arrangement_object.set_premis_object()

        arrangement_object.identifier_change_event('old-pid:1')
        premis = arrangement_object.provenance.content
        self.assertEqual(1, len(premis.events))
        event = premis.events[0]
        self.assertEqual('UUID', event.id_type)
        # id should be set, we don't care what it is exactly
        self.assertTrue(event.id)
        self.assertEqual('identifier assignment', event.type)
        self.assertEqual('program="keep"; version="%s"' % __version__,
                         event.detail)
        self.assertEqual('Pass', event.outcome)
        msg = 'Persistent identifier reassigned from %s to %s' % \
            ('old-pid:1', arrangement_object.pid)
        self.assertEqual(msg, event.outcome_detail)
        self.assertEqual('fedora user', event.agent_type)
        self.assertEqual('fedoraAdmin', event.agent_id)

        # generated premis should be valid
        self.assertTrue(premis.is_valid())
Beispiel #53
0
 def get_numbering(self, pid):
     '''Return the repository object for ``pid``, loaded as a
     :class:`CollectionObject`.

     If ``pid`` has an entry in ``settings.PID_ALIASES``, the aliased
     pid is used for the lookup instead.
     '''
     # fall back to the pid itself when no alias is configured for it
     resolved_pid = settings.PID_ALIASES.get(pid, pid)
     return Repository().get_object(resolved_pid, type=CollectionObject)
Beispiel #54
0
class EmailMessageTest(KeepTestCase):
    '''Tests for :class:`EmailMessage`: header access, generated labels,
    index data and solr lookups by checksum and message id.'''

    # placeholder addresses used for both the fixture and the expected
    # labels, so the two always stay in agreement
    sender = 'sender@example.com'
    recipient = 'recipient@example.com'
    other_recipient = 'other@example.com'

    def setUp(self):
        self.repo = Repository()
        # pids of objects saved to the repository, purged in tearDown
        self.pids = []

        # test EmailMessage (not saved to the repository)
        self.email = self.repo.get_object(type=EmailMessage)
        self.email.cerp.content.from_list = [self.sender]
        self.email.cerp.content.to_list = [self.recipient]
        self.email.cerp.content.subject_list = ['Interesting Subject']

    def tearDown(self):
        for pid in self.pids:
            self.repo.purge_object(pid)

    def test_headers(self):
        h1 = cerp.Header()
        h1.name = "HEADER 1"
        h1.value = "value for header 1"
        h2 = cerp.Header()
        h2.name = "HEADER 2"
        h2.value = "value for header 2"
        self.email.cerp.content.headers.append(h1)
        self.email.cerp.content.headers.append(h2)
        self.assertEqual(self.email.headers['HEADER 1'], 'value for header 1')
        self.assertEqual(self.email.headers['HEADER 2'], 'value for header 2')

    def test_email_label(self):
        # no object label and one person in to field
        label = self.email.email_label()
        self.assertEqual('Email from %s to %s Interesting Subject'
                         % (self.sender, self.recipient),
                         label,
                         'Should construct label when it does not exist')

        # more than one person in to list
        self.email.cerp.content.to_list.append(self.other_recipient)
        label = self.email.email_label()
        self.assertEqual('Email from %s to %s et al. Interesting Subject'
                         % (self.sender, self.recipient),
                         label,
                         'only show first to email address when there are more than one')

        # no subject
        self.email.cerp.content.subject_list = []
        self.assertEqual('Email from %s to %s et al.'
                         % (self.sender, self.recipient),
                         self.email.email_label(),
                         'Display message without subject when no subject is present')

        # has a date
        date_header = cerp.Header()
        date_header.name = 'Date'
        date_header.value = 'Friday 13 200 13:00'
        self.email.cerp.content.headers.append(date_header)
        label = self.email.email_label()
        self.assertEqual('Email from %s to %s et al. on Friday 13 200 13:00'
                         % (self.sender, self.recipient),
                         label,
                         'should include the Date header value when present')

        # object label already exists
        self.email.label = "label we want to keep"
        label = self.email.email_label()
        self.assertEqual(self.email.label, label, 'label should be preserved when it exists')

    def test_index_data(self):
        # NOTE: logic for creating the label is in the label test

        # test to make sure label exists in index data
        data = self.email.index_data()
        self.assertIn('label', data)
        # mime_data does not exist, so no checksum should be indexed
        self.assertNotIn('content_md5', data,
                         'content_md5 should not be set when mime data does not exist')

        # patch mime data to test exists / checksum
        with patch.object(self.email, 'mime_data', Mock()) as mock_mime:
            mock_mime.exists = True
            mock_mime.checksum = 'test checksum value'

            data = self.email.index_data()
            self.assertEqual(self.email.mime_data.checksum, data['content_md5'])

    @patch('keep.arrangement.models.solr_interface', spec=sunburnt.SolrInterface)
    def test_by_checksum(self, mocksolr):
        # no match
        self.assertRaises(ObjectDoesNotExist, EmailMessage.by_checksum,
                          42)
        solr = mocksolr.return_value
        solr.query.assert_called_with(content_md5=42,
                                      content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL)
        solr.query.return_value.field_limit.assert_called_with('pid')

        # too many matches
        solr.query.return_value.field_limit.return_value = [{'pid': 'pid:1'},
                                                            {'pid': 'pid:2'}]
        self.assertRaises(MultipleObjectsReturned, EmailMessage.by_checksum,
                          42)

        # one match
        solr.query.return_value.field_limit.return_value = [{'pid': 'pid:1'}]
        em = EmailMessage.by_checksum(42)
        self.assertIsInstance(em, EmailMessage)

        # custom repo object
        mockrepo = Mock()
        em = EmailMessage.by_checksum(42, mockrepo)
        mockrepo.get_object.assert_called_with('pid:1', type=EmailMessage)

    @patch('keep.arrangement.models.solr_interface', spec=sunburnt.SolrInterface)
    def test_by_message_id(self, mocksolr):
        # no match
        self.assertRaises(ObjectDoesNotExist, EmailMessage.by_message_id,
                          '<*****@*****.**>')
        solr = mocksolr.return_value
        # the message id is passed to solr as the arrangement_id field
        solr.query.assert_called_with(arrangement_id='<*****@*****.**>',
                                      content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL)
        solr.query.return_value.field_limit.assert_called_with('pid')
Beispiel #55
0
def _save_datastream_to_tempfile(datastream, tempdir, suffix):
    '''Write the content of ``datastream`` to a new temporary file in
    ``tempdir`` with the given ``suffix`` and return the file path.
    The temporary file is removed again if writing it fails.'''
    # mkstemp returns file descriptor and full path to the temp file
    tmp_fd, tmp_path = tempfile.mkstemp(dir=tempdir, suffix=suffix)
    try:
        try:
            destination = os.fdopen(tmp_fd, 'wb+')
        except Exception:
            # no file object took ownership of the descriptor; close it here
            os.close(tmp_fd)
            raise
        # NOTE: closing the file object also closes the underlying
        # descriptor; calling os.close(tmp_fd) afterwards would error.
        with destination:
            destination.write(datastream.content.read())
    except Exception:
        # the caller never learns the path on failure, so clean up here
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    return tmp_path


def check_wav_mp3_duration(obj_pid=None,
                           wav_file_path=None,
                           mp3_file_path=None):
    '''Compare the durations of a wav file with an mp3 file (presumably an mp3
    generated from the wav via :meth:`keep.audio.tasks.convert_wav_to_mp3` )
    to check that they are roughly the same length.

    :param obj_pid: The pid of a fedora object (expected to be an
        AudioObject) to get the wav and/or mp3 files from if they are
        not specified by path.
    :param wav_file_path: Path to the wav_file to use for comparison;
        if not specified, it will be downloaded from the object in
        Fedora.
    :param mp3_file_path: Path to the mp3_file to use for comparison;
        if not specified, it will be downloaded from the object in
        Fedora.  Note that this file must end in .mp3 for the duration
        to be calculated.

    :returns: True if the durations differ by less than
        ``settings.AUDIO_ALLOWED_DURATION_DISCREPANCY`` (1.0 when that
        setting is not configured); False otherwise, or when the mp3 has
        to come from the object and it has no compressed audio datastream
    :raises Exception: if no tag information can be read from the mp3 file
    '''
    # temporary files created by this call; passed-in files are never removed
    temp_files = []
    try:
        # Initialize connection to the repository
        repo = Repository()

        # Using the ingest directory to simplify cleanup in case extra
        # files hang around.
        tempdir = settings.INGEST_STAGING_TEMP_DIR
        if not os.path.exists(tempdir):
            os.makedirs(tempdir)

        # If no wav file is specified, download the master audio file
        # from the object in fedora.
        if wav_file_path is None:
            obj = repo.get_object(obj_pid, type=AudioObject)
            wav_path = _save_datastream_to_tempfile(obj.audio, tempdir,
                                                    '.wav')
            temp_files.append(wav_path)
        # Else use the passed in wav file.
        else:
            wav_path = wav_file_path

        # If no mp3 file is specified, download the compressed audio file
        # from the object in fedora.
        if mp3_file_path is None:
            obj = repo.get_object(obj_pid, type=AudioObject)
            # Without a compressed datastream there is nothing to compare
            # against, so the durations cannot match.
            if not obj.compressed_audio.exists:
                return False

            mp3_path = _save_datastream_to_tempfile(obj.compressed_audio,
                                                    tempdir, '.mp3')
            temp_files.append(mp3_path)
        # Else use the passed in mp3 file.
        else:
            mp3_path = mp3_file_path

        # Get information on the mp3 file using mutagen:
        mp3_tags = mutagen.File(mp3_path)
        if mp3_tags is None:
            raise Exception(
                'Could not get MP3 tag information for MP3 file %s' %
                mp3_path)

        mp3_length = mp3_tags.info.length
        wav_length = wav_duration(wav_path)

        # Verify the wav file and the mp3 file have the same duration,
        # within the configured allowed discrepancy
        # - use a default value so this doesn't fail when not configured
        allowed_discrepancy = getattr(settings,
                                      'AUDIO_ALLOWED_DURATION_DISCREPANCY',
                                      1.0)
        return math.fabs(mp3_length - wav_length) < allowed_discrepancy
    # Cleanup for everything; exceptions are passed up the chain.
    finally:
        for path in temp_files:
            if os.path.exists(path):
                os.remove(path)