Ejemplo n.º 1
    def archive_choices_by_user(self):
        '''Return archive choices restricted to what the current user may see.

        Runs a collection facet query on ``archive_id`` to find archives
        that have at least one collection, then maps each archive pid to
        its alias from ``settings.PID_ALIASES``.  Archives without a
        configured alias are skipped.

        :returns: list of ``(alias, ALIAS)`` tuples (the alias is used for
            both the submit value and the upper-cased display value),
            preceded by a blank ``('', '---')`` default option; when no
            user is set, the result of :func:`archive_alias_choices`
        '''
        # this method shouldn't be set if user isn't defined, but just in case
        if not self.user:
            return archive_alias_choices()

        # NOTE: should be possible to query for archives directly,
        # but filtering on audio items requires two levels of joins,
        # and it's unclear how that actually works

        # use collection facet query to get list of archives
        # NOTE(review): this statement previously ended in a line
        # continuation with nothing after it; if a chained call followed
        # facet_by() it is not recoverable from here -- confirm.
        q = CollectionObject.item_collection_query()
        q = q.facet_by('archive_id', sort='count', mincount=1)

        # - depending on permissions, restrict to collections with researcher audio
        # NOTE(review): the second half of this condition was missing; it is
        # restored to match the researcher-only check used in browse_archive.
        if not self.user.has_perm('collection.view_collection') and \
                self.user.has_perm('collection.view_researcher_collection'):
            q = q.join('collection_id', 'pid', researcher_access=True)
            q = q.join('collection_id', 'pid', has_access_copy=True)

        # user-viewable archive pids, in facet order
        facet_fields = q.execute().facet_counts.facet_fields
        archives = [pid for pid, _count in facet_fields['archive_id']]

        # we need pid aliases keyed on pid for lookup
        pid_aliases_by_pid = dict(
            (pid, alias) for alias, pid in settings.PID_ALIASES.iteritems())

        # blank option at the beginning (default)
        choices = [('', '---')]
        for archive_pid in archives:
            if archive_pid in pid_aliases_by_pid:
                alias = pid_aliases_by_pid[archive_pid]
                # use the alias for *both* display and submit value
                choices.append((alias, alias.upper()))

        return choices
Ejemplo n.º 2
    def search_info(self):
        '''Generate a dictionary of search field and terms in a format
        that can be displayed to a user on the search results page.

        Keys are the form field labels (falling back to the field name
        when no label is set).  Fields listed in
        ``display_output_fields``, empty values, and values equal to a
        field's initial value are left out.

        :returns: dictionary of label -> display value, or ``None`` when
            the form is not valid
        '''
        # don't do anything if the form isn't valid
        if not self.is_valid():
            return

        search_info = {}
        for field, val in self.cleaned_data.iteritems():
            if field in self.display_output_fields:
                # do not show display-formatting field values with search terms
                continue

            # use form display label when available;
            # if field label is not set, use field name as a fall-back
            key = self.fields[field].label
            if key is None:
                key = field

            # only non-empty search values are candidates for display
            if not val:
                continue

            # for collections get collection object info
            if field == 'collection':
                search_info[key] = CollectionObject.find_by_pid(val)
            elif field == 'access_code':  # for rights, numeric code + abbreviation
                search_info[key] = '%s - %s' % (
                    val, rights_access_terms_dict[val].abbreviation)
            elif field == "content_model":
                search_info[key] = dict(self.format_options)[val]
            elif field == "simpleCollection":
                search_info[key] = SimpleCollection.find_by_pid(val)
            elif val != self.fields[field].initial:  # ignore default values
                search_info[key] = val

        return search_info
Ejemplo n.º 3
def search(request):
    '''Search for :class:`~keep.collection.models.CollectionObject`
    instances using the terms submitted in ``request.GET`` and render
    the results page.

    When the form is valid, the template context gets ``search_info``
    (label -> term, for display) and ``results`` (the Solr response).
    When it is not valid, the bound form is passed along as
    ``collection_search`` so its errors can be displayed.
    '''
    form = CollectionSearch(request.GET, prefix='collection')
    context = {'search': form}
    if form.is_valid():
        # include all non-blank fields from the form as search terms;
        # compare to None / '' explicitly because we need to search by 0
        search_opts = dict((key, val)
                           for key, val in form.cleaned_data.iteritems()
                           if val is not None and val != '')
        # restrict to currently configured pidspace and collection content model
        search_opts.update({
            'pid': '%s:*' % settings.FEDORA_PIDSPACE,
            'content_model': CollectionObject.COLLECTION_CONTENT_MODEL,
        })

        # collect non-empty, non-default search terms to display to user on results page
        search_info = {}
        for field, val in form.cleaned_data.iteritems():
            key = form.fields[field].label  # use form display label
            if key is None:     # if field label is not set, use field name as a fall-back
                key = field

            # if search value is empty, there is nothing to clean or display
            if val is None or val == '':
                continue

            if hasattr(val, 'lstrip'):  # solr strings can't start with wildcards
                extra_solr_cleaned = val.lstrip('*?')
                if val != extra_solr_cleaned:
                    if not extra_solr_cleaned:
                        # nothing left after stripping: drop the term entirely
                        messages.info(request, 'Ignoring search term "%s": Text fields can\'t start with wildcards.' % (val,))
                        del search_opts[field]
                        continue
                    messages.info(request, 'Searching for "%s" instead of "%s": Text fields can\'t start with wildcards.' %
                                  (extra_solr_cleaned, val))
                    val = extra_solr_cleaned
                    search_opts[field] = val

            if field == 'archive_id':       # for archive, get  info
                search_info[key] = CollectionObject.find_by_pid(val)
            elif val != form.fields[field].initial:     # ignore default values
                search_info[key] = val
        context['search_info'] = search_info

        solr = solr_interface()
        solrquery = solr.query(**search_opts).sort_by('source_id')
        # TODO: eventually, we'll need proper pagination here;
        # for now, set a large max to return everything
        context['results'] = solrquery.paginate(start=0, rows=1000).execute()

    # if the form was not valid, set the current instance of the form
    # as the sidebar form instance to display the error
    else:
        context['collection_search'] = form

    # render search results page; if there was an error, results will be displayed as empty
    return TemplateResponse(request, 'collection/search.html', context)
Ejemplo n.º 4
def archive_alias_choices():
    '''Return form choices for every archive that has a pid alias.

    Each choice is ``(alias, ALIAS)``: the alias from
    ``settings.PID_ALIASES`` serves as the submit value and, upper-cased,
    as the display value.  Archives without a configured alias are left
    out.  A blank ``('', '---')`` option comes first as the default.
    '''
    # PID_ALIASES maps alias -> pid; invert it so we can look up by pid
    alias_for_pid = dict(
        (pid, alias) for alias, pid in settings.PID_ALIASES.iteritems())

    # blank option at the beginning (default)
    options = [('', '---')]
    for archive in CollectionObject.archives(format=dict):
        archive_pid = archive['pid']
        if archive_pid not in alias_for_pid:
            continue
        pid_alias = alias_for_pid[archive_pid]
        # use the alias for *both* display and submit value
        options.append((pid_alias, pid_alias.upper()))
    return options
Ejemplo n.º 5
def create_from_findingaid(request):
    '''Create a collection record from a finding aid (EAD), based on the
    archive and collection number posted via :class:`FindCollection`.

    If a matching collection is already indexed, redirect to its view
    page.  Otherwise look up the EAD, generate a new collection from it
    and redirect to the edit page for the new record.  On any failure
    (invalid form, no EAD, multiple EADs, repository error) a message is
    added and the user is redirected to the admin dashboard.
    '''
    form = FindCollection(request.POST)
    if not form.is_valid():
        messages.error(request, 'Form is not valid; please try again.')
    else:
        data = form.cleaned_data
        q = CollectionObject.item_collection_query()
        # submitted value is pid alias; lookup pid for solr query
        archive_id = settings.PID_ALIASES[data['archive']]
        # NOTE(review): the second filter argument was missing; restored as
        # the collection number (source_id), as used in browse_archive.
        q = q.query(archive_id=archive_id,
                    source_id=data['collection'])
        # if collection is found, redirect to collection view with message
        found = q.count()
        if found:
            messages.info(request, 'Found %d collection%s for %s %s.' %
                          (found, 's' if found != 1 else '',
                           data['archive'].upper(), data['collection']))
            return HttpResponseSeeOtherRedirect(reverse('collection:view',
                kwargs={'pid': q[0]['pid']}))

        else:
            # otherwise, create the new record and redirect to new
            # collection edit page
            repo = Repository(request=request)
            coll_id = data['collection']
            try:
                archive = repo.get_object(archive_id, type=CollectionObject)
                # NOTE(review): second argument was missing; the archive
                # title is assumed, mirroring the find_by_unitid call in
                # the management command's handle() -- confirm.
                fa = FindingAid.find_by_unitid(unicode(coll_id),
                                               archive.mods.content.title)
                coll = fa.generate_collection()
                coll.collection = archive
                # NOTE(review): save call restored; the success message,
                # the edit redirect (needs a pid) and the RequestFailed
                # handler below all imply the record is saved here.
                coll.save()
                messages.info(request, 'Added %s for collection %s: %s'
                              % (coll, coll_id, coll.mods.content.title))

                return HttpResponseSeeOtherRedirect(
                    reverse('collection:edit', kwargs={'pid': coll.pid}))

            except DoesNotExist:
                messages.error(request, 'No EAD found for %s in %s' %
                               (coll_id, data['archive'].upper()))
            except ReturnedMultiple:
                messages.error(request, 'Multiple EADs found for %s in %s' %
                               (coll_id, data['archive'].upper()))
            except RequestFailed as err:
                print(err)
                messages.error(request, 'Failed to save new collection')

    return HttpResponseSeeOtherRedirect(reverse('repo-admin:dashboard'))
Ejemplo n.º 6
    def handle(self, numbering_pid, *ids, **options):
        '''Create collection records from finding aids (EAD) for each of
        the given collection ids within one numbering scheme.

        :param numbering_pid: pid of the numbering scheme (archive) object
            the new collections belong to
        :param ids: collection numbers to look up and create
        :param options: command options; reads ``verbosity`` and ``dryrun``
        :raises CommandError: if the numbering scheme object does not exist
        '''
        verbosity = int(options['verbosity'])

        numbering = self.get_numbering(numbering_pid)
        if not numbering.exists:
            raise CommandError('Numbering scheme %s not found' %
                               (numbering_pid, ))
        numbering_title = numbering.mods.content.title

        created = 0
        errors = 0

        for unitid in ids:
            # check for existing collection before creating new
            existing_coll = list(
                CollectionObject.find_by_collection_number(unitid, numbering.pid))
            if existing_coll:
                print('Collection %s already exists as %s' %
                      (unitid, ', '.join(coll.pid for coll in existing_coll)))
                # NOTE(review): skipping existing collections is assumed
                # from the comment above; the statement was missing.
                continue

            coll = None
            try:
                fa = FindingAid.find_by_unitid(unitid, numbering_title)
                coll = fa.generate_collection()
                # new collection parent collection is the archive collection object
                coll.collection = numbering
                if not options['dryrun']:
                    # NOTE(review): body of this branch was missing; saving
                    # is assumed from the "Failed to save" handler below.
                    coll.save()
                if verbosity:
                    print('Added %s for collection %s: %s (from %s)' % (
                        coll, unitid, coll.mods.content.title, numbering_title))
                created += 1
            except DoesNotExist:
                print('No EAD found for id %s in %s' % (unitid, numbering_title))
                errors += 1
            except ReturnedMultiple:
                print('Multiple EADs found for id %s in %s' % (unitid,
                                                              numbering_title))
                errors += 1
            # NOTE(review): the except clause for this handler was missing;
            # RequestFailed is assumed, and counting it as an error is an
            # addition so the summary below reflects failed saves.
            except RequestFailed:
                if coll is not None:
                    print('Failed to save %s for collection %s: %s (from %s)' % (
                        coll, unitid, coll.mods.content.title, numbering_title))
                errors += 1

        if verbosity > 1:
            print('%d records created' % (created, ))
            print('%d records failed' % (errors, ))
Ejemplo n.º 7
    def library_choices_by_user(self):
        '''Return library (archive) choices restricted to what the current
        user may see.

        Runs a collection facet query on ``archive_id`` to find archives
        that have at least one collection, then looks up the title for
        each archive pid in Solr.

        :returns: list of ``(pid, title)`` tuples preceded by a blank
            ``('', '---')`` default option; when no user is set, the
            result of :func:`archive_choices`
        '''
        # this method shouldn't be set if user isn't defined, but just in case
        if not self.user:
            return archive_choices()

        # NOTE: should be possible to query for archives directly,
        # but filtering on audio items requires two levels of joins,
        # and it's unclear how that actually works

        # use collection facet query to get list of archives
        # NOTE(review): this statement previously ended in a line
        # continuation with nothing after it; if a chained call followed
        # facet_by() it is not recoverable from here -- confirm.
        q = CollectionObject.item_collection_query()
        q = q.facet_by('archive_id', sort='count', mincount=1)

        # - depending on permissions, restrict to collections with researcher content
        # NOTE(review): the second half of this condition was missing; it is
        # restored to match the researcher-only check used in browse_archive.
        if not self.user.has_perm('collection.view_collection') and \
                self.user.has_perm('collection.view_researcher_collection'):
            q = q.join('collection_id', 'pid', researcher_access=True)
            q = q.join('collection_id', 'pid', has_access_copy=True)

        facets = q.execute().facet_counts.facet_fields
        # facet values may be fedora URIs; reduce them to plain pids
        archive_pids = [pid.replace('info:fedora/', '')
                        for pid, _count in facets['archive_id']]

        # construct a boolean pid query to match any archive pids
        # in order to lookup titles and match them to pids
        solr = solr_interface()
        pid_q = solr.Q()
        for pid in archive_pids:
            pid_q |= solr.Q(pid=pid)
        # NOTE(review): a trailing chained call after field_limit() (e.g. a
        # sort) appears to have been lost; confirm the intended ordering.
        query = solr.query(pid_q).field_limit(['pid', 'title'])

        # blank option at the beginning (default)
        choices = [('', '---')]
        # ignore any spurious results that don't have titles (bad data in prod?)
        choices.extend((a['pid'], a['title']) for a in query if 'title' in a)
        return choices
Ejemplo n.º 8
    def decompress(self, pid):
        '''Break the single field value (a collection pid) into the two
        values needed by the multi-value field.

        :param pid: collection pid, or an empty value
        :returns: ``[pid, label]`` when a pid is set, where label is the
            collection title (prefixed by the source id when available)
            or a "title not found" placeholder if the collection can't be
            found; ``[None, None]`` when no pid is set
        '''
        if not pid:
            return [None, None]

        # main (hidden) value is collection id; if set, get collection
        # information to display as pre-set value in the visible field
        coll = CollectionObject.find_by_pid(pid)
        if coll:
            # if source id is available, include in label
            if 'source_id' in coll:
                label = '%(source_id)s %(title)s' % coll
            else:
                label = coll['title']
        else:
            # fallback - should only happen if collection is not
            # indexed or pid is invalid
            logger.error('No collection information found for %s' % pid)
            label = '%s (title not found)' % pid

        return [pid, label]
Ejemplo n.º 9
    def index_data(self):
        '''Extend the inherited ``index_data`` method to include
        additional fields specific to Keep Audio objects.

        :returns: dictionary of index data: the inherited fields plus
            collection, identifier, rights, datastream availability,
            duration, date and checksum information where available
        '''
        # NOTE: we don't want to rely on other objects being indexed in Solr,
        # so index data should not use Solr to find any related object info

        # FIXME: is it worth splitting out descriptive index data here?
        data = super(AudioObject, self).index_data()
        data['object_type'] = 'audio'
        if self.collection and self.collection.exists:

            # collection_source_id  (0 is an allowable id, so check not None)
            if self.collection.mods.content.source_id is not None:
                data['collection_source_id'] = \
                    self.collection.mods.content.source_id

            # FIXME: previously indexing URI; is this needed for any reason or can we
            # use pid?  (needs to match collection index pid field for solr join)
            # data['collection_id'] = self.collection.uri
            data['collection_id'] = self.collection.pid
            try:
                # pull parent & archive collection objects directly from fedora
                parent = CollectionObject(self.api, self.collection.uri)
                data['collection_label'] = parent.label
                # NB: as of 2011-08-23, eulindexer doesn't support automatic
                # reindexing of audio objects when their collection changes.
                # as a result, archive_id and archive_label may be stale.
                # disable indexing them until eulindexer supports those
                # chained updates.
                #data['archive_id'] = parent.collection_id
                #archive = CollectionObject(self.api, parent.collection_id)
                #data['archive_label'] = archive.label
            except RequestFailed as rf:
                # NOTE(review): the logging call was missing; error level
                # is assumed -- confirm.
                logger.error(
                    'Error accessing collection or archive object in Fedora: %s'
                    % rf)

        # include resolvable ARK if available
        if self.mods.content.ark_uri:
            data['ark_uri'] = self.mods.content.ark_uri

        # old identifiers from previous digital masters
        dm1_ids = []
        if self.mods.content.dm1_id:
            dm1_ids.append(self.mods.content.dm1_id)
        if self.mods.content.dm1_other_id:
            dm1_ids.append(self.mods.content.dm1_other_id)
        if dm1_ids:
            data['dm1_id'] = dm1_ids

        # digitization purpose, if not empty
        if self.digitaltech.content.digitization_purpose_list:
            # convert nodelist to a normal list that can be serialized as json
            data['digitization_purpose'] = list(
                self.digitaltech.content.digitization_purpose_list)

        # related files
        if self.sourcetech.content.related_files_list:
            data['related_files'] = list(
                self.sourcetech.content.related_files_list)

        # part note
        if self.mods.content.part_note and self.mods.content.part_note.text:
            data['part'] = self.mods.content.part_note.text

        # sublocation
        if self.sourcetech.content.sublocation:
            data['sublocation'] = self.sourcetech.content.sublocation

        # rights access status code
        if self.rights.content.access_status:
            data['access_code'] = self.rights.content.access_status.code
        # copyright date from rights metadata
        if self.rights.content.copyright_date:
            data['copyright_date'] = self.rights.content.copyright_date
        # ip note from rights metadata
        if self.rights.content.ip_note:
            data['ip_note'] = self.rights.content.ip_note

        # boolean values that should always be available
        data.update({
            # should this item be accessible to researchers?
            'researcher_access':
            bool(self.researcher_access),  # if None, we want False
            # flags to indicate which datastreams are available
            'has_access_copy': self.compressed_audio.exists,
            'has_original': self.audio.exists,
        })

        if self.compressed_audio.exists:
            data.update({
                'access_copy_size': self.compressed_audio.size,
                'access_copy_mimetype': self.compressed_audio.mimetype,
            })
        if self.digitaltech.content.duration:
            data['duration'] = self.digitaltech.content.duration

        if self.mods.content.origin_info and \
           self.mods.content.origin_info.issued \
                and not self.mods.content.origin_info.issued.is_empty():
            data['date_issued'] = [
                unicode(di) for di in self.mods.content.origin_info.issued]
        if self.mods.content.origin_info and \
           self.mods.content.origin_info.created \
                and not self.mods.content.origin_info.created.is_empty():
            data['date_created'] = [
                unicode(di) for di in self.mods.content.origin_info.created]

        if self.audio.exists:
            data['content_md5'] = self.audio.checksum

        return data
Ejemplo n.º 10
def browse_archive(request, archive):
    '''Browse a list of :class:`~keep.collection.models.CollectionObject`
    that belong to a specific archive.

    :param archive: pid alias for the archive, as configured in
        ``settings.PID_ALIASES``
    :raises Http404: if the alias is not configured or the archive
        object does not exist in the repository
    '''
    # if archive is set, lookup pid in settings.PID_ALIASES
    # then do a collection object query for all collections in that archive
    archive_pid = settings.PID_ALIASES.get(archive, None)
    # 404 for unknown archive pid alias
    if archive_pid is None:
        raise Http404
    # get archive object from fedora
    repo = Repository(request=request)
    archive_obj = repo.get_object(pid=archive_pid, type=CollectionObject)
    if not archive_obj.exists:
        raise Http404

    q = CollectionObject.item_collection_query()
    # restrict to collections in this archive, sort by collection number
    # FIXME: should this be pid instead of uri?
    q = q.query(archive_id=archive_pid).sort_by('source_id')

    # user can only see researcher content (no general collection access)
    # NOTE(review): the second half of the original join condition was
    # missing; it is assumed to be the same researcher-only check that
    # guards the 403 below.
    researcher_only = \
        not request.user.has_perm('collection.view_collection') and \
        request.user.has_perm('collection.view_researcher_collection')

    # - depending on permissions, restrict to collections with researcher audio
    if researcher_only:
        q = q.join('collection_id', 'pid', researcher_access=True)
        q = q.join('collection_id', 'pid', has_access_copy=True)

    logger.debug('Solr query for collections in %s: %s' %
                 (archive, unicode(q.query_obj)))

    # if no collections are found with current restraints and user
    # only has view_researcher_collection, forbid access to this page
    if researcher_only and q.count() == 0:
        return prompt_login_or_403(request)

    # if a collection number is specified in url params, filter query
    collection_filter = None
    if request.GET.get('collection', None):
        collection_filter = request.GET['collection']
        q = q.query(source_id=collection_filter)

    # paginate the solr result set
    paginator = Paginator(q, 30)
    try:
        page = int(request.GET.get('page', '1'))
    except ValueError:
        # non-numeric page parameter: fall back to the first page
        page = 1
    try:
        collections = paginator.page(page)
    except (EmptyPage, InvalidPage):
        # out-of-range page: show the last page instead
        collections = paginator.page(paginator.num_pages)

    # url parameters for pagination links
    url_params = request.GET.copy()
    if 'page' in url_params:
        del url_params['page']

    # there are currently two dates in the index; for display,
    # we want single date or date range only (not fedora timestamp)
    date_re = re.compile(r'\d{4}(-\d{4})?$')
    for c in collections.object_list:
        c['collection_dates'] = [d for d in c['date'] if date_re.match(d)]

    return TemplateResponse(request, 'collection/browse.html',
        {'archive': archive_obj, 'collections': collections,
         'url_params': urlencode(url_params),
         'collection_filter': collection_filter,
         'find_collection': FindCollection(user=request.user)})
Ejemplo n.º 11
def list_archives(request, archive=None):
    '''List all top-level archive collections, with the total count of
    :class:`~keep.collection.models.CollectionObject` in each archive.

    If ``archive`` and ``collection`` are both present in the GET
    parameters, first search for that collection: redirect to the
    collection view for a single match, to a filtered archive browse
    for several matches, and otherwise warn and fall through to the
    archive list.

    .. Note::

       Archives must be configured in **PID_ALIASES** in Django settings
       in order to be listed here.

    .. Note::

       Within the code, top-level collections are referred to as "archives",
       but externally for users they should always be labeled as "Libraries."

    '''

    # if params are set, search for collection
    if 'archive' in request.GET and 'collection' in request.GET:
        form = FindCollection(request.GET, user=request.user)
        if form.is_valid():
            data = form.cleaned_data
            q = CollectionObject.item_collection_query()
            # submitted value is pid alias; lookup pid for solr query
            archive_id = settings.PID_ALIASES[data['archive']]
            # NOTE(review): the second filter argument was missing; restored
            # as the collection number (source_id), as in browse_archive.
            q = q.query(archive_id=archive_id,
                        source_id=data['collection'])
            found = q.count()
            # if exactly one result is found, redirect to the collection view
            if found == 1:
                # give user some context for the redirect
                messages.info(request, 'One collection found for %s %s.' %
                              (data['archive'].upper(), data['collection']))
                return HttpResponseSeeOtherRedirect(reverse('collection:view',
                    kwargs={'pid': q[0]['pid']}))

            # otherwise, if multiple, redirect to a filtered view of the archive browse
            elif found:
                messages.info(request, '%d collections found for %s %s.' %
                    (found, data['archive'].upper(), data['collection']))
                # NOTE(review): the URL name was lost with the missing line;
                # 'collection:browse-archive' is assumed -- confirm against
                # the collection urlconf.
                return HttpResponseSeeOtherRedirect('%s?%s' % (
                    reverse('collection:browse-archive',
                            kwargs={'archive': data['archive']}),
                    urlencode({'collection': data['collection']})))

            # if no matches, warn and return to archive display
            else:
                messages.warning(request, 'No collections found for %s %s.' %
                              (data['archive'].upper(), data['collection']))

        # values submitted but form not valid
        else:
            # TODO: better error message?
            messages.warning(request, 'Collection search input was not valid; please try again.')

    # NOTE(review): this statement previously ended in a line continuation
    # with nothing after it; if a chained call followed facet_by() it is
    # not recoverable from here -- confirm.
    q = CollectionObject.item_collection_query()
    q = q.facet_by('archive_id', sort='count', mincount=1)

    # - depending on permissions, restrict to collections with researcher audio
    # NOTE(review): the second half of this condition was missing; it is
    # restored to match the researcher-only check used in browse_archive.
    if not request.user.has_perm('collection.view_collection') and \
            request.user.has_perm('collection.view_researcher_collection'):
        q = q.join('collection_id', 'pid', researcher_access=True)
        q = q.join('collection_id', 'pid', has_access_copy=True)

    facets = q.execute().facet_counts.facet_fields

    solr = solr_interface()
    archive_info = dict([(pid.replace('info:fedora/', ''), {'count': count})
                        for pid, count in facets['archive_id']])

    # construct a boolean pid query to match any archive pids
    # in order to lookup titles and match them to pids
    pid_q = solr.Q()
    for pid in archive_info:
        pid_q |= solr.Q(pid=pid)
    # NOTE(review): a trailing chained call after field_limit() appears to
    # have been lost; ordering is left to the template (see below).
    query = solr.query(pid_q).field_limit(['pid', 'title'])

    # pid aliases are keyed on the alias, but we need to look up by pid
    pid_aliases_by_pid = dict([(v, k) for k, v in settings.PID_ALIASES.iteritems()])

    # add solr information and pid aliases to info dictionary
    for result in query:
        pid = result['pid']
        if pid not in archive_info:
            continue
        # duplicate to make list of dict available to template for dictsort
        archive_info[pid]['pid'] = result['pid']
        archive_info[pid]['title'] = result['title']
        alias = pid_aliases_by_pid.get(pid, None)
        archive_info[pid]['alias'] = alias
        if alias is None:
            logger.warning('No pid alias found for archive %(pid)s (%(title)s)'
                           % result)

    # prune any referenced archives that aren't actually indexed in solr
    # (should only happen in dev/qa), or that have no configured alias
    archives = [info for info in archive_info.values()
                if 'title' in info and info['alias'] is not None]

    # NOTE: sending list of values (dictionaries) to allow sorting in template

    return TemplateResponse(request, 'collection/archives.html',
        {'archives': archives, 'find_collection': FindCollection(user=request.user)})
Ejemplo n.º 12
    def index_data(self):
        '''Extend the inherited ``index_data`` method to include
        additional fields specific to Keep Video objects.

        :returns: dictionary of index data: the inherited fields plus
            collection, identifier, rights, datastream availability,
            duration, date and master format/size information where
            available
        '''
        # NOTE: we don't want to rely on other objects being indexed in Solr,
        # so index data should not use Solr to find any related object info

        data = super(Video, self).index_data()
        data['object_type'] = 'video'
        if self.collection and self.collection.exists:

            # collection_source_id  (0 is an allowable id, so check not None)
            if self.collection.mods.content.source_id is not None:
                data['collection_source_id'] = \
                    self.collection.mods.content.source_id
            data['collection_id'] = self.collection.pid
            try:
                # pull parent & archive collection objects directly from fedora
                parent = CollectionObject(self.api, self.collection.uri)
                data['collection_label'] = parent.label
            except RequestFailed as rf:
                # NOTE(review): the logging call was missing; error level
                # is assumed -- confirm.
                logger.error(
                    'Error accessing collection or archive object in Fedora: %s'
                    % rf)

        # include resolvable ARK if available
        if self.mods.content.ark_uri:
            data['ark_uri'] = self.mods.content.ark_uri

        # TODO: may have to add more sections here if more metadata is added
        # old identifiers from previous digital masters
        dm1_ids = []
        if self.mods.content.dm1_id:
            dm1_ids.append(self.mods.content.dm1_id)
        if self.mods.content.dm1_other_id:
            dm1_ids.append(self.mods.content.dm1_other_id)
        if dm1_ids:
            data['dm1_id'] = dm1_ids

        # digitization purpose, if not empty
        if self.digitaltech.content.digitization_purpose_list:
            # convert nodelist to a normal list that can be serialized as json
            data['digitization_purpose'] = list(
                self.digitaltech.content.digitization_purpose_list)

        # sublocation
        if self.sourcetech.content.sublocation:
            data['sublocation'] = self.sourcetech.content.sublocation

        # rights access status code
        if self.rights.content.access_status:
            data['access_code'] = self.rights.content.access_status.code
        # copyright date from rights metadata
        if self.rights.content.copyright_date:
            data['copyright_date'] = self.rights.content.copyright_date
        # ip note from rights metadata
        if self.rights.content.ip_note:
            data['ip_note'] = self.rights.content.ip_note

        # boolean values that should always be available
        data.update({
            # should this item be accessible to researchers?
            'researcher_access': bool(self.researcher_access),
            # flags to indicate which datastreams are available
            'has_access_copy': self.access_copy.exists,
            'has_original': self.content.exists,
        })

        if self.access_copy.exists:
            data.update({
                'access_copy_size': self.access_copy.info.size,
                'access_copy_mimetype': self.access_copy.mimetype,
            })

        if self.digitaltech.content.duration:
            data['duration'] = self.digitaltech.content.duration

        if self.mods.content.origin_info and \
           self.mods.content.origin_info.issued \
                and not self.mods.content.origin_info.issued.is_empty():
            data['date_issued'] = [
                unicode(di) for di in self.mods.content.origin_info.issued]
        if self.mods.content.origin_info and \
           self.mods.content.origin_info.created \
                and not self.mods.content.origin_info.created.is_empty():
            data['date_created'] = [
                unicode(di) for di in self.mods.content.origin_info.created]

        # store master video format and size
        if self.provenance.content.object and self.provenance.content.object.format:
            data['content_format'] = self.provenance.content.object.format.name
        data['content_size'] = self.content.size

        return data
Ejemplo n.º 13
def archive_choices():
    '''Return form choices for all archives as ``(pid, title)`` tuples,
    preceded by a blank ``('', '')`` default option.'''
    # blank option at the beginning (default)
    options = [('', '')]
    options.extend((archive['pid'], archive['title'])
                   for archive in CollectionObject.archives(format=dict))
    return options