def archive_choices_by_user(self):
    '''Build the list of archive choices available to the current user.

    Archives are taken from an ``archive_id`` facet on the item
    collection query.  For users who have
    ``collection.view_researcher_collection`` but not
    ``collection.view_collection``, the query is restricted to
    collections that have researcher-accessible items with an access
    copy.  Only archives with an alias configured in
    ``settings.PID_ALIASES`` are included; the alias is used as the
    submit value and the upper-cased alias as the display label.

    :returns: list of ``(value, label)`` tuples, starting with a blank
        ``('', '---')`` option
    '''
    # this method shouldn't be set if user isn't defined, but just in case
    if not self.user:
        return archive_alias_choices()

    # NOTE: should be possible to query for archives directly,
    # but filtering on audio items requires two levels of joins,
    # and it's unclear how that actually works

    # use collection facet query to get list of archives
    q = CollectionObject.item_collection_query()
    q = q.facet_by('archive_id', sort='count', mincount=1) \
         .paginate(rows=0)

    # - depending on permissions, restrict to collections with researcher audio
    if not self.user.has_perm('collection.view_collection') and \
            self.user.has_perm('collection.view_researcher_collection'):
        q = q.join('collection_id', 'pid', researcher_access=True)
        q = q.join('collection_id', 'pid', has_access_copy=True)

    archive_facets = q.execute().facet_counts.facet_fields['archive_id']

    # pid aliases are keyed on alias, but we need to look up by pid
    alias_for_pid = dict((pid, alias) for alias, pid
                         in settings.PID_ALIASES.iteritems())

    # blank option at the beginning (default)
    choices = [('', '---')]
    for facet_value, _count in archive_facets:
        # normalize to a bare pid before the alias lookup, the same way the
        # other archive facet handling in this file does; a value indexed
        # as a fedora URI would otherwise never match a configured alias
        pid = facet_value.replace('info:fedora/', '')
        if pid in alias_for_pid:
            alias = alias_for_pid[pid]
            # use the alias for *both* display and submit value
            choices.append((alias, alias.upper()))
    return choices
def search_info(self):
    '''Summarize the submitted search terms in a form suitable for
    display to a user on the search results page.

    :returns: dictionary of search terms keyed on the field's display
        label (or the field name, when no label is set); ``None`` if the
        form is not valid
    '''
    # nothing to report when the form isn't valid
    if not self.is_valid():
        return

    terms = {}
    for name, value in self.cleaned_data.iteritems():
        # display-formatting fields are not search terms; leave them out
        if name in self.display_output_fields:
            continue

        form_field = self.fields[name]
        # prefer the form display label; fall back to the field name
        label = form_field.label
        if label is None:
            label = name

        # empty search values are never reported
        if not value:
            continue

        if name == 'collection':
            # for collections, report collection object info
            terms[label] = CollectionObject.find_by_pid(value)
        elif name == 'access_code':
            # for rights, numeric code + abbreviation
            abbreviation = rights_access_terms_dict[value].abbreviation
            terms[label] = '%s - %s' % (value, abbreviation)
        elif name == "content_model":
            terms[label] = dict(self.format_options)[value]
        elif name == "simpleCollection":
            terms[label] = SimpleCollection.find_by_pid(value)
        elif value != form_field.initial:
            # anything else is reported as-is, unless it is the default
            terms[label] = value

    return terms
def search(request):
    '''Search for :class:`~keep.collection.models.CollectionObject`
    instances.

    Valid, non-blank form values are used as Solr search terms, and the
    non-default ones are collected for display on the results page.
    Leading wildcards are stripped from text values, with a message to
    the user.  When the form is not valid, it is passed to the template
    so the errors can be displayed and no search is run.
    '''
    form = CollectionSearch(request.GET, prefix='collection')
    context = {'search': form}

    if not form.is_valid():
        # set the current instance of the form as the sidebar form
        # instance to display the error; results are left out entirely
        context['collection_search'] = form
        return TemplateResponse(request, 'collection/search.html', context)

    # include all non-blank fields from the form as search terms
    # (explicit None / '' comparison because we need to search by 0)
    search_opts = {}
    for name, value in form.cleaned_data.iteritems():
        if value is not None and value != '':
            search_opts[name] = value

    # restrict to currently configured pidspace and collection content model
    search_opts['pid'] = '%s:*' % settings.FEDORA_PIDSPACE
    search_opts['content_model'] = CollectionObject.COLLECTION_CONTENT_MODEL

    # collect non-empty, non-default search terms to display to user
    # on results page
    search_info = {}
    for name, value in form.cleaned_data.iteritems():
        form_field = form.fields[name]
        # use form display label; fall back to the field name if not set
        label = form_field.label
        if label is None:
            label = name

        # blank values are neither searched on nor displayed
        if value is None or value == '':
            continue

        if hasattr(value, 'lstrip'):
            # solr strings can't start with wildcards
            stripped = value.lstrip('*?')
            if value != stripped:
                if not stripped:
                    # nothing left but wildcards: drop the term entirely
                    messages.info(request, 'Ignoring search term "%s": Text fields can\'t start with wildcards.' % (value,))
                    del search_opts[name]
                    continue

                messages.info(request, 'Searching for "%s" instead of "%s": Text fields can\'t start with wildcards.' % (stripped, value))
                value = stripped
                search_opts[name] = value

        if name == 'archive_id':
            # for archive, get info
            search_info[label] = CollectionObject.find_by_pid(value)
        elif value != form_field.initial:
            # ignore default values
            search_info[label] = value

    context['search_info'] = search_info

    solr = solr_interface()
    solrquery = solr.query(**search_opts).sort_by('source_id')
    # TODO: eventually, we'll need proper pagination here;
    # for now, set a large max to return everything
    context['results'] = solrquery.paginate(start=0, rows=1000).execute()

    # render search results page
    return TemplateResponse(request, 'collection/search.html', context)
def archive_alias_choices():
    '''Build archive choices based on the aliases configured in
    ``settings.PID_ALIASES``.

    Archives returned by ``CollectionObject.archives`` that have no
    configured alias are left out.

    :returns: list of ``(alias, ALIAS)`` tuples, starting with a blank
        ``('', '---')`` option
    '''
    # aliases are configured keyed on alias; invert for lookup by pid
    alias_for_pid = dict((pid, alias) for alias, pid
                         in settings.PID_ALIASES.iteritems())

    # blank option at the beginning (default)
    choices = [('', '---')]
    for archive in CollectionObject.archives(format=dict):
        pid = archive['pid']
        if pid not in alias_for_pid:
            continue
        alias = alias_for_pid[pid]
        # use the alias for *both* display and submit value
        choices.append((alias, alias.upper()))
    return choices
def create_from_findingaid(request):
    '''Find an existing collection, or create a new one from an EAD
    finding aid, based on the archive and collection number submitted
    via POST.

    - If the form is not valid, add an error message and redirect to the
      repository admin dashboard.
    - If one or more matching collections are already indexed, redirect
      to the view page for the first match.
    - Otherwise, generate a new collection from the finding aid, save
      it, and redirect to its edit page.  When the finding aid cannot be
      found (or is ambiguous), or the save fails, add an error message
      and redirect to the repository admin dashboard.
    '''
    dashboard_url = reverse('repo-admin:dashboard')

    form = FindCollection(request.POST)
    if not form.is_valid():
        messages.error(request, 'Form is not valid; please try again.')
        return HttpResponseSeeOtherRedirect(dashboard_url)

    data = form.cleaned_data
    coll_id = data['collection']
    archive_label = data['archive'].upper()

    q = CollectionObject.item_collection_query()
    # submitted value is pid alias; lookup pid for solr query
    archive_id = settings.PID_ALIASES[data['archive']]
    q = q.query(archive_id=archive_id, source_id=coll_id)

    # count once and reuse the result rather than re-counting for
    # the check, the message and the pluralization
    found = q.count()

    # if collection is found, redirect to collection view with message
    if found:
        messages.info(request, 'Found %d collection%s for %s %s.' %
                      (found, 's' if found != 1 else '',
                       archive_label, coll_id))
        return HttpResponseSeeOtherRedirect(
            reverse('collection:view', kwargs={'pid': q[0]['pid']}))

    # otherwise, create the new record and redirect to new
    # collection edit page
    repo = Repository(request=request)
    try:
        archive = repo.get_object(archive_id, type=CollectionObject)
        fa = FindingAid.find_by_unitid(unicode(coll_id),
                                       archive.mods.content.title)
        coll = fa.generate_collection()
        coll.collection = archive
        coll.save()
        messages.info(request, 'Added %s for collection %s: %s' %
                      (coll, coll_id, coll.mods.content.title))
        return HttpResponseSeeOtherRedirect(
            reverse('collection:edit', kwargs={'pid': coll.pid}))
    except DoesNotExist:
        messages.error(request, 'No EAD found for %s in %s' %
                       (coll_id, archive_label))
    except ReturnedMultiple:
        messages.error(request, 'Multiple EADs found for %s in %s' %
                       (coll_id, archive_label))
    except RequestFailed as err:
        # log the failure instead of printing it: stdout is not a
        # dependable destination for diagnostics in a web process
        logger.error('Failed to save new collection %s in %s: %s',
                     coll_id, archive_label, err)
        messages.error(request, 'Failed to save new collection')

    # every error case ends up back at the dashboard
    return HttpResponseSeeOtherRedirect(dashboard_url)
def handle(self, numbering_pid, *ids, **options):
    '''Create a collection from an EAD finding aid for each of the
    specified collection ids, within the specified numbering scheme.

    Ids that already exist as a collection in the numbering scheme are
    reported and skipped.  Ids with no EAD, or with more than one, are
    reported and counted as failures.  Any other error is re-raised.

    :param numbering_pid: identifier passed to ``self.get_numbering``
        to load the numbering scheme object
    :param ids: collection ids to be looked up and created
    :param options: command options; ``verbosity`` and ``dryrun`` are
        used here
    :raises CommandError: if the numbering scheme object does not exist
    '''
    verbosity = int(options['verbosity'])

    numbering = self.get_numbering(numbering_pid)
    if not numbering.exists:
        raise CommandError('Numbering scheme %s not found' % (numbering_pid, ))
    numbering_title = numbering.mods.content.title

    created = errors = 0
    for unitid in ids:
        # check for existing collection before creating new
        matches = list(
            CollectionObject.find_by_collection_number(unitid, numbering.pid))
        if matches:
            existing_pids = ', '.join([match.pid for match in matches])
            print('Collection %s already exists as %s' %
                  (unitid, existing_pids))
            continue

        coll = None
        try:
            fa = FindingAid.find_by_unitid(unitid, numbering_title)
            coll = fa.generate_collection()
            # new collection parent collection is the archive collection object
            coll.collection = numbering

            # NOTE(review): only the save is skipped for a dry run; the
            # "Added" message and the created count still apply -- confirm
            if not options['dryrun']:
                coll.save()

            if verbosity:
                print('Added %s for collection %s: %s (from %s)' % (
                    coll, unitid, coll.mods.content.title, numbering_title))
            created += 1
        except DoesNotExist:
            print('No EAD found for id %s in %s' % (unitid, numbering_title))
            errors += 1
        except ReturnedMultiple:
            print('Multiple EADs found for id %s in %s' %
                  (unitid, numbering_title))
            errors += 1
        except:
            # report which collection was in progress, then let the
            # error propagate
            if coll is not None:
                print('Failed to save %s for collection %s: %s (from %s)' % (
                    coll, unitid, coll.mods.content.title, numbering_title))
            raise

    # summary is only shown at higher verbosity
    if verbosity > 1:
        print('%d records created' % (created, ))
        print('%d records failed' % (errors, ))
def library_choices_by_user(self):
    '''Build the list of library (archive) choices available to the
    current user, using archive pid as the value and archive title as
    the label.

    Archives are taken from an ``archive_id`` facet on the item
    collection query.  For users who have
    ``collection.view_researcher_collection`` but not
    ``collection.view_collection``, the query is restricted to
    collections that have researcher-accessible items with an access
    copy.  Titles are then looked up in Solr by pid.

    :returns: list of ``(pid, title)`` tuples sorted by title, starting
        with a blank ``('', '---')`` option
    '''
    # this method shouldn't be set if user isn't defined, but just in case
    if not self.user:
        return archive_choices()

    # NOTE: should be possible to query for archives directly,
    # but filtering on audio items requires two levels of joins,
    # and it's unclear how that actually works

    # use collection facet query to get list of archives
    q = CollectionObject.item_collection_query()
    q = q.facet_by('archive_id', sort='count', mincount=1) \
         .paginate(rows=0)

    # - depending on permissions, restrict to collections with researcher content
    if not self.user.has_perm('collection.view_collection') and \
            self.user.has_perm('collection.view_researcher_collection'):
        q = q.join('collection_id', 'pid', researcher_access=True)
        q = q.join('collection_id', 'pid', has_access_copy=True)

    facets = q.execute().facet_counts.facet_fields
    # only the pids are needed here; the facet counts are not used
    archive_pids = set(pid.replace('info:fedora/', '')
                       for pid, _count in facets['archive_id'])

    # blank option at the beginning (default)
    choices = [('', '---')]

    # with no archive pids the boolean query below would be empty, and
    # the title lookup would no longer be limited to archives; there is
    # nothing to look up, so stop here
    if not archive_pids:
        return choices

    solr = solr_interface()
    # construct a boolean pid query to match any archive pids
    # in order to lookup titles and match them to pids
    pid_q = solr.Q()
    for pid in archive_pids:
        pid_q |= solr.Q(pid=pid)
    query = solr.query(pid_q) \
                .field_limit(['pid', 'title']) \
                .sort_by('title')

    # ignore any spurious results that don't have titles (bad data in prod?)
    choices.extend((a['pid'], a['title']) for a in query if 'title' in a)
    return choices
def decompress(self, pid):
    '''Break a single field value (collection pid) into the two values
    needed for the multi-value field: the pid itself and a display
    label for the collection.

    :param pid: collection pid; may be empty
    :returns: ``[pid, label]``, or ``[None, None]`` if no pid is set
    '''
    if not pid:
        return [None, None]

    # main (hidden) value is collection id; if set, get collection
    # information to display as pre-set value in the visible field
    coll = CollectionObject.find_by_pid(pid)
    if not coll:
        # fallback - should only happen if collection is not
        # indexed or pid is invalid
        logger.error('No collection information found for %s' % pid)
        return [pid, '%s (title not found)' % pid]

    # if source id is available, include in label
    if 'source_id' in coll:
        label = '%(source_id)s %(title)s' % coll
    else:
        label = coll['title']
    return [pid, label]
def index_data(self):
    '''Extend the default
    :meth:`eulfedora.models.DigitalObject.index_data` method to include
    additional fields specific to Keep Audio objects.

    :returns: the index data dictionary from the parent class, with
        collection, identifier, technical, rights, datastream and date
        fields added where values are available
    '''
    # NOTE: we don't want to rely on other objects being indexed in Solr,
    # so index data should not use Solr to find any related object info

    # FIXME: is it worth splitting out descriptive index data here?
    data = super(AudioObject, self).index_data()
    data['object_type'] = 'audio'

    collection = self.collection
    if collection and collection.exists:
        # collection_source_id (0 is an allowable id, so check not None)
        source_id = collection.mods.content.source_id
        if source_id is not None:
            data['collection_source_id'] = source_id

        # FIXME: previously indexing URI; is this needed for any reason or can we
        # use pid? (needs to match collection index pid field for solr join)
        # data['collection_id'] = self.collection.uri
        data['collection_id'] = collection.pid
        try:
            # pull parent & archive collection objects directly from fedora
            parent = CollectionObject(self.api, collection.uri)
            data['collection_label'] = parent.label
            # NB: as of 2011-08-23, eulindexer doesn't support automatic
            # reindexing of audio objects when their collection changes.
            # as a result, archive_id and archive_label may be stale.
            # disable indexing them until eulindexer supports those
            # chained updates.
            #data['archive_id'] = parent.collection_id
            #archive = CollectionObject(self.api, parent.collection_id)
            #data['archive_label'] = archive.label
        except RequestFailed as rf:
            logger.error(
                'Error accessing collection or archive object in Fedora: %s'
                % rf)

    mods = self.mods.content

    # include resolvable ARK if available
    if mods.ark_uri:
        data['ark_uri'] = mods.ark_uri

    # old identifiers from previous digital masters
    dm1_ids = [old_id for old_id in (mods.dm1_id, mods.dm1_other_id)
               if old_id]
    if dm1_ids:
        data['dm1_id'] = dm1_ids

    digitaltech = self.digitaltech.content
    # digitization purpose, if not empty
    if digitaltech.digitization_purpose_list:
        # convert nodelist to a normal list that can be serialized as json
        data['digitization_purpose'] = list(
            digitaltech.digitization_purpose_list)

    sourcetech = self.sourcetech.content
    # related files
    if sourcetech.related_files_list:
        data['related_files'] = list(sourcetech.related_files_list)

    # part note
    if mods.part_note and mods.part_note.text:
        data['part'] = mods.part_note.text

    # sublocation
    if sourcetech.sublocation:
        data['sublocation'] = sourcetech.sublocation

    rights = self.rights.content
    # rights access status code
    if rights.access_status:
        data['access_code'] = rights.access_status.code
    # copyright date from rights metadata
    if rights.copyright_date:
        data['copyright_date'] = rights.copyright_date
    # ip note from rights metadata
    if rights.ip_note:
        data['ip_note'] = rights.ip_note

    # boolean values that should always be available
    # - should this item be accessible to researchers?
    #   (if None, we want False)
    data['researcher_access'] = bool(self.researcher_access)
    # - flags to indicate which datastreams are available
    data['has_access_copy'] = self.compressed_audio.exists
    data['has_original'] = self.audio.exists

    if self.compressed_audio.exists:
        data['access_copy_size'] = self.compressed_audio.size
        data['access_copy_mimetype'] = self.compressed_audio.mimetype

    if digitaltech.duration:
        data['duration'] = digitaltech.duration

    # issued and created dates are handled identically: index the
    # dates as strings when there are any non-empty values
    for index_key, date_attr in (('date_issued', 'issued'),
                                 ('date_created', 'created')):
        origin_info = mods.origin_info
        if not origin_info:
            continue
        dates = getattr(origin_info, date_attr)
        if dates and not dates.is_empty():
            data[index_key] = [unicode(di) for di in dates]

    if self.audio.exists:
        data['content_md5'] = self.audio.checksum

    return data
def browse_archive(request, archive):
    '''Browse a list of
    :class:`~keep.collection.models.CollectionObject` that belong to a
    specific archive.

    The collection list is paginated (30 per page, ``page`` url
    parameter) and can be filtered by collection number with the
    ``collection`` url parameter.  Users who have
    ``collection.view_researcher_collection`` but not
    ``collection.view_collection`` only see collections with
    researcher-accessible items that have an access copy; if there are
    none, they are prompted to log in or denied access.

    :param archive: archive alias, as configured in
        ``settings.PID_ALIASES``
    :raises Http404: if the alias is not configured or the archive
        object does not exist
    '''
    # lookup pid in settings.PID_ALIASES; 404 for unknown archive pid alias
    archive_pid = settings.PID_ALIASES.get(archive, None)
    if archive_pid is None:
        raise Http404

    # get archive object from fedora
    repo = Repository(request=request)
    archive_obj = repo.get_object(pid=archive_pid, type=CollectionObject)
    if not archive_obj.exists:
        raise Http404

    q = CollectionObject.item_collection_query()
    # restrict to collections in this archive, sort by collection number
    # FIXME: should this be pid instead of uri?
    q = q.query(archive_id=archive_pid).sort_by('source_id')

    # the same permission test governs both the query restriction and
    # the access check below, so evaluate it once
    researcher_only = \
        not request.user.has_perm('collection.view_collection') and \
        request.user.has_perm('collection.view_researcher_collection')

    # - depending on permissions, restrict to collections with researcher audio
    if researcher_only:
        q = q.join('collection_id', 'pid', researcher_access=True)
        q = q.join('collection_id', 'pid', has_access_copy=True)

    logger.debug('Solr query for collections in %s: %s' %
                 (archive, unicode(q.query_obj)))

    # if no collections are found with current restraints and user
    # only has view_researcher_collection, forbid access to this page
    if researcher_only and q.count() == 0:
        return prompt_login_or_403(request)

    # if a collection number is specified in url params, filter query
    collection_filter = None
    if request.GET.get('collection', None):
        collection_filter = request.GET['collection']
        q = q.query(source_id=collection_filter)

    # paginate the solr result set
    paginator = Paginator(q, 30)
    try:
        page = int(request.GET.get('page', '1'))
    except ValueError:
        page = 1
    try:
        collections = paginator.page(page)
    except (EmptyPage, InvalidPage):
        # out-of-range page numbers get the last page
        collections = paginator.page(paginator.num_pages)

    # url parameters for pagination links
    url_params = request.GET.copy()
    if 'page' in url_params:
        del url_params['page']

    # there are currently two dates in the index; for display,
    # we want single date or date range only (not fedora timestamp)
    date_re = re.compile(r'\d{4}(-\d{4})?$')
    for c in collections.object_list:
        # a result without any date values should not break the page
        dates = c['date'] if 'date' in c else []
        c['collection_dates'] = [d for d in dates if date_re.match(d)]

    return TemplateResponse(request, 'collection/browse.html',
        {'archive': archive_obj, 'collections': collections,
         'url_params': urlencode(url_params),
         'collection_filter': collection_filter,
         'find_collection': FindCollection(user=request.user)})
def list_archives(request, archive=None):
    '''List all top-level archive collections, with the total count of
    :class:`~keep.collection.models.CollectionObject` in each archive.

    If both ``archive`` and ``collection`` are present in the url
    parameters, look for a matching collection first: a single match
    redirects to that collection, multiple matches redirect to a
    filtered browse of the archive, and no match (or invalid input)
    adds a warning and falls through to the archive list.

    .. Note::

       Archives must be configured in **PID_ALIASES** in Django settings
       in order to be listed here.

    .. Note::

       Within the code, top-level collections are referred to as
       "archives", but externally for users they should always be
       labeled as "Libraries."

    :param archive: not used by this view
    '''
    # if params are set, search for collection
    if 'archive' in request.GET and 'collection' in request.GET:
        form = FindCollection(request.GET, user=request.user)
        if form.is_valid():
            data = form.cleaned_data
            archive_label = data['archive'].upper()
            coll_q = CollectionObject.item_collection_query()
            # submitted value is pid alias; lookup pid for solr query
            archive_id = settings.PID_ALIASES[data['archive']]
            coll_q = coll_q.query(archive_id=archive_id,
                                  source_id=data['collection'])
            # count once and reuse the result for every branch below
            found = coll_q.count()

            # if exactly one result is found, redirect to the collection view
            if found == 1:
                # give user some context for the redirect
                messages.info(request, 'One collection found for %s %s.' %
                              (archive_label, data['collection']))
                return HttpResponseSeeOtherRedirect(
                    reverse('collection:view',
                            kwargs={'pid': coll_q[0]['pid']}))

            # otherwise, if multiple, redirect to a filtered view of the
            # archive browse
            elif found:
                messages.info(request, '%d collections found for %s %s.' %
                              (found, archive_label, data['collection']))
                return HttpResponseSeeOtherRedirect('%s?%s' %
                    (reverse('collection:browse-archive',
                             kwargs={'archive': data['archive']}),
                     urlencode({'collection': data['collection']})))

            # if no matches, warn and return to archive display
            else:
                messages.warning(request, 'No collections found for %s %s.' %
                                 (archive_label, data['collection']))

        # values submitted but form not valid
        else:
            # TODO: better error message?
            messages.warning(request, 'Collection search input was not valid; please try again.')

    q = CollectionObject.item_collection_query()
    q = q.facet_by('archive_id', sort='count', mincount=1) \
         .paginate(rows=0)

    # - depending on permissions, restrict to collections with researcher audio
    if not request.user.has_perm('collection.view_collection') and \
            request.user.has_perm('collection.view_researcher_collection'):
        q = q.join('collection_id', 'pid', researcher_access=True)
        q = q.join('collection_id', 'pid', has_access_copy=True)

    facets = q.execute().facet_counts.facet_fields
    archive_info = dict([(pid.replace('info:fedora/', ''), {'count': count})
                         for pid, count in facets['archive_id']])

    # nothing to look up when no archives were found; the boolean pid
    # query would be empty and every result would be discarded anyway
    if archive_info:
        solr = solr_interface()
        # construct a boolean pid query to match any archive pids
        # in order to lookup titles and match them to pids
        pid_q = solr.Q()
        for pid in archive_info:
            pid_q |= solr.Q(pid=pid)
        query = solr.query(pid_q) \
                    .field_limit(['pid', 'title']) \
                    .sort_by('title')

        # pid aliases are keyed on the alias, but we need to look up by pid
        pid_aliases_by_pid = dict([(v, k) for k, v
                                   in settings.PID_ALIASES.iteritems()])

        # add solr information and pid aliases to info dictionary
        for doc in query:
            pid = doc['pid']
            # skip unrelated results, and spurious results without a
            # title; an archive left without a title is pruned below
            if pid not in archive_info or 'title' not in doc:
                continue
            # duplicate to make list of dict available to template for dictsort
            info = archive_info[pid]
            info['pid'] = pid
            info['title'] = doc['title']
            alias = pid_aliases_by_pid.get(pid, None)
            info['alias'] = alias
            if alias is None:
                logger.warning(
                    'No pid alias found for archive %(pid)s (%(title)s)'
                    % doc)

    # leave out any referenced archives that aren't actually indexed in
    # solr (should only happen in dev/qa) or that have no pid alias
    archives = [info for info in archive_info.values()
                if 'title' in info and info['alias'] is not None]

    # NOTE: sending list of values (dictionaries) to allow sorting in template
    return TemplateResponse(request, 'collection/archives.html',
        {'archives': archives,
         'find_collection': FindCollection(user=request.user)})
def index_data(self):
    '''Extend the default
    :meth:`eulfedora.models.DigitalObject.index_data` method to include
    additional fields specific to Keep Video objects.

    :returns: the index data dictionary from the parent class, with
        collection, identifier, technical, rights, datastream, date and
        master file fields added where values are available
    '''
    # NOTE: we don't want to rely on other objects being indexed in Solr,
    # so index data should not use Solr to find any related object info
    data = super(Video, self).index_data()
    data['object_type'] = 'video'

    collection = self.collection
    if collection and collection.exists:
        # collection_source_id (0 is an allowable id, so check not None)
        source_id = collection.mods.content.source_id
        if source_id is not None:
            data['collection_source_id'] = source_id

        data['collection_id'] = collection.pid
        try:
            # pull parent & archive collection objects directly from fedora
            parent = CollectionObject(self.api, collection.uri)
            data['collection_label'] = parent.label
        except RequestFailed as rf:
            logger.error(
                'Error accessing collection or archive object in Fedora: %s'
                % rf)

    mods = self.mods.content

    # include resolvable ARK if available
    if mods.ark_uri:
        data['ark_uri'] = mods.ark_uri

    # TODO: may have to add to these sections if more metadata is added

    # old identifiers from previous digital masters
    dm1_ids = [old_id for old_id in (mods.dm1_id, mods.dm1_other_id)
               if old_id]
    if dm1_ids:
        data['dm1_id'] = dm1_ids

    digitaltech = self.digitaltech.content
    # digitization purpose, if not empty
    if digitaltech.digitization_purpose_list:
        # convert nodelist to a normal list that can be serialized as json
        data['digitization_purpose'] = list(
            digitaltech.digitization_purpose_list)

    # sublocation
    sourcetech = self.sourcetech.content
    if sourcetech.sublocation:
        data['sublocation'] = sourcetech.sublocation

    rights = self.rights.content
    # rights access status code
    if rights.access_status:
        data['access_code'] = rights.access_status.code
    # copyright date from rights metadata
    if rights.copyright_date:
        data['copyright_date'] = rights.copyright_date
    # ip note from rights metadata
    if rights.ip_note:
        data['ip_note'] = rights.ip_note

    # boolean values that should always be available
    # - should this item be accessible to researchers?
    data['researcher_access'] = bool(self.researcher_access)
    # - flags to indicate which datastreams are available
    data['has_access_copy'] = self.access_copy.exists
    data['has_original'] = self.content.exists

    if self.access_copy.exists:
        data['access_copy_size'] = self.access_copy.info.size
        data['access_copy_mimetype'] = self.access_copy.mimetype

    if digitaltech.duration:
        data['duration'] = digitaltech.duration

    # issued and created dates are handled identically: index the
    # dates as strings when there are any non-empty values
    for index_key, date_attr in (('date_issued', 'issued'),
                                 ('date_created', 'created')):
        origin_info = mods.origin_info
        if not origin_info:
            continue
        dates = getattr(origin_info, date_attr)
        if dates and not dates.is_empty():
            data[index_key] = [unicode(di) for di in dates]

    # store master video format and size
    premis_object = self.provenance.content.object
    if premis_object and premis_object.format:
        data['content_format'] = premis_object.format.name
    # NOTE(review): size is indexed whether or not a format name is
    # recorded -- confirm that is the intent
    data['content_size'] = self.content.size

    return data
def archive_choices():
    '''Build archive choices from ``CollectionObject.archives``, using
    archive pid as the value and archive title as the label.

    :returns: list of ``(pid, title)`` tuples, starting with a blank
        ``('', '')`` option
    '''
    # blank option at the beginning (default)
    choices = [('', '')]
    for archive in CollectionObject.archives(format=dict):
        choices.append((archive['pid'], archive['title']))
    return choices