Example #1
 def test_solr_interface(self, mocksunburnt, mockhttplib):
     # basic init with no options
     solr_interface()
     # httplib2.Http should be initialized with defaults (no args, no cert)
     mockhttplib.Http.assert_called_with(ca_certs=None)
     mocksunburnt.SolrInterface.assert_called_with(settings.SOLR_SERVER_URL,
         schemadoc=settings.SOLR_SCHEMA,
         http_connection=mockhttplib.Http.return_value)
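This test, together with the ca-cert and proxy tests below, pins down what solr_interface() is expected to do. As a point of reference, here is a minimal sketch of such a helper reconstructed from those assertions alone; this is an assumption for illustration, not the project's actual code:

import os
import urlparse

import httplib2
import sunburnt
from django.conf import settings

def solr_interface():
    # configure an http proxy from the environment, but only for
    # non-https solr urls (per the proxy test in Example #7 below)
    proxy_info = None
    http_proxy = os.environ.get('HTTP_PROXY')
    if http_proxy and not settings.SOLR_SERVER_URL.startswith('https:'):
        parsed = urlparse.urlparse(http_proxy)
        proxy_info = httplib2.ProxyInfo(
            proxy_type=httplib2.socks.PROXY_TYPE_HTTP_NO_TUNNEL,
            proxy_host=parsed.hostname,
            proxy_port=parsed.port)
    # pass the configured ca cert path, if any; None means httplib2 defaults
    http_opts = {'ca_certs': getattr(settings, 'SOLR_CA_CERT_PATH', None)}
    if proxy_info is not None:
        http_opts['proxy_info'] = proxy_info
    http = httplib2.Http(**http_opts)
    return sunburnt.SolrInterface(settings.SOLR_SERVER_URL,
                                  schemadoc=settings.SOLR_SCHEMA,
                                  http_connection=http)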
Example #2
 def test_solr_interface_cert(self, mocksunburnt, mockhttplib):
     # init with a ca cert
     settings.SOLR_CA_CERT_PATH = '/some/path/to/certs'
     solr_interface()
     # httplib should be initialized with ca_certs option
     mockhttplib.Http.assert_called_with(
         ca_certs=settings.SOLR_CA_CERT_PATH
         # proxy_info=mockhttplib.ProxyInfo.return_value
     )
Example #3
    def test_search_collections(self, mockpaginator, mocksolr_interface,
                                mocksearch_libs):
        solr = solr_interface()
        search_url = reverse('search:keyword')
        mocksolr = mocksolr_interface.return_value
        mocksolr.Q = MagicMock(solr.Q)

        mocksolr.query.return_value = mocksolr.query
        for method in [
                'query', 'facet_by', 'sort_by', 'field_limit', 'exclude',
                'filter'
        ]:
            getattr(mocksolr.query, method).return_value = mocksolr.query

        # create researcher IP for localhost so anonymous access will be
        # treated as anonymous researcher
        researchip = ResearcherIP(name='test client', ip_address='127.0.0.1')
        researchip.save()

        response = self.client.get(search_url, {'collection': '1000'})
        # check solr query args
        # - collection should trigger OR query against collection label and number fields
        mocksolr.Q.assert_any_call(collection_label='1000')
        mocksolr.Q.assert_any_call(collection_source_id='1000')
        # NOTE: not checking OR query directly because unclear how to replicate in mock

        self.assertContains(response,
            '<input class="form-control" id="id_collection" name="collection" placeholder="Search by collection name or number" type="text" value="%s">' % \
            '1000',
            html=True,
            msg_prefix='collection search value should be displayed on result page via form')

        researchip.delete()
Example #4
    def archives(format=None):
        """Find Archives objects, to which CollectionObjects belong.

        :returns: list of :class:`CollectionObject`
        :rtype: list
        """
        # NOTE: formerly called top-level collections or Repository /
        # Owning Repository; should now be called archive and labeled
        # as such anywhere user-facing

        # TODO: search logic very similar to item_collections and
        # subcollections methods; consider refactoring search logic
        # into a common search method.

        if CollectionObject._archives is None:
            # find all objects with cmodel collection-1.1 and no parents

            # search solr for collection objects with NO parent collection id
            solr = solr_interface()
            # NOTE: not filtering on pidspace, since top-level objects are loaded as fixtures
            # and may not match the configured pidspace in a dev environment
            solrquery = solr.query(content_model=CollectionObject.COLLECTION_CONTENT_MODEL)
            collections = solrquery.exclude(archive_id__any=True).sort_by('title_exact').execute()
            # store the solr response format
            CollectionObject._archives = collections

        if format == dict:
            return CollectionObject._archives

        # otherwise, initialize as instances of CollectionObject
        repo = Repository()
        return [repo.get_object(arch['pid'], type=CollectionObject)
                for arch in CollectionObject._archives]
Example #5
    def find_by_collection_number(num, parent=None):
        '''Find a CollectionObject in Fedora by collection number (or
        source id), optionally limited by parent collection (owning
        archive).

        :param num: collection number to search for (aka source id)
        :param parent: optional; archive that the collection must belong to
        :return: generator of any matching items, as instances of
            :class:`CollectionObject`
        '''
        solr = solr_interface()
        solrquery = solr.query(content_model=CollectionObject.COLLECTION_CONTENT_MODEL,
                               pid='%s:*' % settings.FEDORA_PIDSPACE,
                               source_id=int(num))
        # if parent is specified, restrict by archive id (parent should be a pid)
        if parent is not None:
            # remove prefix on parent
            prefix = 'info:fedora/'
            if parent.startswith(prefix):
                parent = parent[len(prefix):]
            solrquery = solrquery.query(archive_id=parent)
        # by default, only returns 10; get everything
        # - solr response is a list of dictionary with collection info
        # use dictsort in template for sorting where appropriate
        collections = solrquery.paginate(start=0, rows=1000).execute()

        # return a generator of matching items, as instances of CollectionObject
        repo = Repository()
        for coll in collections:
            yield repo.get_object(coll['pid'], type=CollectionObject)
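Since find_by_collection_number returns a generator, matches are fetched lazily as the caller iterates. A hypothetical caller (the collection number and parent pid here are invented for illustration):

matches = CollectionObject.find_by_collection_number(
    1000, parent='info:fedora/example:archive')
for coll in matches:
    print coll.pid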
Example #6
    def _duplicate_exists(self, cleaned_data):
        """Determine if saving this form would create a duplicate
        collection. Specifically, verify that there is no other collection
        with the same collection (archive) and source_id present in solr.
        """
        collection = cleaned_data.get('collection')
        source_id = cleaned_data.get('source_id')

        solr = solr_interface()
        query = solr.query(
                content_model=CollectionObject.COLLECTION_CONTENT_MODEL,
                source_id=source_id, archive_id=collection)
        response = query.execute()

        # if there are no matches then this is definitely not a duplicate
        if response.result.numFound == 0:
            return False

        if response.result.numFound > 1:
            # if there's already more than one match then this is definitely
            # a duplicate
            return True

        # otherwise there's exactly one. if it's this object then this *is*
        # the collection with that archive/id.
        return (response[0]['pid'] != self.object_instance.pid)
Example #7
    def test_solr_interface_proxy(self, mocksunburnt, mockhttplib):
        # init with an http proxy set in env
        os.environ['HTTP_PROXY'] = 'http://localhost:3128/'
        solr_interface()
        # proxy info should be configured & passed to httplib2
        mockhttplib.ProxyInfo.assert_called_with(
            proxy_type=mockhttplib.socks.PROXY_TYPE_HTTP_NO_TUNNEL,
            proxy_host='localhost', proxy_port=3128)
        mockhttplib.Http.assert_called_with(
            proxy_info=mockhttplib.ProxyInfo.return_value,
            ca_certs=settings.SOLR_CA_CERT_PATH)

        # when solr url is https, no proxy should be set
        mockhttplib.reset_mock()
        settings.SOLR_SERVER_URL = 'https://test.solr/'
        solr_interface()
        mockhttplib.ProxyInfo.assert_not_called()
        # no args except default cert path
        mockhttplib.Http.assert_called_with(ca_certs=settings.SOLR_CA_CERT_PATH)
Example #8
def search(request):
    '''Search for :class:`~keep.collection.models.CollectionObject`
    instances.
    '''
    form = CollectionSearch(request.GET, prefix='collection')
    context = {'search': form}
    if form.is_valid():
        # include all non-blank fields from the form as search terms
        search_opts = dict((key, val)
                           for key, val in form.cleaned_data.iteritems()
                           if val is not None and val != '')  # but need to search by 0
        # restrict to currently configured pidspace and collection content model
        search_opts.update({
            'pid': '%s:*' % settings.FEDORA_PIDSPACE,
            'content_model': CollectionObject.COLLECTION_CONTENT_MODEL,
            })

        # collect non-empty, non-default search terms to display to user on results page
        search_info = {}
        for field, val in form.cleaned_data.iteritems():
            key = form.fields[field].label  # use form display label
            if key is None:     # if field label is not set, use field name as a fall-back
                key = field

            if val is not None and val != '':     # if search value is not empty, selectively add it
                if hasattr(val, 'lstrip'):  # solr strings can't start with wildcards
                    extra_solr_cleaned = val.lstrip('*?')
                    if val != extra_solr_cleaned:
                        if not extra_solr_cleaned:
                            messages.info(request, 'Ignoring search term "%s": Text fields can\'t start with wildcards.' % (val,))
                            del search_opts[field]
                            continue
                        messages.info(request, 'Searching for "%s" instead of "%s": Text fields can\'t start with wildcards.' %
                                      (extra_solr_cleaned, val))
                        val = extra_solr_cleaned
                        search_opts[field] = val

                if field == 'archive_id':       # for archive, get collection info
                    search_info[key] = CollectionObject.find_by_pid(val)
                elif val != form.fields[field].initial:     # ignore default values
                    search_info[key] = val
        context['search_info'] = search_info

        solr = solr_interface()
        solrquery = solr.query(**search_opts).sort_by('source_id')
        # TODO: eventually, we'll need proper pagination here;
        # for now, set a large max to return everything
        context['results'] = solrquery.paginate(start=0, rows=1000).execute()

    # if the form was not valid, set the current instance of the form
    # as the sidebar form instance to display the error
    else:
        context['collection_search'] = form

    # render search results page; if there was an error, results will be displayed as empty
    return TemplateResponse(request, 'collection/search.html', context)
Example #9
    def find_file_object(self, file_path):
        '''Find a file object by checksum in fedora based on a file
        path.  Returns a file object if one matches the checksum for
        the file specified, or else None if no match is found.

        :returns: :class:`keep.arrangement.models.RushdieArrangementFile`
            or None
        '''
        file_md5 = md5sum(file_path)
        solr = solr_interface()
        q = solr.query(content_md5=file_md5).field_limit('pid')
        if len(q):
            return self.repo.get_object(q[0]['pid'], type=RushdieArrangementFile)
Example #10
    def item_collection_query():
        """Solr query to find all collection objects in the configured
        Fedora pidspace that can contain items. Currently this includes
        all collections that belong to an archive.

        :returns: unexecuted solr query, which callers may refine
            further before executing
        """

        # search solr for collection objects with NO parent collection id
        solr = solr_interface()
        return solr.query(content_model=CollectionObject.COLLECTION_CONTENT_MODEL,
                          archive_id__any=True)
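Because the query is returned unexecuted, callers can layer on sorting, facets, or pagination before running it (library_choices_by_user in Example #19 adds facets this way). A minimal sketch of direct usage, reusing the sort and pagination idioms from the other examples:

q = CollectionObject.item_collection_query()
collections = q.sort_by('title_exact').paginate(start=0, rows=1000).execute()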
Example #11
def collection_suggest(request):
    '''Suggest view for collections, for use with the `JQuery UI
    Autocomplete`_ widget.  Searches for collections on all of the
    terms passed in (as multiple keywords), similar to the way the
    combined search works.

    .. _JQuery UI Autocomplete: http://jqueryui.com/demos/autocomplete/

    :param request: the http request passed to the original view
        method (used to retrieve the search term)
    '''
    term = request.GET.get('term', '')

    suggestions = []

    if term:
        # If the search term doesn't end in space, add a wildcard to
        # the last word to allow for partial word matching.
        if term[-1] != ' ':
            term += '*'
        terms = search_terms(term)

        solr = solr_interface()
        # common query parameters and options
        base_query = solr.query() \
                    .filter(content_model=CollectionObject.COLLECTION_CONTENT_MODEL) \
                    .field_limit(['pid', 'source_id', 'title', 'archive_short_name',
                                  'creator', 'archive_id']) \
                    .sort_by('-score')

        q = base_query.query(terms)

        # NOTE: there seems to be a Lucene/Solr bug/quirk where adding
        # a wildcard at the end of a word causes Solr not to match the
        # exact word (even though docs indicate this should work).
        # As a work-around, if we added a * and got 0 results,
        # try the search again without the wildcard.
        if term[-1] == '*' and q.count() == 0:
            q = base_query.query(search_terms(term[:-1]))
        # exclude archival collections (top-level libraries)
        q = q.filter(archive_id__any=True)

        suggestions = [{'label': '%s %s' % (c.get('source_id', ''),
                                            c.get('title', '(no title)')),
                        'value': c['pid'],  # FIXME: do we need URI here?
                        'category': c.get('archive_short_name', ''),
                        'desc': c.get('creator', '')}
                       for c in q[:15]]

    return HttpResponse(json_serializer.encode(suggestions),
                         content_type='application/json')
Example #12
    def subcollections(self):
        """Find all sub-collections that are members of the current collection
        in the configured Fedora pidspace.

        :rtype: list of dict
        """
        solr = solr_interface()
        solrquery = solr.query(content_model=CollectionObject.COLLECTION_CONTENT_MODEL,
                               pid='%s:*' % settings.FEDORA_PIDSPACE,
                               archive_id=self.pid)
        # by default, only returns 10; get everything
        # - solr response is a list of dictionary with collection info
        # use dictsort in template for sorting where appropriate
        return solrquery.paginate(start=0, rows=1000).execute()
Example #13
    def find_by_pid(pid):
        'Find a collection by pid and return a dictionary with collection information.'
        # NOTE: this method added as a replacement for
        # get_cached_collection_dict that was used elsewhere
        # throughout the site (audio app, etc.)  It should probably be
        # consolidated with other find methods...

        if pid.startswith('info:fedora/'):  # allow passing in uri
            pid = pid[len('info:fedora/'):]
        solr = solr_interface()
        solrquery = solr.query(content_model=SimpleCollection.COLLECTION_CONTENT_MODEL,
                               pid=pid)
        result = solrquery.execute()
        if len(result) == 1:
            return result[0]
Example #14
    def disk_images(self):
        self.stderr.write('Disk images')
        ### disk images
        # representative sample of aff and ad1
        # DO NOT include anything in these collections:
        # Trethewey (ghsdj), Rushdie (94k9k), Mackey (g1btw),
        # Clifton (94kf4), and Grennan (9k0st)

        solr = solr_interface()
        repo = Repository()
        q = solr.query(content_model=DiskImage.DISKIMAGE_CONTENT_MODEL) \
                .exclude(collection_id=self.collections['trethewey']) \
                .exclude(collection_id=self.collections['rushdie']) \
                .exclude(collection_id=self.collections['mackey']) \
                .exclude(collection_id=self.collections['clifton']) \
                .exclude(collection_id=self.collections['grennan']) \
                .field_limit('pid')
        if self.verbosity >= self.v_normal:
            self.stderr.write(
                'Found %d disk images not in restricted collections' %
                q.count())

        # currently there is no way to filter on format or size in either
        # solr or fedora risearch
        # so, go through individually and group them by type,
        # then sort by size and pick the smallest ones
        diskimgs_by_type = defaultdict(list)
        for result in q:
            diskimg = repo.get_object(result['pid'], type=DiskImage)
            if not diskimg.exists:
                if self.verbosity >= self.v_normal:
                    self.stderr.write('Referenced disk image %s does not exist or is inaccessible' \
                        % result['pid'])
                continue

            fmt = diskimg.provenance.content.object.format.name
            diskimgs_by_type[fmt].append(diskimg)

        for fmt, diskimages in diskimgs_by_type.iteritems():
            if self.verbosity >= self.v_normal:
                self.stderr.write('Selecting %s disk images' % fmt)
            # sort on binary file size so we sync the smallest ones
            diskimages = sorted(diskimages,
                                key=lambda diskimg: diskimg.content.size)
            # use the first 10 of each type
            for d in diskimages[:10]:
                self.stdout.write(d.pid)
Example #15
    def simple_collections():
        """Find all simpleCollection objects in the configured Fedora
        pidspace that can contain items.

        :returns: list of dict
        :rtype: list
        """

        # search solr for simpleCollection objects
        solr = solr_interface()
        solrquery = solr.query(content_model=SimpleCollection.COLLECTION_CONTENT_MODEL,
                               type=REPO.SimpleCollection)

        # by default, only returns 10; get everything
        # - solr response is a list of dictionary with collection info
        # use dictsort and regroup in templates for sorting where appropriate
        return solrquery.paginate(start=0, rows=1000).execute()
Example #16
    def find_by_field(field, value, repo=None):
        '''
        Static method to find a single :class:`EmailMessage` by an indexed
        value.  Looks for the item in Solr and
        returns an :class:`EmailMessage` instance initialized
        from the repository if a single match is found for the
        requested field and value.

        Raises :class:`django.core.exceptions.MultipleObjectsReturned`
        if more than one match is found; raises
        :class:`django.core.exceptions.ObjectDoesNotExist` if no
        matches are found in the Solr index.

        :param field: solr field to search
        :param value: value to search on in the specified field

        :param repo: optional :class:`eulfedora.server.Repository`
            to use an existing connection with specific credentials

        :returns: :class:`EmailMessage`
        '''
        solr = solr_interface()
        search_terms = {
            field: value,
            'content_model': ArrangementObject.ARRANGEMENT_CONTENT_MODEL
        }
        q = solr.query(**search_terms).field_limit('pid')

        # check that we found one and only one
        found = len(q)
        # borrowing custom django exceptions for not found / too many
        # matches
        if found > 1:
            raise MultipleObjectsReturned('Found %d records with %s %s' % \
                                          (found, field, value))
        if not found:
            raise ObjectDoesNotExist('No record found with %s %s' %
                                     (field, value))

        if repo is None:
            repo = Repository()

        return repo.get_object(q[0]['pid'], type=EmailMessage)
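The contract above is strict: exactly one match, or an exception. A hypothetical caller handling both failure modes (the field name and value here are invented):

from django.core.exceptions import MultipleObjectsReturned, ObjectDoesNotExist

try:
    msg = EmailMessage.find_by_field('message_id', '<1234@example.com>')
except ObjectDoesNotExist:
    msg = None  # no such message indexed
except MultipleObjectsReturned:
    raise  # ambiguous index data; let the caller decide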
Example #17
    def by_arrangement_id(id, repo=None):
        '''
        Static method to find an :class:`ArrangementObject` by its
        local or arrangement id.  Looks for the item in Solr and
        returns an :class:`ArrangementObject` instance initialized
        from the repository if a single match is found for the
        requested id.

        Raises :class:`django.core.exceptions.MultipleObjectsReturned`
        if more than one match is found; raises
        :class:`django.core.exceptions.ObjectDoesNotExist` if no
        matches are found in the Solr index.

        :param id: arrangement id or local id

        :param repo: optional :class:`eulfedora.server.Repository`
            to use an existing connection with specific credentials

        :returns: :class:`ArrangementObject`
        '''
        solr = solr_interface()
        q = solr.query(arrangement_id=id,
                   content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL) \
                   .field_limit('pid')

        # check that we found one and only one
        found = len(q)
        # borrowing custom django exceptions for not found / too many
        # matches
        if found > 1:
            raise MultipleObjectsReturned('Found %d records with arrangement id %s' % \
                                          (found, id))
        if not found:
            raise ObjectDoesNotExist('No record found with arrangement id %s' %
                                     id)

        if repo is None:
            repo = Repository()

        return repo.get_object(q[0]['pid'], type=ArrangementObject)
Example #18
def view_item(request, pid):
    '''
    Display information about a single object.  Currently
    only supports :class:`eulcm.models.boda.EmailMessage`
    and :class:`eulcm.models.boda.Mailbox` objects.

    :param pid: The pid of the object to be displayed.
    '''

    repo = TypeInferringRepository(request=request)
    obj = repo.get_object(pid)
    context = {'obj': obj}
    if isinstance(obj, boda.EmailMessage):
        template_name = 'arrangement/email_view.html'
    elif isinstance(obj, boda.Mailbox):
        template_name = 'arrangement/mailbox_view.html'

        # use Solr to find paginated messages in this mailbox
        solr = solr_interface()
        q = solr.query(isPartOf=obj.uri)
        paginator = Paginator(q, 30)
        try:
            page = int(request.GET.get('page', '1'))
        except ValueError:
            page = 1
        try:
            results = paginator.page(page)
        except (EmptyPage, InvalidPage):
            results = paginator.page(paginator.num_pages)
        # calculate page links to show
        show_pages = pages_to_show(paginator, page)
        # add paginated messages to context
        context.update({
            'page': results,
            'show_pages': show_pages,
            'search_opts': request.GET.urlencode()
        })
    else:
        raise Http404

    return TemplateResponse(request, template_name, context)
Example #19
    def library_choices_by_user(self):
        # this method shouldn't be called if user isn't defined, but just in case
        if not self.user:
            return archive_choices()

        # NOTE: should be possible to query for archives directly,
        # but filtering on audio items requires two levels of joins,
        # and it's unclear how that actually works

        # use collection facet query to get list of archives
        q = CollectionObject.item_collection_query()
        q = q.facet_by('archive_id', sort='count', mincount=1) \
              .paginate(rows=0)

        # - depending on permissions, restrict to collections with researcher content
        if not self.user.has_perm('collection.view_collection') and \
               self.user.has_perm('collection.view_researcher_collection'):
            q = q.join('collection_id', 'pid', researcher_access=True)
            q = q.join('collection_id', 'pid', has_access_copy=True)

        facets = q.execute().facet_counts.facet_fields

        solr = solr_interface()
        archive_info = dict([(pid.replace('info:fedora/', ''), {'count': count})
                        for pid, count in facets['archive_id']])

        # construct a boolean pid query to match any archive pids
        # in order to lookup titles and match them to pids
        pid_q = solr.Q()
        for pid in archive_info.keys():
            pid_q |= solr.Q(pid=pid)
        query = solr.query(pid_q) \
                    .field_limit(['pid', 'title']) \
                    .sort_by('title')

        # ignore any spurious results that don't have titles (bad data in prod?)
        choices = [(a['pid'], a['title']) for a in query
                    if 'title' in a]
        choices.insert(0, ('', '---'))   # blank option at the beginning (default)
        return choices
Example #20
    def rushdie_files(self):
        self.stderr.write('Rushdie files')
        solr = solr_interface()

        ### individual rushdie files
        # select 100 individual rushdie files to simulate the way they
        # currently clutter up born-digital search in production
        q = solr.query(content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL,
                       collection_id=self.collections['rushdie']).field_limit('pid')
        self.stderr.write('Found %d Rushdie arrangement objects' % q.count())
        # over 6000 of these in production; pull a subset and randomize
        # to ensure diversity, get chunks from various points in the results
        pids = [r['pid'] for r in q[:100]]
        pids.extend([r['pid'] for r in q[1000:1100]])
        pids.extend([r['pid'] for r in q[2000:2100]])
        pids.extend([r['pid'] for r in q[3000:3100]])
        pids.extend([r['pid'] for r in q[4000:4100]])
        pids.extend([r['pid'] for r in q[5000:5100]])
        pids.extend([r['pid'] for r in q[6000:6100]])
        # then shuffle that and pick the first 100
        random.shuffle(pids)
        for p in pids[:100]:
            self.stdout.write(p)
Example #21
    def save(self, logMessage=None):
        # check for duplicate content before initial ingest
        if self._create and self.content_md5 is not None:
            solr = solr_interface()
            q = solr.query(content_md5=self.content_md5).field_limit(
                ['pid', 'content_model'])
            # if a duplicate is found, raise custom exception with info on the dupes
            if q.count():
                msg = 'Detected %s duplicate record%s' % \
                    (q.count(), 's' if q.count() != 1 else '')

                results = list(q)
                pids = [r['pid'] for r in results]
                # dictionary of pid : list of cmodels
                pid_cmodels = dict([(r['pid'], r['content_model'])
                                    for r in results])

                raise DuplicateContent(msg, pids, pid_cmodels)

        # update the ark label in pidman when there is a name conflict
        self.update_ark_label()

        return super(DigitalObject, self).save(logMessage)
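A sketch of how an ingest caller might surface the DuplicateContent error raised above; it assumes the exception stores its constructor arguments as err.pids and err.pid_cmodels, which is not shown in this example:

try:
    obj.save('initial ingest')
except DuplicateContent as err:
    # report the duplicate pids back to the user
    print 'ingest blocked: %s (%s)' % (err, ', '.join(err.pids))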
Example #22
    def test_search_bylibrary(self, mockpaginator, mocksolr_interface,
                              mocksearch_libs):
        solr = solr_interface()
        search_url = reverse('search:keyword')
        mocksolr = mocksolr_interface.return_value
        mocksolr.Q = MagicMock(solr.Q)

        mocksolr.query.return_value = mocksolr.query
        for method in [
                'query', 'facet_by', 'sort_by', 'field_limit', 'exclude',
                'filter', 'join'
        ]:
            getattr(mocksolr.query, method).return_value = mocksolr.query

        # create researcher IP for localhost so anonymous access will be
        # treated as anonymous researcher
        researchip = ResearcherIP(name='test client', ip_address='127.0.0.1')
        researchip.save()

        # NOTE: currently uses info:fedora/ pid format for select, so use that here
        libpid = 'info:fedora/%s' % settings.PID_ALIASES['marbl']
        marbl_name = 'Manuscript, Archives, and Rare Book Library'
        # set mock list of library choices to include our test value so form will be valid
        mocksearch_libs.return_value = [(libpid, marbl_name)]
        response = self.client.get(search_url, {'library': libpid})
        # check solr query args
        # - library should restrict results via a join against collection archive id
        mocksolr.query.join.assert_called_with('pid',
                                               'collection_id',
                                               archive_id=libpid)

        self.assertContains(
            response,
            '<option value="%s" selected="selected">%s</option>' %
            (libpid, marbl_name),
            html=True,
            msg_prefix='library filter should be selected on result page form')
Example #23
def keyword_search(request):
    '''Combined keyword search across all :mod:`keep` repository
    items.
    '''
    searchform = KeywordSearch(request.GET)
    missing_label = '[null]'

    ctx = {'form': searchform}
    if searchform.is_valid():
        search_terms = searchform.cleaned_data['keyword']

        solr = solr_interface()
        # start with a default query to add filters & search terms
        # *first* filter to restrict to content models user has permission to view
        # q = filter_by_perms(solr.query(), request.user)
        q = solr.query()

        # optional date filter for fixity check
        fixity_check_mindate = searchform.cleaned_data.get('fixity_check_mindate', None)
        if fixity_check_mindate:
            today = date.today()
            q = q.query(last_fixity_check__range=(fixity_check_mindate, today))

        # use solr grouping queries to cluster original and migrated objects
        # if they appear in the same search result set
        q = q.group_by('original_pid', limit=5, sort='created desc', format='simple')

        # separate out normal and fielded search terms in keyword search string
        # TODO: should this logic be shifted to form validation/cleaning?
        search_info = MultiValueDict()
        terms = []
        # add field-based search terms to query and search info for display
        for t in search_terms:
            field, val = t
            # add non-field terms to list of terms
            # - no field name
            if field is None:
                terms.append(val)
            # - unrecognized field name or incomplete term
            elif val is None or field not in searchform.allowed_fields:
                # just search on the text we were given
                if val is None:
                    term = '%s:' % field
                else:
                    if ' ' in val:  # quote multi-word values as exact phrases
                        val = '"%s"' % val
                    term = '%s:%s' % (field, val)
                terms.append(term)

            # field/value pair
            else:
                solr_field = searchform.allowed_fields[field]
                search_val = val
                # special case for searching for collection source id
                if field == 'coll' and search_val and search_val.isdigit():
                    solr_field = 'collection_source_id'
                # add wildcard to end of search dates
                # (indexed by YYYY-MM-DD; allow match on YYYY or YYYY-MM)
                if field == 'created':
                    search_val += '*'
                # add field/value search to the solr query
                q = q.query(**{solr_field: search_val})
                # add to search info for display to user
                field = 'collection' if field == 'coll' else field
                search_info.update({field: val})

        # search on all collected search terms
        q = q.query(*terms)
        # FIXME: there should be a way to exclude these by type
        # Exclude archival collection (Top-level library)
        for p in settings.PID_ALIASES.values():
            q = q.exclude(pid=p)

        # get a copy of current url options for pagination
        # and to generate links to remove active filters
        urlopts = request.GET.copy()

        # handle facets
        display_filters = []
        # - list of tuples: display name, link to remove the filter
        active_filters = dict((field, []) for field in
                              searchform.facet_field_names.iterkeys())
        # - dictionary of filters in use, for exclusion from displayed
        # facets

        # filter the solr search based on any facets in the request
        for filter_val, facet_field in searchform.facet_field_names.iteritems():
            # For multi-valued fields (author, subject), we could have multiple
            # filters on the same field; treat all facet fields as lists.
            for val in request.GET.getlist(filter_val):

                # ignore any facet if the value is not set
                if not val:
                    continue

                # special case: search for items without a field
                if val == missing_label:
                    q = q.exclude(**{'%s__any' % facet_field: True})

                else:
                    # filter the current solr query
                    q = q.filter(**{facet_field: val})

                # add to list of active filters
                active_filters[filter_val].append(val)

                # add to list for user display & removal
                # - copy the urlopts and remove only the current value
                unfacet_urlopts = urlopts.copy()
                val_list = unfacet_urlopts.getlist(filter_val)
                val_list.remove(val)
                unfacet_urlopts.setlist(filter_val, val_list)
                # tuple of filter display value, url to remove it
                # - add details to label when the value doesn't make it obvious
                if filter_val in ['added by', 'modified by']:
                    label = '%s %s' % (filter_val, val)
                elif filter_val == 'fixity_check':
                    label = 'fixity check: %s' % ('valid' if val == 'pass' else 'invalid')
                elif val == missing_label:
                    label = '%s: null' % filter_val
                elif filter_val == 'access status':
                    # use access status abbreviation instead of numeric code
                    label = rights_access_terms_dict[val].abbreviation
                else:
                    label = val

                display_filters.append((label,
                                        unfacet_urlopts.urlencode()))

        # Update solr query to return values & counts for the
        # configured facet fields
        q = q.facet_by(searchform.facet_field_names.values(),
                       mincount=1, limit=15, sort='count',
                       missing=True)
        # NOTE: missing true displays count for items without any value
        # for the facet field (e.g., no access code set)

        # if there are any *keyword* terms, sort by relevance and display score
        # (for fielded search terms, items will either match or not, so relevance
        # is not as useful)
        if terms:
            # NOTE: possibly a change in sunburnt?
            # including score now requires specifying *all* fields that
            # should be returned
            q = q.sort_by('-score').field_limit([
                # common item information
                "object_type", "content_model", "pid", "label", "title",
                "creator", "created", "last_modified", "added_by",
                # collection
                "archive_short_name", "hasMember",
                # item
                "collection_id",
                # audio
                "part", "collection_label", "duration", "has_access_copy",
                "access_copy_mimetype", "access_copy_size", "source_id",
                # arrangement/disk image
                "simpleCollection_label", "rights", "state",
                # migrated / original
                "original_pid", "isDerivationOf", "hasDerivation",
                # format and size, used for disk images display (at least)
                "content_size", "content_format"
                ],
                score=True)
            ctx['show_relevance'] = True
        # then sort by most recently created
        # (primary sort when no search terms, secondary otherwise)
        q = q.sort_by('-created')

        # list of currently known types for display in results
        # FIXME: are these used anywhere?
        known_object_types = ['audio', 'collection', 'born-digital']

        # paginate the solr result set
        paginator = Paginator(q, 30)
        try:
            page = int(request.GET.get('page', '1'))
        except ValueError:
            page = 1
        try:
            results = paginator.page(page)
        except (EmptyPage, InvalidPage):
            results = paginator.page(paginator.num_pages)
        # calculate page links to show
        show_pages = pages_to_show(paginator, page)

        # convert the facets from the solr result for display to user
        facets = SortedDict()
        facet_fields = results.object_list.facet_counts.facet_fields
        for display_name, field in searchform.facet_field_names.iteritems():
            # do not display coll facet because it is redundant with the collection facet
            if display_name in ['coll', 'fixity_check']:
                continue
            if field in facet_fields and facet_fields[field]:
                show_facets = []
                # skip any display facet values that are already in effect
                for val in facet_fields[field]:
                    try:
                        if val[0] not in active_filters[display_name]:
                            show_facets.append(val)
                    except TypeError:
                        # when solr missing=True is turned on,
                        # last result is a count of items with no value
                        # for this field
                        if val != 0 and field in searchform.show_missing_facets \
                          and missing_label not in active_filters[display_name]:
                            show_facets.append((missing_label, val))
                if show_facets:
                    facets[display_name] = show_facets

        ctx.update({
            'page': results,
            'show_pages': show_pages,
            # 'known_types': known_object_types,
            'search_opts': request.GET.urlencode(),
            'search_terms': terms,
            'search_info': search_info,
            'url_params': urlopts.urlencode(),
            'facets': facets,
            'active_filters': display_filters,
        })

    return TemplateResponse(request, 'repoadmin/results.html', ctx)
Example #24
    def handle(self, **options):
        verbosity = int(options['verbosity'])

        errors = 0

        solr = solr_interface()

        collections = open('collections.txt', 'r')
        gb = 1024*1024*1024
        with open('keep_collection_report.csv', 'wb') as csvfile:
            writer = csv.writer(csvfile, delimiter=',')
            writer.writerow([
                'title', 'collection_code', 'library_name', 'size',
                'object_count', 'dv_count', 'mov_count', 'mpg_count',
                'ad1_count', 'aff_count', 'dd_count', 'e01_count',
                'img_count', 'iso_count', 'tar_count', 'wav_count',
                'status_code_2', 'status_code_3', 'status_code_4',
                'status_code_5', 'status_code_10', 'status_code_11',
                'status_code_12', 'status_code_13'])
            for line in collections:
                print line.strip()
                solrquery = solr.query().filter(title=str(line).strip()).sort_by('-created')
                library_name = ''
                for doc in solrquery:
                    try:
                        library_name = doc['isMemberOfCollection']
                        library_name = library_name.split('/')[1]
                        library_name = library_name.split("'")[0]
                    except (KeyError, IndexError):
                        library_name = ''

                solrquery = solr.query().filter(collection_label=str(line).strip())
                object_count = 0
                size = 0
                dv_count = 0
                mov_count = 0
                mpg_count = 0
                ad1_count = 0
                aff_count = 0
                dd_count = 0
                e01_count = 0
                img_count = 0
                iso_count = 0
                tar_count = 0
                wav_count = 0
                status_code_2 = 0
                status_code_3 = 0
                status_code_4 = 0
                status_code_5 = 0
                status_code_10 = 0
                status_code_11 = 0
                status_code_12 = 0
                status_code_13 = 0
                title = ''
                collection_code = 0
                for doc in solrquery:
                    object_count += 1
                    try:
                        object_type = doc['object_type']
                    except KeyError:
                        object_type = ''

                    if object_type == 'audio':
                        wav_count = wav_count + 1
                        try:
                            size = int(doc['access_copy_size']) + size
                        except (KeyError, ValueError, TypeError):
                            pass

                    elif object_type == 'video':
                        try:
                            size = int(doc['content_size']) + size
                        except (KeyError, ValueError, TypeError):
                            pass

                    elif object_type == 'disk image':
                        try:
                            size = int(doc['content_size']) + size
                        except (KeyError, ValueError, TypeError):
                            pass

                    # collection code
                    try:
                        collection_code = doc['collection_source_id']
                    except KeyError:
                        collection_code = ''

                    # access code counting
                    try:
                        access_code = int(doc['access_code'])
                        if access_code == 2:
                            status_code_2 += 1
                        elif access_code == 3:
                            status_code_3 += 1
                        elif access_code == 4:
                            status_code_4 += 1
                        elif access_code == 5:
                            status_code_5 += 1
                        elif access_code == 10:
                            status_code_10 += 1
                        elif access_code == 11:
                            status_code_11 += 1
                        elif access_code == 12:
                            status_code_12 += 1
                        elif access_code == 13:
                            status_code_13 += 1
                    except (KeyError, ValueError, TypeError):
                        pass
                    # content format count
                    try:
                        content_format = doc['content_format']
                        if content_format == 'AD1':
                            ad1_count += 1
                        elif content_format == 'AFF':
                            aff_count += 1
                        elif content_format == 'DD':
                            dd_count += 1
                        elif content_format == 'E01':
                            e01_count += 1
                        elif content_format == 'IMG':
                            img_count += 1
                        elif content_format == 'ISO':
                            iso_count += 1
                        elif content_format == 'TAR':
                            tar_count += 1
                        elif content_format == 'DV':
                            dv_count += 1
                        elif content_format == 'MOV':
                            mov_count += 1
                        elif content_format == 'MPG':
                            mpg_count += 1
                    except KeyError:
                        pass
                                   
                size = float(size) / gb
                writer.writerow([
                    title, collection_code, library_name, size, object_count,
                    dv_count, mov_count, mpg_count, ad1_count, aff_count,
                    dd_count, e01_count, img_count, iso_count, tar_count,
                    wav_count, status_code_2, status_code_3, status_code_4,
                    status_code_5, status_code_10, status_code_11,
                    status_code_12, status_code_13])
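The long runs of manual counters and elif chains in handle() could be collapsed with collections.Counter; a sketch of equivalent tallies, assuming each solr result behaves like a dict as in the loop above:

from collections import Counter

format_counts = Counter()   # replaces ad1_count, aff_count, ...
status_counts = Counter()   # replaces status_code_2, status_code_3, ...
size = 0
object_count = 0
for doc in solrquery:
    object_count += 1
    object_type = doc.get('object_type', '')
    if object_type == 'audio':
        size += int(doc.get('access_copy_size', 0))
    elif object_type in ('video', 'disk image'):
        size += int(doc.get('content_size', 0))
    if 'access_code' in doc:
        status_counts[int(doc['access_code'])] += 1
    if 'content_format' in doc:
        format_counts[doc['content_format']] += 1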
Example #25
    def video(self):
        self.stderr.write('Video')
        ### video
        # need a representative sample of all mime types
        # representative sample of old dm and native Keep objects
        #   (dm carries an access status of 11)
        # representative sample of different access codes
        # 5-10 collections represented
        # about 40 objects total (can be smallest size objects)

        # NOTE: there is currently no easy way to ensure we have
        # a representative sample of all mimetypes (master mimetypes are
        # not indexed, and there is too much content, so it would be too
        # slow to look in fedora.  Hopefully the diversity of codes and
        # old dm content will provide sufficient representation.

        solr = solr_interface()
        # desired minimum number of collections
        # (minimum since more may be added in order to find
        # representative objects by status)
        num_collections = 5
        # desired number of objects
        desired_total = 40

        pids = []
        collections = set()

        # find all video, and sort smallest first
        all_video = solr.query(content_model=Video.VIDEO_CONTENT_MODEL) \
               .field_limit(['pid', 'collection_id']).sort_by('access_copy_size')
        # master size is not indexed, but hopefully access copy
        # can serve as a proxy
        total_pids = all_video.count()

        if self.verbosity >= self.v_normal:
            self.stderr.write('Found %d total video objects' % all_video.count())
        facet_q = all_video.facet_by('collection_id', sort='count', mincount=1) \
                           .facet_by('access_code', sort='count', mincount=1) \
                           .paginate(rows=0)
        facets = facet_q.execute().facet_counts.facet_fields

        # pick the requested number of collections with the most items
        top_collections = [pid for (pid, count) in facets['collection_id']][:num_collections]
        # restrict query to video in those collections
        collection_filter = solr.Q()
        for coll in top_collections:
            collection_filter |= solr.Q(collection_id=coll)
        q = all_video.filter(collection_filter)
        self.stderr.write('Found %d total video objects in %d largest collections' \
            % (q.count(), num_collections))

        # Nothing here ensures we get content from all of these
        # collections, but hopefully the diversity of status codes
        # will help provide a reasonable distribution.

        # figure out some representative percentage based on our desired total
        # - by far the most content is old dm (93%), so don't use that %
        # first facet is old dm (largest total); facet is label, count
        old_dm_code = facets['access_code'][0][0]
        old_dm_total = facets['access_code'][0][1]

        # get percentages based on the total *without* old dm
        for code, num in facets['access_code'][1:]:
            # determine number of pids to grab as a percentage
            # of half the desired number
            percent = float(num) / (total_pids - old_dm_total)
            # minimum of at least 1 per code
            num_pids = max(int(percent * (desired_total / 2)), 1)
            if self.verbosity >= self.v_normal:
                self.stderr.write('  Looking for %d pid(s) for access code %s' % \
                    (num_pids, code))
            # first try to find within the requested collections
            pids_by_code = q.filter(access_code=code)
            # if no pids are found for this code in our collections,
            # look for them elsewhere
            if not pids_by_code.count():
                pids_by_code = all_video.filter(access_code=code)
            for r in pids_by_code[:num_pids]:
                pids.append(r['pid'])
                collections.add(r['collection_id'])

        # other codes will provide slightly more than half,
        # because we are rounding up; get the rest of the
        # requested objects from old dm
        remainder = desired_total - len(pids)
        for r in q.filter(access_code=old_dm_code)[:remainder]:
            pids.append(r['pid'])
            collections.add(r['collection_id'])

        if self.verbosity >= self.v_normal:
            self.stderr.write('Selected %d pids from %d collections' % \
                    (len(pids), len(collections)))

        for p in pids:
            self.stdout.write(p)
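
To see what the proportional sampling above does, here is a worked sketch with invented facet counts (the real counts come from the access_code facet):

# invented (access_code, count) facets, largest -- old dm -- first
access_facets = [('11', 930), ('2', 40), ('5', 20), ('10', 10)]
desired_total = 40
total_pids = sum(count for _, count in access_facets)   # 1000
old_dm_total = access_facets[0][1]                      # 930

for code, num in access_facets[1:]:
    percent = float(num) / (total_pids - old_dm_total)
    num_pids = max(int(percent * (desired_total / 2)), 1)
    print '%s -> %d pid(s)' % (code, num_pids)
# prints 2 -> 11, 5 -> 5, 10 -> 2; the remaining 22 of 40 come from old dm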
Example No. 34
    def audio(self):
        self.stderr.write('Audio')
        ### audio
        # representative sample of all mime types
        # representative sample of old dm and native Keep objects
        #   (dm carries an access status of 11)
        # representative sample of different access codes
        # 10 collections represented
        #   (please include material from Dawson (94jz3))

        # NOTE: this is largely the same logic as for video

        solr = solr_interface()

        # desired number of collections
        # (could be adjusted some since more may be added in order to
        # find representative objects by status)
        num_collections = 10
        # desired number of objects
        desired_total = 100

        pids = []
        collections = set()

        # find all audio, and sort smallest first
        all_audio = solr.query(content_model=AudioObject.AUDIO_CONTENT_MODEL) \
               .field_limit(['pid', 'collection_id']).sort_by('access_copy_size')
        # master size is not indexed, but hopefully access copy
        # can serve as a proxy
        total_pids = all_audio.count()

        if self.verbosity >= self.v_normal:
            self.stderr.write('Found %d total audio objects' % all_audio.count())
        facet_q = all_audio.facet_by('collection_id', sort='count', mincount=1) \
                           .facet_by('access_code', sort='count', mincount=1) \
                           .paginate(rows=0)
        facets = facet_q.execute().facet_counts.facet_fields

        # pick the requested number of collections with the most items
        top_collections = [pid for (pid, count) in facets['collection_id']][:num_collections]
        # restrict query to audio in those collections
        # OR in the dawson collection
        # (dawson is *probably* included in those, but explicitly include
        # since it was requested)
        collection_filter = solr.Q(collection_id=self.collections['dawson'])
        for coll in top_collections:
            collection_filter |= solr.Q(collection_id=coll)
        q = all_audio.filter(collection_filter)
        self.stderr.write('Found %d total audio objects in %d largest collections (including dawson)' \
            % (q.count(), num_collections))

        # Nothing here ensures we get content from all of these
        # collections, but hopefully the diversity of status codes
        # will help provide a reasonable distribution.

        # calculate and find a representative percentage of items
        # for each status based on the desired total
        for code, num in facets['access_code']:
            # determine number of pids to grab as a percentage
            # of the desired number
            percent = float(num) / total_pids
            # minimum of at least 1 per code
            num_pids = max(int(percent * desired_total), 1)
            if self.verbosity >= self.v_normal:
                self.stderr.write('  Looking for %d pid(s) for access code %s' % \
                    (num_pids, code))
            # first try to find within the requested collections
            pids_by_code = q.filter(access_code=code)
            # if no pids are found for this code in our collections,
            # look for them elsewhere
            if not pids_by_code.count():
                pids_by_code = all_audio.filter(access_code=code)
            for r in pids_by_code[:num_pids]:
                pids.append(r['pid'])
                collections.add(r['collection_id'])

        if self.verbosity >= self.v_normal:
            self.stderr.write('Selected %d pids from %d collections' % \
                    (len(pids), len(collections)))

        for p in pids:
            self.stdout.write(p)
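
Both methods build their collection filter by OR-ing Q objects in a loop; the same thing can be written as a single reduce. A minimal sketch, assuming a sunburnt interface like the solr object above:

import operator

def any_collection(solr, collection_pids):
    # boolean OR of one Q per collection id, starting from an empty Q
    return reduce(operator.or_,
                  (solr.Q(collection_id=pid) for pid in collection_pids),
                  solr.Q())

# usage: q = all_audio.filter(any_collection(solr, top_collections))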
Example No. 35
    def ingest_message(self, msg_data, mailbox, folder_order):

        # read content and redact IP addresses / email addresses
        msg_data = redact_email(msg_data)

        # generate email object from data
        email_msg = email.message_from_string(msg_data,
                                              _class=MacEncodedMessage)

        # check and warn if email has attachments
        attachments = self.email_attachments(email_msg)
        if attachments:
            print 'Warning! Email has attachments (not yet handled): %s' % \
                  ','.join(attachments)

        # get current content type to preserve the original value,
        # and also to determine how to decode
        content_type = email_msg.get('Content-Type', '')
        orig_content_type = email_msg.get_content_type()
        orig_content_charset = email_msg.get_content_charset()

        # at least one email in this set has a charset of 'unknown-8bit',
        # but the \xa0 in the content indicates it is probably latin 1
        if 'charset=unknown-8bit' in content_type:
            latin1_charset = email.charset.Charset('latin_1')
            email_msg.set_charset(latin1_charset)

        # otherwise, if charset is not set, assume mac roman
        elif not email_msg.get_charset():
            # tell email that charset should be mac roman,
            # so it can decode special characters
            mac_charset = email.charset.Charset('mac_roman')
            email_msg.set_charset(mac_charset)
            # decode headers from mac roman charset
            # (some messages contain improperly formatted
            # accented characters in a from/to header)
            email_msg.decode_headers()

        # create a new object to populate with data
        msg_obj = self.repo.get_object(type=EmailMessagePidReuse)

        # generate cerp from mime message
        # - store folder order as message local id
        msg_obj.cerp.content = cerp.Message.from_email_message(email_msg,
                                                               local_id=folder_order)

        # The generated CERP may carry the mac roman charset headers that
        # were needed for decoding rather than the original values; update
        # the xml to store the original value, NOT the encoding that was
        # used to decode the content.
        if content_type:
            if msg_obj.cerp.content.single_body:
                msg_obj.cerp.content.single_body.content_type_list[0] = orig_content_type
                msg_obj.cerp.content.single_body.charset_list[0] = orig_content_charset

        else:
            if msg_obj.cerp.content.single_body:
                del msg_obj.cerp.content.single_body.content_type_list[0]
                del msg_obj.cerp.content.single_body.charset_list[0]
        # loop through headers to set/remove content type
        for h in msg_obj.cerp.content.headers:
            if h.name == 'Content-Type':
                if content_type:
                    h.value = content_type
                else:
                    h.value = None
                    h.name = None
                break

        # construct an object label based on from/to/date/subject
        msg_from = email_msg['From']
        # NOTE: it would be nice to suppress redundant redaction email text here;
        # at least simplify label for rushdie, since that is what we'll see most
        if 'REDACTED: Salman Rushdie\'s email' in msg_from:
            msg_from = 'Salman Rushdie'
        label = u'Email from %s' % msg_from
        if email_msg.get('To', None):
            # FIXME: could have multiple recipients;
            # we *should* be able to get a split-out version from email.Message
            to = email_msg['To']
            label += u' to %s' % to
        # date/subject not always present, but add if they are
        if email_msg.get('Date', None):
            label += u' on %s' % email_msg['Date']
        if email_msg.get('Subject', None):
            label += u' %s' % email_msg['Subject']

        # set as object label and dc:title
        msg_obj.label = label
        msg_obj.dc.content.title = label

        # in verbose noact mode, print label so user can see what is being done
        if self.verbosity > self.v_normal and self.noact:
            print label

        # generate a pristine email Message for saving fedora
        # (don't save modified charset, content type, etc.)
        msg_obj.mime_data.content = email.message_from_string(msg_data,
                                              _class=MacEncodedMessage)
        # calculate an MD5 of the email content *as it will be serialized*
        md5 = hashlib.md5()
        md5.update(str(msg_obj.mime_data.content))
        email_md5 = md5.hexdigest()
        msg_obj.mime_data.checksum = email_md5


        # check if this email has already been ingested via checksum;
        # don't re-ingest if it is already in the repository
        solr = solr_interface()
        q = solr.query(content_md5=msg_obj.mime_data.checksum).field_limit('pid')
        if len(q):
            if self.verbosity >= self.v_normal:
                print 'Email message has already been ingested as %s; skipping' \
                      % q[0]['pid']
            self.stats['previously_ingested'] += 1
            return


        # associate with current mailbox object
        msg_obj.mailbox = mailbox
        # belongs to same collection as its mailbox
        if mailbox.collection:
            msg_obj.collection = mailbox.collection
        # ingest items as accessioned/unprocessed
        msg_obj.arrangement_status = 'accessioned'
        # ingest with a default rights code of 10 "Undetermined" in rights DS
        msg_obj.rights.content.create_access_status()
        msg_obj.rights.content.access_status.code = "10"
        msg_obj.rights.content.access_status.text = rights_access_terms_dict["10"].text

        if not self.noact:
            try:
                msg_obj.save('ingesting email message from rushdie 5300c')
                if self.verbosity >= self.v_normal:
                    print 'Ingested message %s : %s' % \
                          (msg_obj.pid, msg_obj.label)
                    self.stats['ingested'] += 1
            except RequestFailed as rf:
                self.stats['ingest_error'] += 1
                print 'Error ingesting email message %s: %s' % \
                      (msg_obj.label, rf)
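
The duplicate check above hinges on checksumming the message exactly as it will be serialized; a minimal standalone sketch of that step:

import hashlib

def email_checksum(serialized_msg):
    # MD5 of the message as it will be stored in the datastream
    md5 = hashlib.md5()
    md5.update(serialized_msg)
    return md5.hexdigest()

print email_checksum('From: someone@example.com\n\nhello')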
Example No. 36
def dashboard(request):
    '''Admin dashboard page for staff users, with links to main
    functionality and date/month facets linking to searches for
    recently added or checksummed items.
    '''
    today = date.today()
    month_ago = today - timedelta(days=30)
    three_months = today - timedelta(days=31 * 3)

    solr = solr_interface()

    # search for all content added in the last month
    # and return just the facets for date created and collection name
    # - limit of 31 to ensure we get all dates in range
    facetq = solr.query().filter(created_date__range=(month_ago, today))  \
                .facet_by('created_date', sort='index',
                          limit=31, mincount=1) \
                .facet_by('collection_label_facet', sort='count',
                          limit=10, mincount=1) \
                .paginate(rows=0)
    # filter the facet query by user permissions
    # facetq = filter_by_perms(facetq, request.user)
    facets = facetq.execute().facet_counts.facet_fields

    # reverse order and convert to datetime.date for use with naturalday
    recent_items = []
    recent_dates = facets['created_date']
    recent_dates.reverse()
    # limit to just the 10 most recent dates
    for day, count in recent_dates[:10]:
        y, m, d = day.split('-')
        recent_items.append((date(int(y), int(m), int(d)), count))

    recent_collections = facets['collection_label_facet']

    # search for content added in the last few months
    # and return just the facets for year-month
    facetq = solr.query().filter(created_date__range=(three_months, today))  \
                .facet_by('created_month', sort='index',
                          mincount=1) \
                .paginate(rows=0)
    # also filter this query by user perms
    # facetq = filter_by_perms(facetq, request.user)
    recent_month_facet = facetq.execute().facet_counts.facet_fields['created_month']
    recent_month_facet.reverse()
    recent_months = []
    for month, count in recent_month_facet:
        y, m = month.split('-')
        recent_months.append((date(int(y), int(m), 1), count))

    # search for fixity checks in the last 30 days
    facetq = solr.query().filter(last_fixity_check__range=(month_ago, today))  \
                .facet_by('last_fixity_result', mincount=1) \
                .paginate(rows=0)
    # facetq = filter_by_perms(facetq, request.user)
    facets = facetq.execute().facet_counts.facet_fields
    recent_fixity_checks = facets['last_fixity_result']

    return TemplateResponse(request, 'repoadmin/site_dashboard.html',
        {'recent_items': recent_items, 'recent_months': recent_months,
        'recent_collections': recent_collections,
        'recent_fixity_checks': recent_fixity_checks,
        'month_ago': month_ago, 'manual_url': settings.KEEP_MANUAL_URL,
        'find_collection': FindCollection()})
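
All three queries above are facet-only: paginate(rows=0) asks solr for zero documents while the facet counts still come back. A minimal sketch of the pattern, reusing solr_interface() as above:

solr = solr_interface()
facetq = (solr.query()
              .facet_by('collection_label_facet', sort='count',
                        limit=10, mincount=1)
              .paginate(rows=0))
facets = facetq.execute().facet_counts.facet_fields
for label, count in facets['collection_label_facet']:
    print '%s: %d' % (label, count)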
Example No. 37
def search(request):
    '''Search for :class:`~keep.audio.models.AudioObject` or
    :class:`~keep.arrangement.models.ArrangementObject` by pid, title,
    description, collection, date, rights, etc.'''

    # if NO search terms are specified, return an advanced search page
    if not request.GET:
        return TemplateResponse(request, 'common/advanced-search.html',
                      {'searchform': commonforms.ItemSearch(prefix='audio')})

    form = commonforms.ItemSearch(request.GET, prefix='audio')

    ctx_dict = {'searchform': form}
    if form.is_valid():
        solr = solr_interface()
        # solr search options from posted data
        search_opts = form.search_options()
        # search term/value display info for user based on posted data
        ctx_dict['search_info'] = form.search_info()

        # solr query to restrict this search to appropriate content models
        cm_query = solr.Q(solr.Q(content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL) \
                          | solr.Q(content_model=AudioObject.AUDIO_CONTENT_MODEL)\
                          | solr.Q(content_model=Video.VIDEO_CONTENT_MODEL))
        # for now, sort by most recently created
        solrquery = solr.query(**search_opts).filter(cm_query).sort_by('-created')


        # if user requested specific display fields, handle output display and formatting
        if form.cleaned_data['display_fields']:
            fields = form.cleaned_data['display_fields']
            # pid and content model are always needed to construct html search results
            solr_fields = fields + ['pid', 'content_model']
            solrquery = solrquery.field_limit(solr_fields)

            class FieldList(list):
                # extended list object with pid and content model attributes
                def __init__(self, pid=None, content_model=None, values=()):
                    super(FieldList, self).__init__(values)
                    if pid:
                        self.pid = pid
                    if content_model:
                        self.content_model = content_model
                    else:
                        self.content_model = []

            def field_list(**kwargs):
                # method to construct a custom solr result based on the requested field list
                l = FieldList(pid=kwargs.get('pid', None),
                              content_model=kwargs.get('content_model', None))
                for f in fields:
                    val = kwargs.get(f, '')
                    if solr.schema.fields[f].multi_valued:
                        val = '; '.join(val)
                    l.append(val)
                return l

            solrquery = solrquery.results_as(field_list)

            ctx_dict.update({
                'display_fields': fields,
                'display_labels': [commonforms.ItemSearch.display_field_opts[f] for f in fields]
                })

            # if CSV is requested with display_fields, return as csv before paginating

            if form.cleaned_data['output'] == 'csv':
                response = HttpResponse(content_type='text/csv')
                response['Content-Disposition'] = 'attachment; filename=Keep-report_%s.csv' \
                                           % date.today()
                writer = unicodecsv.writer(response)
                # write out list of field labels
                writer.writerow(ctx_dict['display_labels'])
                # then append all matching values
                # FIXME: csv output for very large results is VERY slow
                # TODO: append rows in chunks of 50-100, to handle
                # large result sets better - maybe use paginator?
                writer.writerows(solrquery)
                return response


        paginator = Paginator(solrquery, 30)
        try:
            page = int(request.GET.get('page', '1'))
        except ValueError:
            page = 1
        try:
            results = paginator.page(page)
        except (EmptyPage, InvalidPage):
            results = paginator.page(paginator.num_pages)

        # calculate page links to show
        show_pages = pages_to_show(paginator, page)

        ctx_dict.update({
            'results': results.object_list,
            'page': results,
            'show_pages': show_pages,
            # pass search term query opts to view for pagination links
            'search_opts': request.GET.urlencode(),
        })

    return TemplateResponse(request, 'common/search.html', ctx_dict)
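
The FieldList/results_as machinery above works because sunburnt hands each solr document to the supplied factory as keyword arguments (the view's own field_list reads them from kwargs). A self-contained sketch of the idea, with invented field names and values:

fields = ['title', 'created']

def field_list(**doc):
    # build one positional row per solr doc, defaulting missing fields
    return [doc.get(f, '') for f in fields]

# what sunburnt would do for a single document:
print field_list(pid='emory:1234', title='Some title', created='2014-03-09')
# ['Some title', '2014-03-09']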
Example No. 38
    def handle(self, **options):
        verbosity = int(options['verbosity'])

        errors = 0

        solr = solr_interface()

        fields = ['ark_uri', 'object_type', 'pid', 'duration', 'content_model',
                  'has_original', 'title', 'content_size', 'researcher_access',
                  'label', 'content_format', 'state', 'collection_source_id',
                  'type', 'original_pid', 'access_copy_mimetype', 'access_code',
                  'collection_id', 'collection_label', 'isMemberOfCollection',
                  'rights', 'created_year', 'has_access_copy',
                  'access_copy_size']

        with open('collection_report.csv', 'wb') as csvfile:
            writer = csv.writer(csvfile, delimiter=',')
            writer.writerow(fields)

            solrquery = solr.query().sort_by('-created')
            for doc in solrquery:
                # every field is optional in the solr doc; default to ''
                items = [doc.get(field, '') for field in fields]
                # the csv module in python 2 needs encoded bytestrings
                row = []
                for item in items:
                    if isinstance(item, basestring):
                        row.append(item.encode('utf-8'))
                    else:
                        row.append(item)
                writer.writerow(row)
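
A possible simplification, not the command's actual code: unicodecsv (already used by the search view above for its CSV report) encodes unicode cells itself, which would make the manual isinstance/encode loop unnecessary:

import unicodecsv

with open('collection_report.csv', 'wb') as csvfile:
    writer = unicodecsv.writer(csvfile, encoding='utf-8')
    # placeholder values; real rows come from the solr docs as above
    writer.writerow([u'ark_uri', u'pid'])
    writer.writerow([u'http://pid.example.com/ark:/123/abc', u'test:1'])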
Example No. 39
    def test_search_bydate(self, mockpaginator, mocksolr_interface,
                           mocksearch_libs):
        solr = solr_interface()
        search_url = reverse('search:keyword')
        mocksolr = mocksolr_interface.return_value
        mocksolr.Q = MagicMock(solr.Q)

        mocksolr.query.return_value = mocksolr.query
        for method in [
                'query', 'facet_by', 'sort_by', 'field_limit', 'exclude',
                'filter'
        ]:
            getattr(mocksolr.query, method).return_value = mocksolr.query

        # create researcher IP for localhost so anonymous access will be
        # treated as anonymous researcher
        researchip = ResearcherIP(name='test client', ip_address='127.0.0.1')
        researchip.save()

        # start date only
        sdate = '1980'
        response = self.client.get(search_url, {'start_date': sdate})
        # check solr query args
        # - date should query dates created and issued explicitly
        mocksolr.Q.assert_any_call(date_created__gte=sdate)
        mocksolr.Q.assert_any_call(date_issued__gte=sdate)

        self.assertContains(
            response,
            '<input class="form-control" id="id_start_date" name="start_date" placeholder="Start year" type="tel" value="%s">'
            % sdate,
            html=True,
            msg_prefix=
            'start date search value should be displayed on result page via form'
        )

        # end date only
        edate = '2001'
        response = self.client.get(search_url, {'end_date': edate})
        # check solr query args
        # - date should query dates created and issued explicitly
        search_edate = '%s-12-31' % edate
        mocksolr.Q.assert_any_call(date_created__lte=search_edate)
        mocksolr.Q.assert_any_call(date_issued__lte=search_edate)

        self.assertContains(
            response,
            '<input class="form-control" id="id_end_date" name="end_date" placeholder="End year" type="tel" value="%s">'
            % edate,
            html=True,
            msg_prefix=
            'end date search value should be displayed on result page via form'
        )

        # start and end date together
        response = self.client.get(search_url, {
            'start_date': sdate,
            'end_date': edate
        })
        # check solr query args
        # - date should query dates created and issued explicitly
        mocksolr.Q.assert_any_call(date_created__range=(sdate, search_edate))
        mocksolr.Q.assert_any_call(date_issued__range=(sdate, search_edate))

        researchip.delete()
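
The mock setup at the top of these tests makes every chained sunburnt method return the same mock, so an arbitrary query chain collapses to one object whose calls can be asserted. A minimal sketch of just that trick:

from mock import MagicMock

mocksolr = MagicMock()
mocksolr.query.return_value = mocksolr.query
for method in ['filter', 'sort_by', 'field_limit']:
    getattr(mocksolr.query, method).return_value = mocksolr.query

# any chain now resolves to the same mock ...
q = mocksolr.query().filter(access_code='2').sort_by('-created')
assert q is mocksolr.query
# ... so individual calls in the chain can be asserted
mocksolr.query.filter.assert_called_with(access_code='2')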
Example No. 42
    def solr_items_query(self):
        'Solr query for all items in this collection'
        solr = solr_interface()
        # search for all items that belong to this collection
        return solr.query(collection_id=self.pid)
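
An illustrative use of this helper, assuming a collection object as above and the field_limit method sunburnt provides:

def collection_item_pids(collection):
    # list the pids of every item in the collection
    q = collection.solr_items_query().field_limit('pid')
    return [doc['pid'] for doc in q]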
Example No. 43
def keyword_search_suggest(request):
    '''Suggest helper for keyword search.  If the search string ends
    with a recognized field name with an optional value,
    e.g. ``user:`` or ``user:A``, looks up existing values using Solr
    facets.  Returns a JSON response with the 15 most common matching
    terms in the requested field with the search term prefix, if any.
    If the search string is empty or ends with a space, suggests
    available search fields with an explanation.

    .. Note::

        Due to the current implementation and the limitations of facet
        querying in Solr, the search term is case-sensitive and only
        matches at the beginning of the string.

    Return format is suitable for use with `JQuery UI Autocomplete`_
    widget.

    .. _JQuery UI Autocomplete: http://jqueryui.com/demos/autocomplete/

    :param request: the http request passed to the original view
        method (used to retrieve the search term)
    '''
    term = request.GET.get('term', '')

    suggestions = []

    # if term is empty or ends in a space, suggest available search fields
    if term == '' or term[-1] == ' ':
        suggestions = [
            {'label': field,
             'value': '%s%s' % (term, field),
             'category': 'Search Fields',
             'desc': desc}
            for field, desc in KeywordSearch.field_descriptions.iteritems()
        ]

    # otherwise, check if there is a field to look up values for
    else:

        term_prefix, sep, term_suffix = term.rpartition(' ')
        value_prefix = term_prefix + sep
        # parse the last search term
        try:
            # parse could error in some cases
            parsed_terms = parse_search_terms(term_suffix)
            field, prefix = parsed_terms[-1]
        except Exception:
            field, prefix = None, ''

        if prefix is None:
            prefix = ''

        # if field can be faceted, suggest terms
        if field in KeywordSearch.facet_fields.keys():
            facet_field = KeywordSearch.facet_fields[field]

            # date created is a special case
            if field == 'created':
                sort = 'index'
                category = 'Date Added'

                # if less than 4 characters, suggest year
                if len(prefix) < 4:
                    facet_field = 'created_year'
                    result_fmt = '%s'
                # between 4 and 7, suggest year-month
                elif len(prefix) < 7:
                    facet_field = 'created_month'
                    result_fmt = '%s'
                # suggest full dates
                else:
                    result_fmt = '%s '

            elif field in ['added_by', 'user']:  # added_by or user
                sort = 'count'
                category = 'Users'
                result_fmt = '"%s" '


            # collection label
            if field == 'coll':
                sort = 'count'
                category = 'Collection'
                result_fmt = '%s '

                # if the term is numeric facet by source_id
                if prefix and prefix.isdigit():
                    facet_field = 'collection_source_id'

            solr = solr_interface()
            facetq = solr.query().paginate(rows=0)
            # filter by current user permissions
            # facetq = filter_by_perms(facetq, request.user)
            # return the 15 most common terms in the requested facet field
            # with a specified prefix
            facetq = facetq.facet_by(facet_field, prefix=prefix,
                                     sort=sort, limit=15)
            facets = facetq.execute().facet_counts.facet_fields

            # generate a dictionary to return via json with label (facet value
            # + count), and actual value to use
            suggestions = [{'label': '%s (%d)' % (facet, count),
                            'value': '%s%s:' % (value_prefix, field) + \
                                            result_fmt % facet,
                            'category': category}
                           for facet, count in facets[facet_field]
                           ]

    return HttpResponse(json_serializer.encode(suggestions),
                         content_type='application/json')
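
For reference, an illustrative suggestion payload for the term 'user:Th' (name and count invented); this is the shape the jQuery UI autocomplete widget consumes:

# illustrative response for term 'user:Th' (name and count invented)
suggestions = [
    {'label': 'Thomas Example (42)',       # facet value plus count
     'value': 'user:"Thomas Example" ',    # value_prefix + field + quoted value
     'category': 'Users'},
]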
Example No. 44
def search(request):
    form = SearchForm(request.GET, user=request.user)
    # form.filter_libraries_by_user(request.user)
    ctx = {'form': form}
    if form.is_valid():
        search_terms = form.cleaned_data['keyword']
        search_opts = form.cleaned_data
        # the keyword field parses into a list of (field, term) tuples;
        # this search doesn't yet support field:value queries, so treat
        # every term as a keyword
        search_terms = [v for k, v in search_terms]

        solr = solr_interface()

        # NOTE: content models currently supported for researcher access
        cm_query = solr.Q(solr.Q(content_model=AudioObject.AUDIO_CONTENT_MODEL) |
                          solr.Q(content_model=Video.VIDEO_CONTENT_MODEL))

        # start with a default query to add filters & search terms
        q = solr.query().filter(cm_query)

        # filter the query by logged-in user permissions
        # includes restricting to researcher-accessible content when appropriate
        q = filter_by_perms(q, request.user)

        if search_terms:
            q = q.query(*search_terms)
            # NOTE: sunburnt now seems to require an explicit list of the
            # fields needed when returning score
            q = q.sort_by('-score').field_limit(['pid', 'title', 'collection_id',
                'collection_source_id', 'collection_label', 'ark_uri',
                'date_issued', 'date_created', 'part', 'duration',
                'researcher_access', 'object_type'],
                score=True)
            # NOTE: do we want a secondary sort after score?
        else:
            q = q.sort_by('title_exact')

        # if a collection search term is specified, filter
        if 'collection' in search_opts and search_opts['collection']:
            collection = search_opts['collection']
            # search on *either* collection name or collection number
            q = q.query(solr.Q(collection_label=collection) | solr.Q(collection_source_id=collection))

        # if a library is specified, filter by archive id on related collection
        if 'library' in search_opts and search_opts['library']:
            library = search_opts['library']
            # NOTE: requires a join query; items belong to collections, which belong
            # to libraries; join on pid->collection id in order to filter on
            # archive id property on the associated collection object
            q = q.join('pid', 'collection_id', archive_id=library)

        # if format search term is specified, filter
        if 'format' in search_opts and search_opts['format']:
            format = search_opts['format']
            # search on format by content model
            q = q.query(solr.Q(content_model=format))

        # date search
        if search_opts.get('start_date', None) or search_opts.get('end_date', None):
            sdate = search_opts.get('start_date', None)
            edate = search_opts.get('end_date', None)
            # NOTE: needs to handle date format variation (YYYY, YYYY-MM, etc)

            if sdate is not None:
                # ensure we search on 4-digit year
                sdate = '%04d' % int(sdate)

            # convert end date to end of year in order to catch any date variants
            # within that year; e.g. 2001-12-31 will always come after 2001-04, etc
            if edate is not None:
                edate = "%04d-12-31" % int(edate)

            # single date search: start and end date should be the same;
            # using same logic as range to match any dates within that year
            # if only one of start or end is specified, results in an open range
            # i.e. anything after start date or anything before end date

            # if both values are set, use sunburnt range query
            if sdate is not None and edate is not None:
                created_q = solr.Q(date_created__range=(sdate, edate))
                issued_q = solr.Q(date_issued__range=(sdate, edate))
                # q = q.query(date__range=(sdate, edate))
            elif sdate is not None:
                # restrict by start date
                # YYYY will be before any date in that year, e.g. "2001" >= "2001-11"
                # q = q.query(date__gte='%04d' % sdate)
                created_q = solr.Q(date_created__gte=sdate)
                issued_q = solr.Q(date_issued__gte=sdate)
            elif edate is not None:
                # restrict by end date
                # q = q.query(date__lte=str(edate))
                created_q = solr.Q(date_created__lte=edate)
                issued_q = solr.Q(date_issued__lte=edate)

            # NOTE: explicitly search on date created or date issued,
            # to avoid complications with other values in the generic date field
            q = q.query(created_q | issued_q)

        # paginate the solr result set
        paginator = Paginator(q, 30)
        try:
            page = int(request.GET.get('page', '1'))
        except ValueError:
            page = 1
        try:
            results = paginator.page(page)
        except (EmptyPage, InvalidPage):
            results = paginator.page(paginator.num_pages)

        # url parameters for pagination links
        url_params = request.GET.copy()
        if 'page' in url_params:
            del url_params['page']

        ctx.update({
            'results': results,
            'search_opts': request.GET.urlencode(),
            'search_terms': search_terms,
            'url_params': urlencode(url_params)
        })

    return TemplateResponse(request, 'search/results.html', ctx)
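
The date handling above is easiest to see in isolation; a minimal sketch of the normalization (start dates collapse to a 4-digit year, which sorts before any date in that year; end dates extend to December 31 so partial dates within the year still match):

def normalize_year_range(sdate, edate):
    if sdate is not None:
        # ensure we search on a 4-digit year
        sdate = '%04d' % int(sdate)
    if edate is not None:
        # extend to end of year to catch any date variants within it
        edate = '%04d-12-31' % int(edate)
    return sdate, edate

print normalize_year_range('1980', '2001')  # ('1980', '2001-12-31')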
Example No. 45
def list_archives(request, archive=None):
    '''List all top-level archive collections, with the total count of
    :class:`~keep.collection.models.CollectionObject` in each archive.

    .. Note::

       Archives must be configured in **PID_ALIASES** in Django settings
       in order to be listed here.

    .. Note::

       Within the code, top-level collections are referred to as "archives",
       but externally for users they should always be labeled as "Libraries."

    '''

    # if params are set, search for collection
    if 'archive' in request.GET and 'collection' in request.GET:
        form = FindCollection(request.GET, user=request.user)
        if form.is_valid():
            data = form.cleaned_data
            q = CollectionObject.item_collection_query()
            # submitted value is pid alias; lookup pid for solr query
            archive_id = settings.PID_ALIASES[data['archive']]
            q = q.query(archive_id=archive_id,
                        source_id=data['collection'])
            # if exactly one result is found, redirect to the collection view
            if q.count() == 1:
                # give user some context for the redirect
                messages.info(request, 'One collection found for %s %s.' %
                              (data['archive'].upper(), data['collection']))
                return HttpResponseSeeOtherRedirect(reverse('collection:view',
                    kwargs={'pid': q[0]['pid']}))

            # otherwise, if multiple, redirect to a filtered view of the archive browse
            elif q.count():
                messages.info(request, '%d collections found for %s %s.' %
                    (q.count(), data['archive'].upper(), data['collection']))
                return HttpResponseSeeOtherRedirect('%s?%s' % \
                    (reverse('collection:browse-archive',
                             kwargs={'archive': data['archive']}),
                    urlencode({'collection': data['collection']})))

            # if no matches, warn and return to archive display
            else:
                messages.warning(request, 'No collections found for %s %s.' %
                              (data['archive'].upper(), data['collection']))

        # values submitted but form not valid
        else:
            # TODO: better error message?
            messages.warning(request, 'Collection search input was not valid; please try again.')


    q = CollectionObject.item_collection_query()
    q = q.facet_by('archive_id', sort='count', mincount=1) \
         .paginate(rows=0)

    # - depending on permissions, restrict to collections with researcher audio
    if not request.user.has_perm('collection.view_collection') and \
           request.user.has_perm('collection.view_researcher_collection'):
        q = q.join('collection_id', 'pid', researcher_access=True)
        q = q.join('collection_id', 'pid', has_access_copy=True)

    facets = q.execute().facet_counts.facet_fields

    solr = solr_interface()
    archive_info = dict([(pid.replace('info:fedora/', ''), {'count': count})
                        for pid, count in facets['archive_id']])

    # construct a boolean pid query to match any archive pids
    # in order to lookup titles and match them to pids
    pid_q = solr.Q()
    for pid in archive_info.keys():
        pid_q |= solr.Q(pid=pid)
    query = solr.query(pid_q) \
                .field_limit(['pid', 'title']) \
                .sort_by('title')

    # pid aliases are keyed on the alias, but we need to look up by pid
    pid_aliases_by_pid = dict([(v, k) for k, v in settings.PID_ALIASES.iteritems()])

    # add solr information and pid aliases to info dictionary
    for doc in query:
        pid = doc['pid']
        if pid not in archive_info:
            continue
        # duplicate to make list of dict available to template for dictsort
        archive_info[pid]['pid'] = doc['pid']
        archive_info[pid]['title'] = doc['title']
        alias = pid_aliases_by_pid.get(pid, None)
        archive_info[pid]['alias'] = alias
        if alias is None:
            logger.warning('No pid alias found for archive %(pid)s (%(title)s)' \
                           % doc)

    # prune any referenced archives that aren't actually indexed in solr
    # (should only happen in dev/qa)
    for pid in archive_info.keys():
        if 'title' not in archive_info[pid] or archive_info[pid]['alias'] is None:
            del archive_info[pid]

    # NOTE: sending list of values (dictionaries) to allow sorting in template

    return TemplateResponse(request, 'collection/archives.html',
        {'archives': archive_info.values(), 'find_collection': FindCollection(user=request.user)})
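
The alias lookup above just inverts the settings mapping once; a minimal sketch with an invented alias and pid:

PID_ALIASES = {'marbl': 'emory:93z53'}  # invented settings value
pid_aliases_by_pid = dict((v, k) for k, v in PID_ALIASES.iteritems())
print pid_aliases_by_pid.get('emory:93z53')  # 'marbl'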