def collection_suggest(request):
    '''Suggest view for collections, for use with the `JQuery UI
    Autocomplete`_ widget.  Searches for collections on all of the
    terms passed in (as multiple keywords), similar to the way the
    combined search works.

    .. _JQuery UI Autocomplete: http://jqueryui.com/demos/autocomplete/

    :param request: the http request passed to the original view
        method (used to retrieve the search term from the ``term``
        GET parameter)
    :returns: an ``HttpResponse`` with content type
        ``application/json`` whose body is the list of suggestions as
        encoded by ``json_serializer``.  At most 15 suggestions are
        returned, each a dictionary with ``label``, ``value``,
        ``category`` and ``desc`` keys; the list is empty when no
        search term was supplied.
    '''
    term = request.GET.get('term', '')
    suggestions = []

    if term:
        # If the search term doesn't end in space, add a wildcard to
        # the last word to allow for partial word matching.
        if term[-1] != ' ':
            term += '*'
        terms = search_terms(term)

        solr = solr_interface()
        # common query parameters and options
        base_query = solr.query() \
            .filter(content_model=CollectionObject.COLLECTION_CONTENT_MODEL) \
            .field_limit(['pid', 'source_id', 'title', 'archive_short_name',
                          'creator', 'archive_id']) \
            .sort_by('-score')
        q = base_query.query(terms)

        # NOTE: there seems to be a Lucene/Solr bug/quirk where adding
        # a wildcard at the end of a word causes Solr not to match the
        # exact word (even though docs indicate this should work).
        # As a work-around, if we added a * and got 0 results,
        # try the search again without the wildcard.
        if term[-1] == '*' and q.count() == 0:
            q = base_query.query(search_terms(term[:-1]))

        # Exclude archival collection (Top-level library)
        q = q.filter(archive_id__any=True)

        suggestions = [
            {
                # Placeholder for untitled collections; the closing
                # parenthesis was previously missing from this label.
                'label': '%s %s' % (c.get('source_id', ''),
                                    c.get('title', '(no title)')),
                'value': c['pid'],  # FIXME: do we need URI here?
                'category': c.get('archive_short_name', ''),
                'desc': c.get('creator', ''),
            }
            for c in q[:15]
        ]

    return HttpResponse(json_serializer.encode(suggestions),
                        content_type='application/json')
def clean_keywords(self):
    '''Validate and normalize the ``keywords`` form field.

    The raw input is split into terms with ``search_terms``.  If any
    term begins with a ``*`` or ``?`` wildcard, a
    ``forms.ValidationError`` (code ``invalid``) is raised.  Otherwise
    the terms are re-assembled into a single space-separated string,
    with any term that contains a space wrapped in double quotes.

    :returns: the normalized keyword string
    :raises forms.ValidationError: if a term starts with a wildcard
    '''
    # splitting doesn't care about mis-matched quotes, it just strips
    # them out
    parsed = search_terms(self.cleaned_data['keywords'])

    if any(word.startswith(('*', '?')) for word in parsed):
        raise forms.ValidationError(
            mark_safe('Search terms may not begin with wildcards <b>*</b> or <b>?</b>'),
            code='invalid')

    # NOTE: re-joining cleans up mismatched quotes: only multi-word
    # terms (phrases) are quoted again, everything else is left bare
    rebuilt = ['"%s"' % word if ' ' in word else word for word in parsed]
    return ' '.join(rebuilt)
def search_terms(self):
    '''Get a list of keywords and phrases from the keyword input
    field, using :meth:`eulcommon.searchutil.search_terms`.  Assumes
    that the form has already been validated and cleaned_data is
    available.

    :returns: list of search term strings, with any ``http:``,
        ``https:`` or ``ark:`` prefixes left intact
    '''
    # get list of keywords and phrases
    keywords = self.cleaned_data['keyword']

    # NOTE: currently using searchutil.search_terms to separate out
    # single words and exact phrases.  Because it also looks for
    # fielded search terms (like title:something or title:"another thing")
    # it can't handle searching on an ARK URI.  As a workaround,
    # encode known colons that should be preserved before running
    # search terms, and then restore them after.  ``https`` is covered
    # along with ``http`` so that secure URIs are protected as well.
    protected = re.sub(r'(https?|ark):', r'\1;;', keywords)

    # the bare name below is the module-level search_terms function,
    # not this method
    return [re.sub(r'(https?|ark);;', r'\1:', term)
            for term in search_terms(protected)]
def test_phrases(self):
    '''Check how search_terms handles quoted phrases, apostrophes
    and mismatched quotes.'''
    # (expected terms, search string) pairs, checked in order
    expectations = [
        # quoted phrases
        (['exact phrase'], '"exact phrase"'),
        (["'single", "quotes'"], "'single quotes'"),
        (['exact phrase', 'with', 'keyword'], '"exact phrase" with keyword'),
        # phrase with internal apostrophe
        (["I don't", "know"], '"I don\'t" know'),
        # non-matching quotes ignored
        (["non", "phrase'"], '"non phrase\''),
        (["'hello'"], '"\'hello\'"'),
        (["'Tis a beautiful day"], '"\'Tis a beautiful day"'),
    ]
    for expected, query in expectations:
        self.assertEqual(expected, search_terms(query))
def test_wildcards(self):
    '''Check that search_terms leaves ``*`` and ``?`` wildcards in
    place at the start, middle and end of a word.'''
    # (expected terms, search string) pairs, checked in order
    expectations = [
        # beginning of a word
        (['*nd', 'to', 'mouth'], '*nd to mouth'),
        (['?nd', 'or'], ' ?nd or'),
        # middle of a word
        (['w*ther', 'or', 'not'], 'w*ther or not'),
        (['wh?ther', 'thou', 'goest'], 'wh?ther thou goest'),
        # end of a word
        (['th*'], 'th*'),
        (['th?'], 'th?'),
    ]
    for expected, query in expectations:
        self.assertEqual(expected, search_terms(query))
def test_words(self):
    '''Check search_terms on plain words: whitespace splitting,
    apostrophes, numbers, and colons that must be kept as part of
    the term.'''
    # (expected terms, search string) pairs, checked in order
    expectations = [
        # search strings with single words
        (['word'], 'word'),
        (['multiple', 'words'], 'multiple words'),
        (["don't"], "don't"),
        (['one', '2.5'], 'one 2.5'),
        (['extraneous', 'whitespace'], ' extraneous whitespace '),
        # search_terms should ignore colons
        (['one', 'two:', 'three'], ' one two: three'),
        (['one', 'two:three', 'four'], ' one two:three four'),
        (['one', 'two:"three\tfour"', 'five'], ' one two:"three\tfour" five'),
    ]
    for expected, query in expectations:
        self.assertEqual(expected, search_terms(query))
def search_terms(self):
    '''Get a list of keywords and phrases from the keyword input
    field, using :meth:`eulcommon.searchutil.search_terms`.  Assumes
    that the form has already been validated and cleaned_data is
    available.

    :returns: list of search term strings, with any ``http:``,
        ``https:`` or ``ark:`` prefixes left intact
    '''
    # get list of keywords and phrases
    raw_keywords = self.cleaned_data['keyword']

    # NOTE: currently using searchutil.search_terms to separate out
    # single words and exact phrases.  Because it also looks for
    # fielded search terms (like title:something or title:"another thing")
    # it can't handle searching on an ARK URI.  As a workaround,
    # encode known colons that should be preserved before running
    # search terms, and then restore them after.  The ``https`` scheme
    # is encoded too, so secure URIs get the same protection as
    # ``http`` ones.
    encoded = re.sub(r'(https?|ark):', r'\1;;', raw_keywords)

    # the bare name below is the module-level search_terms function,
    # not this method
    return [
        re.sub(r'(https?|ark);;', r'\1:', term)
        for term in search_terms(encoded)
    ]