def _build_es_excerpt(result):
    """Return concatenated search excerpts.

    :arg result: The result object from the queryset results

    """
    excerpt = EXCERPT_JOINER.join(
        [m.strip() for m in chain(*result._highlighted.values()) if m])
    return jinja2.Markup(clean_excerpt(excerpt))
def test_utf8_excerpt(self):
    """Characters should stay in UTF-8."""
    q = u'fa\xe7on'
    ws = (wiki_searcher().highlight('html')
                         .query(u'fa\xe7on')
                         .values_dict('html'))
    results = list(ws)
    # page = Document.objects.get(pk=4)
    excerpt = clean_excerpt(ws.excerpt(results[0])[0][0])
    assert q in excerpt, u'%s not in %s' % (q, excerpt)
def test_utf8_excerpt(self):
    """Characters should stay in UTF-8."""
    q = u'fa\xe7on'
    ws = (wiki_search.highlight('html')
                     .query(u'fa\xe7on')
                     .values_dict('html'))
    results = list(ws)
    # page = Document.objects.get(pk=4)
    excerpt = clean_excerpt(ws.excerpt(results[0])[0])
    assert q in excerpt, u'%s not in %s' % (q, excerpt)
def _build_es_excerpt(result):
    """Return concatenated search excerpts.

    :arg result: The result object from the queryset results

    """
    excerpt = EXCERPT_JOINER.join(
        [m.strip() for m in chain(*result._highlight.values()) if m])
    return jinja2.Markup(clean_excerpt(excerpt))
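# A minimal sketch (not from the codebase) of the joining behavior above,
# using a stand-in result object. It assumes EXCERPT_JOINER is a short
# separator string such as u'...'; the real value may differ.
from itertools import chain

EXCERPT_JOINER = u'...'


class FakeResult(object):
    """Stand-in for an elasticutils result carrying highlight fragments."""
    _highlight = {'question_content': [u'  how do I <b>sync</b>  ', u''],
                  'answer_content': [u'try <b>syncing</b> again']}


# Empty fragments are dropped, the rest are stripped and joined:
fragments = [m.strip() for m in chain(*FakeResult._highlight.values()) if m]
assert u'try <b>syncing</b> again' in EXCERPT_JOINER.join(fragments)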
def _build_excerpt(searcher, model_obj):
    """Return concatenated search excerpts for Sphinx.

    :arg searcher: The ``S`` object that did the search
    :arg model_obj: The model object returned by the search

    """
    try:
        excerpt = EXCERPT_JOINER.join(
            [m.strip() for m in chain(*searcher.excerpt(model_obj)) if m])
    except ExcerptTimeoutError:
        statsd.incr("search.excerpt.timeout")
        excerpt = u""
    except ExcerptSocketError:
        statsd.incr("search.excerpt.socketerror")
        excerpt = u""
    return jinja2.Markup(clean_excerpt(excerpt))
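# Hypothetical illustration of the fallback above: any excerpting failure
# is counted in statsd and degrades to an empty excerpt instead of taking
# down the results page. The searcher below is a stand-in, not a real S.
class TimingOutSearcher(object):
    def excerpt(self, model_obj):
        raise ExcerptTimeoutError('sphinx excerpts timed out')

# _build_excerpt(TimingOutSearcher(), some_doc) would increment
# 'search.excerpt.timeout' and return jinja2.Markup(u'').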
def _search_suggestions(request, text, locale, tags, product_slugs):
    """Return an iterable of the most relevant wiki pages and questions.

    :arg text: full text to search on
    :arg locale: locale to limit to
    :arg tags: list of tags to filter questions on
    :arg product_slugs: list of product slugs to filter articles on
        (["desktop", "mobile", ...])

    Items are dicts of::

        {
            'type':
            'search_summary':
            'title':
            'url':
            'object':
        }

    :returns: up to 3 wiki pages, then up to 3 questions.

    """
    # TODO: this can be reworked to pull data from ES rather than
    # hit the db.
    question_s = Question.search()
    wiki_s = Document.search()

    # Max number of search results per type.
    WIKI_RESULTS = QUESTIONS_RESULTS = 3
    default_categories = settings.SEARCH_DEFAULT_CATEGORIES

    # Apply product filters
    if product_slugs:
        wiki_s = wiki_s.filter(document_product__in=product_slugs)
    if tags:
        question_s = question_s.filter(question_tag__in=tags)

    results = []
    try:
        query = dict(('%s__text' % field, text)
                     for field in Document.get_query_fields())
        raw_results = (
            wiki_s.filter(document_locale=locale,
                          document_category__in=default_categories)
                  .query(or_=query)
                  .values_dict('id')[:WIKI_RESULTS])
        for r in raw_results:
            try:
                doc = (Document.objects.select_related('current_revision')
                                       .get(pk=r['id']))
                results.append({
                    'search_summary': clean_excerpt(
                        doc.current_revision.summary),
                    'url': doc.get_absolute_url(),
                    'title': doc.title,
                    'type': 'document',
                    'object': doc,
                })
            except Document.DoesNotExist:
                pass

        # Note: Questions app is en-US only.
        query = dict(('%s__text' % field, text)
                     for field in Question.get_query_fields())
        raw_results = (question_s.query(or_=query)
                                 .values_dict('id')[:QUESTIONS_RESULTS])
        for r in raw_results:
            try:
                q = Question.objects.get(pk=r['id'])
                results.append({
                    'search_summary': clean_excerpt(q.content[0:500]),
                    'url': q.get_absolute_url(),
                    'title': q.title,
                    'type': 'question',
                    'object': q,
                    'is_solved': q.is_solved,
                    'num_answers': q.num_answers,
                    'num_votes': q.num_votes,
                    'num_votes_past_week': q.num_votes_past_week
                })
            except Question.DoesNotExist:
                pass

    except (ESTimeoutError, ESMaxRetryError, ESException) as exc:
        if isinstance(exc, ESTimeoutError):
            statsd.incr('questions.suggestions.timeouterror')
        elif isinstance(exc, ESMaxRetryError):
            statsd.incr('questions.suggestions.maxretryerror')
        elif isinstance(exc, ESException):
            statsd.incr('questions.suggestions.elasticsearchexception')
        log.debug(exc)

    return results
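# Sketch of the query construction used above: each searchable field is
# mapped to a '<field>__text' clause, and the clauses are OR'd together
# via query(or_=...). The field names here are illustrative stand-ins
# for Document.get_query_fields().
text = u'how do I sync bookmarks'
fields = ['document_title', 'document_content']
query = dict(('%s__text' % field, text) for field in fields)
assert query == {'document_title__text': text,
                 'document_content__text': text}
# wiki_s.query(or_=query) then matches documents that hit any clause.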
def test_clean_excerpt(self):
    """clean_excerpt() should not allow disallowed HTML through."""
    in_ = '<b>test</b> <div>the start of something</div>'
    out_ = '<b>test</b> &lt;div&gt;the start of something&lt;/div&gt;'
    eq_(out_, clean_excerpt(in_))
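# For reference, a plausible clean_excerpt consistent with the test
# above: escape everything except the highlight markup. This is a sketch
# built on bleach, not necessarily the real implementation.
import bleach


def clean_excerpt_sketch(excerpt):
    # Only <b> and <i> survive; other tags are escaped, not stripped.
    return bleach.clean(excerpt, tags=['b', 'i'])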
def search(request, template=None):
    """Performs search or displays the search form."""
    # JSON-specific variables
    is_json = (request.GET.get('format') == 'json')
    callback = request.GET.get('callback', '').strip()
    mimetype = 'application/x-javascript' if callback else 'application/json'

    # Search "Expires" header format
    expires_fmt = '%A, %d %B %Y %H:%M:%S GMT'

    # Check callback is valid
    if is_json and callback and not jsonp_is_valid(callback):
        return HttpResponse(
            json.dumps({'error': _('Invalid callback function.')}),
            mimetype=mimetype, status=400)

    language = locale_or_default(request.GET.get('language', request.locale))
    r = request.GET.copy()
    a = request.GET.get('a', '0')

    # Search default values
    try:
        category = (map(int, r.getlist('category')) or
                    settings.SEARCH_DEFAULT_CATEGORIES)
    except ValueError:
        category = settings.SEARCH_DEFAULT_CATEGORIES
    r.setlist('category', category)

    # Basic form
    if a == '0':
        r['w'] = r.get('w', constants.WHERE_BASIC)
    # Advanced form
    if a == '2':
        r['language'] = language
        r['a'] = '1'

    # TODO: Rewrite so SearchForm is unbound initially and we can use
    # `initial` on the form fields.
    if 'include_archived' not in r:
        r['include_archived'] = False

    search_form = SearchForm(r)

    if not search_form.is_valid() or a == '2':
        if is_json:
            return HttpResponse(
                json.dumps({'error': _('Invalid search data.')}),
                mimetype=mimetype, status=400)

        t = template if request.MOBILE else 'search/form.html'
        search_ = jingo.render(request, t,
                               {'advanced': a, 'request': request,
                                'search_form': search_form})
        search_['Cache-Control'] = ('max-age=%s' %
                                    (settings.SEARCH_CACHE_PERIOD * 60))
        search_['Expires'] = (
            (datetime.utcnow() +
             timedelta(minutes=settings.SEARCH_CACHE_PERIOD))
            .strftime(expires_fmt))
        return search_

    cleaned = search_form.cleaned_data
    page = max(smart_int(request.GET.get('page')), 1)
    offset = (page - 1) * settings.SEARCH_RESULTS_PER_PAGE

    # get language name for display in template
    lang = language.lower()
    if settings.LANGUAGES.get(lang):
        lang_name = settings.LANGUAGES[lang]
    else:
        lang_name = ''

    wiki_s = wiki_search
    question_s = question_search
    discussion_s = discussion_search

    documents = []

    # wiki filters
    # Category filter
    if cleaned['category']:
        wiki_s = wiki_s.filter(category__in=cleaned['category'])

    # Locale filter
    wiki_s = wiki_s.filter(locale=language)

    # Product filter
    products = cleaned['product']
    for p in products:
        wiki_s = wiki_s.filter(tag=p)

    # Tags filter
    tags = [t.strip() for t in cleaned['tags'].split()]
    for t in tags:
        wiki_s = wiki_s.filter(tag=t)

    # Archived bit
    if a == '0' and not cleaned['include_archived']:
        # Default to NO for basic search:
        cleaned['include_archived'] = False
    if not cleaned['include_archived']:
        wiki_s = wiki_s.filter(is_archived=False)
    # End of wiki filters

    # Support questions specific filters
    if cleaned['w'] & constants.WHERE_SUPPORT:
        # Solved is set by default if using basic search
        if a == '0' and not cleaned['has_helpful']:
            cleaned['has_helpful'] = constants.TERNARY_YES

        # These filters are ternary, they can be either YES, NO, or OFF
        ternary_filters = ('is_locked', 'is_solved', 'has_answers',
                           'has_helpful')
        d = dict((filter_name, _ternary_filter(cleaned[filter_name]))
                 for filter_name in ternary_filters
                 if cleaned[filter_name])
        if d:
            question_s = question_s.filter(**d)

        if cleaned['asked_by']:
            question_s = question_s.filter(
                question_creator=cleaned['asked_by'])

        if cleaned['answered_by']:
            question_s = question_s.filter(
                answer_creator=cleaned['answered_by'])

        q_tags = [t.strip() for t in cleaned['q_tags'].split()]
        for t in q_tags:
            question_s = question_s.filter(tag=t)

    # Discussion forum specific filters
    if cleaned['w'] & constants.WHERE_DISCUSSION:
        if cleaned['author']:
            discussion_s = discussion_s.filter(author_ord=cleaned['author'])

        if cleaned['thread_type']:
            if constants.DISCUSSION_STICKY in cleaned['thread_type']:
                discussion_s = discussion_s.filter(is_sticky=1)

            if constants.DISCUSSION_LOCKED in cleaned['thread_type']:
                discussion_s = discussion_s.filter(is_locked=1)

        if cleaned['forum']:
            discussion_s = discussion_s.filter(forum_id__in=cleaned['forum'])

    # Filters common to support and discussion forums
    # Created filter
    unix_now = int(time.time())
    interval_filters = (
        ('created', cleaned['created'], cleaned['created_date']),
        ('updated', cleaned['updated'], cleaned['updated_date']),
        ('question_votes', cleaned['num_voted'], cleaned['num_votes']))
    for filter_name, filter_option, filter_date in interval_filters:
        if filter_option == constants.INTERVAL_BEFORE:
            before = {filter_name + '__gte': 0,
                      filter_name + '__lte': max(filter_date, 0)}
            if filter_name != 'question_votes':
                discussion_s = discussion_s.filter(**before)
            question_s = question_s.filter(**before)
        elif filter_option == constants.INTERVAL_AFTER:
            after = {filter_name + '__gte': min(filter_date, unix_now),
                     filter_name + '__lte': unix_now}
            if filter_name != 'question_votes':
                discussion_s = discussion_s.filter(**after)
            question_s = question_s.filter(**after)

    sortby = smart_int(request.GET.get('sortby'))
    try:
        max_results = settings.SEARCH_MAX_RESULTS
        cleaned_q = cleaned['q']

        if cleaned['w'] & constants.WHERE_WIKI:
            wiki_s = wiki_s.query(cleaned_q)[:max_results]
            # Execute the query and append to documents
            documents += [('wiki', (pair[0], pair[1]))
                          for pair in enumerate(wiki_s.object_ids())]

        if cleaned['w'] & constants.WHERE_SUPPORT:
            # Sort results by
            try:
                question_s = question_s.order_by(
                    *constants.SORT_QUESTIONS[sortby])
            except IndexError:
                pass

            question_s = question_s.highlight(
                'content',
                before_match='<b>',
                after_match='</b>',
                limit=settings.SEARCH_SUMMARY_LENGTH)

            question_s = question_s.query(cleaned_q)[:max_results]

            documents += [('question', (pair[0], pair[1]))
                          for pair in enumerate(question_s.object_ids())]

        if cleaned['w'] & constants.WHERE_DISCUSSION:
            # Sort results by
            try:
                # Note that the first attribute needs to be the same
                # here and in forums/models.py discussion_search.
                discussion_s = discussion_s.group_by(
                    'thread_id', constants.GROUPSORT[sortby])
            except IndexError:
                pass

            discussion_s = discussion_s.highlight(
                'content',
                before_match='<b>',
                after_match='</b>',
                limit=settings.SEARCH_SUMMARY_LENGTH)

            discussion_s = discussion_s.query(cleaned_q)[:max_results]

            documents += [('discussion', (pair[0], pair[1]))
                          for pair in enumerate(discussion_s.object_ids())]

    except SearchError:
        if is_json:
            return HttpResponse(
                json.dumps({'error': _('Search Unavailable')}),
                mimetype=mimetype, status=503)

        t = 'search/mobile/down.html' if request.MOBILE \
            else 'search/down.html'
        return jingo.render(request, t, {'q': cleaned['q']}, status=503)

    pages = paginate(request, documents, settings.SEARCH_RESULTS_PER_PAGE)

    # Build a dict of { type_ -> list of indexes } for the specific
    # docs that we're going to display on this page. This makes it
    # easy for us to slice the appropriate search Ss so we're limiting
    # our db hits to just the items we're showing.
    documents_dict = {}
    for doc in documents[offset:offset + settings.SEARCH_RESULTS_PER_PAGE]:
        documents_dict.setdefault(doc[0], []).append(doc[1][0])

    docs_for_page = []
    for type_, search_s in [('wiki', wiki_s),
                            ('question', question_s),
                            ('discussion', discussion_s)]:
        if type_ not in documents_dict:
            continue

        # documents_dict[type_] is a list of indexes--one for each
        # object id search result for that type_. We use the values
        # at the beginning and end of the list for slice boundaries.
        begin = documents_dict[type_][0]
        end = documents_dict[type_][-1] + 1

        docs_for_page += [(type_, doc) for doc in search_s[begin:end]]

    results = []
    for i, docinfo in enumerate(docs_for_page):
        rank = i + offset
        type_, doc = docinfo
        try:
            if type_ == 'wiki':
                summary = doc.current_revision.summary
                result = {
                    'search_summary': summary,
                    'url': doc.get_absolute_url(),
                    'title': doc.title,
                    'type': 'document',
                    'rank': rank,
                    'object': doc,
                }
                results.append(result)
            elif type_ == 'question':
                try:
                    excerpt = question_s.excerpt(doc)[0]
                except ExcerptTimeoutError:
                    statsd.incr('search.excerpt.timeout')
                    excerpt = u''
                except ExcerptSocketError:
                    statsd.incr('search.excerpt.socketerror')
                    excerpt = u''

                summary = jinja2.Markup(clean_excerpt(excerpt))

                result = {
                    'search_summary': summary,
                    'url': doc.get_absolute_url(),
                    'title': doc.title,
                    'type': 'question',
                    'rank': rank,
                    'object': doc,
                }
                results.append(result)
            else:
                # discussion_s is based on Post--not Thread, so we have
                # to get this manually.
                thread = Thread.objects.get(pk=doc.thread_id)

                try:
                    excerpt = discussion_s.excerpt(doc)[0]
                except ExcerptTimeoutError:
                    statsd.incr('search.excerpt.timeout')
                    excerpt = u''
                except ExcerptSocketError:
                    statsd.incr('search.excerpt.socketerror')
                    excerpt = u''

                summary = jinja2.Markup(clean_excerpt(excerpt))

                result = {
                    'search_summary': summary,
                    'url': thread.get_absolute_url(),
                    'title': thread.title,
                    'type': 'thread',
                    'rank': rank,
                    'object': thread,
                }
                results.append(result)
        except IndexError:
            break
        except ObjectDoesNotExist:
            continue

    items = [(k, v) for k in search_form.fields for v in r.getlist(k)
             if v and k != 'a']
    items.append(('a', '2'))

    if is_json:
        # Models are not json serializable.
        for r in results:
            del r['object']
        data = {}
        data['results'] = results
        data['total'] = len(results)
        data['query'] = cleaned['q']
        if not results:
            data['message'] = _('No pages matched the search criteria')
        json_data = json.dumps(data)
        if callback:
            json_data = callback + '(' + json_data + ');'

        return HttpResponse(json_data, mimetype=mimetype)

    results_ = jingo.render(request, template,
                            {'num_results': len(documents),
                             'results': results,
                             'q': cleaned['q'],
                             'pages': pages,
                             'w': cleaned['w'],
                             'search_form': search_form,
                             'lang_name': lang_name})
    results_['Cache-Control'] = ('max-age=%s' %
                                 (settings.SEARCH_CACHE_PERIOD * 60))
    results_['Expires'] = (
        (datetime.utcnow() +
         timedelta(minutes=settings.SEARCH_CACHE_PERIOD))
        .strftime(expires_fmt))
    results_.set_cookie(settings.LAST_SEARCH_COOKIE, urlquote(cleaned['q']),
                        max_age=3600, secure=False, httponly=False)
    return results_
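# Worked example (illustrative data) of the slicing bookkeeping above.
# 'documents' pairs each hit with its index within its own search type:
documents = [('wiki', (0, 11)), ('question', (0, 7)),
             ('wiki', (1, 12)), ('question', (1, 8))]
offset, per_page = 0, 3
documents_dict = {}
for doc in documents[offset:offset + per_page]:
    documents_dict.setdefault(doc[0], []).append(doc[1][0])
# Only the per-type indexes on this page survive, so each search S gets
# sliced to exactly the rows being rendered:
assert documents_dict == {'wiki': [0, 1], 'question': [0]}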
def _search_suggestions(request, query, locale, category_tags):
    """Return an iterable of the most relevant wiki pages and questions.

    query -- full text to search on
    locale -- locale to limit to
    category_tags -- list of tags to filter both result types on

    Items are dicts of:
        {
            'type':
            'search_summary':
            'title':
            'url':
            'object':
        }

    Returns up to 3 wiki pages, then up to 3 questions.

    """
    if waffle.flag_is_active(request, 'elasticsearch'):
        engine = 'elastic'
        question_s = Question.search()
        wiki_s = Document.search()
    else:
        engine = 'sphinx'
        question_s = question_searcher(request)
        wiki_s = wiki_searcher(request)

    # Max number of search results per type.
    WIKI_RESULTS = QUESTIONS_RESULTS = 3

    # Apply category filters
    if category_tags:
        question_s = question_s.filter(tag__in=category_tags)
        wiki_s = wiki_s.filter(tag__in=category_tags)

    try:
        raw_results = (
            wiki_s.filter(locale=locale,
                          category__in=settings.SEARCH_DEFAULT_CATEGORIES)
                  .query(query)
                  .values_dict('id')[:WIKI_RESULTS])

        results = []
        for r in raw_results:
            try:
                doc = (Document.objects.select_related('current_revision')
                                       .get(pk=r['id']))
                results.append({
                    'search_summary': clean_excerpt(
                        doc.current_revision.summary),
                    'url': doc.get_absolute_url(),
                    'title': doc.title,
                    'type': 'document',
                    'object': doc,
                })
            except Document.DoesNotExist:
                pass

        # Note: Questions app is en-US only.
        raw_results = (question_s.query(query)
                                 .values_dict('id')[:QUESTIONS_RESULTS])
        for r in raw_results:
            try:
                q = Question.objects.get(pk=r['id'])
                results.append({
                    'search_summary': clean_excerpt(q.content[0:500]),
                    'url': q.get_absolute_url(),
                    'title': q.title,
                    'type': 'question',
                    'object': q
                })
            except Question.DoesNotExist:
                pass
    except (SearchError, ESTimeoutError, ESMaxRetryError,
            ESException) as exc:
        if isinstance(exc, SearchError):
            statsd.incr('questions.suggestions.%s.searcherror' % engine)
        elif isinstance(exc, ESTimeoutError):
            statsd.incr('questions.suggestions.%s.timeouterror' % engine)
        elif isinstance(exc, ESMaxRetryError):
            statsd.incr('questions.suggestions.%s.maxretryerror' % engine)
        elif isinstance(exc, ESException):
            statsd.incr('questions.suggestions.%s.elasticsearchexception'
                        % engine)
        return []

    return results
def _search_suggestions(request, text, locale, product_slugs):
    """Return an iterable of the most relevant wiki pages and questions.

    :arg text: full text to search on
    :arg locale: locale to limit to
    :arg product_slugs: list of product slugs to filter articles on
        (["desktop", "mobile", ...])

    Items are dicts of::

        {
            'type':
            'search_summary':
            'title':
            'url':
            'object':
        }

    :returns: up to 3 wiki pages, then up to 3 questions.

    """
    # TODO: this can be reworked to pull data from ES rather than
    # hit the db.
    question_s = Question.search()
    wiki_s = Document.search()

    # Max number of search results per type.
    WIKI_RESULTS = QUESTIONS_RESULTS = 3
    default_categories = settings.SEARCH_DEFAULT_CATEGORIES

    # Apply product filters
    if product_slugs:
        wiki_s = wiki_s.filter(product__in=product_slugs)
        question_s = question_s.filter(product__in=product_slugs)

    results = []
    try:
        # Search for relevant KB documents.
        query = dict(('%s__text' % field, text)
                     for field in Document.get_query_fields())
        query.update(dict(('%s__text_phrase' % field, text)
                          for field in Document.get_query_fields()))
        filter = F()
        filter |= F(document_locale=locale)
        filter |= F(document_locale=settings.WIKI_DEFAULT_LANGUAGE)
        filter &= F(document_category__in=default_categories)
        filter &= F(document_is_archived=False)

        raw_results = (
            wiki_s.filter(filter)
                  .query(or_=query)
                  .values_dict('id')[:WIKI_RESULTS])
        for r in raw_results:
            try:
                doc = (Document.objects.select_related('current_revision')
                                       .get(pk=r['id']))
                results.append({
                    'search_summary': clean_excerpt(
                        doc.current_revision.summary),
                    'url': doc.get_absolute_url(),
                    'title': doc.title,
                    'type': 'document',
                    'object': doc,
                })
            except Document.DoesNotExist:
                pass

        # Search for relevant questions.
        query = dict(('%s__text' % field, text)
                     for field in Question.get_query_fields())
        query.update(dict(('%s__text_phrase' % field, text)
                          for field in Question.get_query_fields()))

        max_age = int(time.time()) - settings.SEARCH_DEFAULT_MAX_QUESTION_AGE

        # Filter questions by language. Questions should be either in
        # English or in the locale's language. This is because we have
        # some questions marked English that are really in other
        # languages. The assumption is that if a native speaker submits
        # a query in a given language, items written in that language
        # will automatically match better, so questions incorrectly
        # marked as English can be found too.
        question_filter = F(question_locale=locale)
        question_filter |= F(question_locale=settings.WIKI_DEFAULT_LANGUAGE)
        question_filter &= F(updated__gte=max_age)

        raw_results = (question_s
                       .query(or_=query)
                       .filter(question_filter)
                       .values_dict('id')[:QUESTIONS_RESULTS])
        for r in raw_results:
            try:
                q = Question.objects.get(pk=r['id'])
                results.append({
                    'search_summary': clean_excerpt(q.content[0:500]),
                    'url': q.get_absolute_url(),
                    'title': q.title,
                    'type': 'question',
                    'object': q,
                    'is_solved': q.is_solved,
                    'num_answers': q.num_answers,
                    'num_votes': q.num_votes,
                    'num_votes_past_week': q.num_votes_past_week
                })
            except Question.DoesNotExist:
                pass

    except (ESTimeoutError, ESMaxRetryError, ESException) as exc:
        if isinstance(exc, ESTimeoutError):
            statsd.incr('questions.suggestions.timeouterror')
        elif isinstance(exc, ESMaxRetryError):
            statsd.incr('questions.suggestions.maxretryerror')
        elif isinstance(exc, ESException):
            statsd.incr('questions.suggestions.elasticsearchexception')
        log.debug(exc)

    return results
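# Sketch of the elasticutils F() combination above, in isolation: locale
# clauses are OR'd, then AND'd with the category and archive filters.
# The values are illustrative.
from elasticutils import F

f = F(document_locale='de') | F(document_locale='en-US')
f &= F(document_category__in=[10, 20])
f &= F(document_is_archived=False)
# wiki_s.filter(f) then applies:
#   (locale = de OR locale = en-US) AND category IN (10, 20)
#   AND is_archived = False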
def _search_suggestions_es(request, query, locale, category_tags):
    """See _search_suggestions."""
    # TODO: this can be reworked to pull data from ES rather than
    # hit the db.
    engine = 'elastic'
    question_s = Question.search()
    wiki_s = Document.search()

    # Max number of search results per type.
    WIKI_RESULTS = QUESTIONS_RESULTS = 3
    default_categories = settings.SEARCH_DEFAULT_CATEGORIES

    # Apply category filters
    if category_tags:
        question_s = question_s.filter(question_tag__in=category_tags)
        wiki_s = wiki_s.filter(document_tag__in=category_tags)

    try:
        raw_results = (
            wiki_s.filter(document_locale=locale,
                          document_category__in=default_categories)
                  .query(query)
                  .values_dict('id')[:WIKI_RESULTS])

        results = []
        for r in raw_results:
            try:
                doc = (Document.objects.select_related('current_revision')
                                       .get(pk=r['id']))
                results.append({
                    'search_summary': clean_excerpt(
                        doc.current_revision.summary),
                    'url': doc.get_absolute_url(),
                    'title': doc.title,
                    'type': 'document',
                    'object': doc,
                })
            except Document.DoesNotExist:
                pass

        # Note: Questions app is en-US only.
        raw_results = (question_s.query(query)
                                 .values_dict('id')[:QUESTIONS_RESULTS])
        for r in raw_results:
            try:
                q = Question.objects.get(pk=r['id'])
                results.append({
                    'search_summary': clean_excerpt(q.content[0:500]),
                    'url': q.get_absolute_url(),
                    'title': q.title,
                    'type': 'question',
                    'object': q,
                    'is_solved': q.is_solved,
                    'num_answers': q.num_answers,
                    'num_votes': q.num_votes,
                    'num_votes_past_week': q.num_votes_past_week
                })
            except Question.DoesNotExist:
                pass
    except (SearchError, ESTimeoutError, ESMaxRetryError,
            ESException) as exc:
        if isinstance(exc, SearchError):
            statsd.incr('questions.suggestions.%s.searcherror' % engine)
        elif isinstance(exc, ESTimeoutError):
            statsd.incr('questions.suggestions.%s.timeouterror' % engine)
        elif isinstance(exc, ESMaxRetryError):
            statsd.incr('questions.suggestions.%s.maxretryerror' % engine)
        elif isinstance(exc, ESException):
            statsd.incr('questions.suggestions.%s.elasticsearchexception'
                        % engine)
        return []

    return results
def search(request, template=None):
    """Performs search or displays the search form."""
    # JSON-specific variables
    is_json = (request.GET.get('format') == 'json')
    callback = request.GET.get('callback', '').strip()
    mimetype = 'application/x-javascript' if callback else 'application/json'

    if waffle.flag_is_active(request, 'elasticsearch'):
        engine = 'elastic'
    else:
        engine = 'sphinx'

    # Search "Expires" header format
    expires_fmt = '%A, %d %B %Y %H:%M:%S GMT'

    # Check callback is valid
    if is_json and callback and not jsonp_is_valid(callback):
        return HttpResponse(
            json.dumps({'error': _('Invalid callback function.')}),
            mimetype=mimetype, status=400)

    language = locale_or_default(request.GET.get('language', request.locale))
    r = request.GET.copy()
    a = request.GET.get('a', '0')

    # Search default values
    try:
        category = (map(int, r.getlist('category')) or
                    settings.SEARCH_DEFAULT_CATEGORIES)
    except ValueError:
        category = settings.SEARCH_DEFAULT_CATEGORIES
    r.setlist('category', category)

    # Basic form
    if a == '0':
        r['w'] = r.get('w', constants.WHERE_BASIC)
    # Advanced form
    if a == '2':
        r['language'] = language
        r['a'] = '1'

    # TODO: Rewrite so SearchForm is unbound initially and we can use
    # `initial` on the form fields.
    if 'include_archived' not in r:
        r['include_archived'] = False

    search_form = SearchForm(r)

    if not search_form.is_valid() or a == '2':
        if is_json:
            return HttpResponse(
                json.dumps({'error': _('Invalid search data.')}),
                mimetype=mimetype, status=400)

        t = template if request.MOBILE else 'search/form.html'
        search_ = jingo.render(request, t,
                               {'advanced': a, 'request': request,
                                'search_form': search_form})
        search_['Cache-Control'] = ('max-age=%s' %
                                    (settings.SEARCH_CACHE_PERIOD * 60))
        search_['Expires'] = (
            (datetime.utcnow() +
             timedelta(minutes=settings.SEARCH_CACHE_PERIOD))
            .strftime(expires_fmt))
        return search_

    cleaned = search_form.cleaned_data
    page = max(smart_int(request.GET.get('page')), 1)
    offset = (page - 1) * settings.SEARCH_RESULTS_PER_PAGE

    # get language name for display in template
    lang = language.lower()
    if settings.LANGUAGES.get(lang):
        lang_name = settings.LANGUAGES[lang]
    else:
        lang_name = ''

    wiki_s = wiki_searcher(request)
    question_s = question_searcher(request)
    discussion_s = discussion_searcher(request)

    documents = []

    # wiki filters
    # Category filter
    if cleaned['category']:
        wiki_s = wiki_s.filter(category__in=cleaned['category'])

    # Locale filter
    wiki_s = wiki_s.filter(locale=language)

    # Product filter
    products = cleaned['product']
    for p in products:
        wiki_s = wiki_s.filter(tag=p)

    # Tags filter
    tags = [t.strip() for t in cleaned['tags'].split()]
    for t in tags:
        wiki_s = wiki_s.filter(tag=t)

    # Archived bit
    if a == '0' and not cleaned['include_archived']:
        # Default to NO for basic search:
        cleaned['include_archived'] = False
    if not cleaned['include_archived']:
        wiki_s = wiki_s.filter(is_archived=False)
    # End of wiki filters

    # Support questions specific filters
    if cleaned['w'] & constants.WHERE_SUPPORT:
        # Solved is set by default if using basic search
        if a == '0' and not cleaned['has_helpful']:
            cleaned['has_helpful'] = constants.TERNARY_YES

        # These filters are ternary, they can be either YES, NO, or OFF
        ternary_filters = ('is_locked', 'is_solved', 'has_answers',
                           'has_helpful')
        d = dict((filter_name, _ternary_filter(cleaned[filter_name]))
                 for filter_name in ternary_filters
                 if cleaned[filter_name])
        if d:
            question_s = question_s.filter(**d)

        if cleaned['asked_by']:
            question_s = question_s.filter(
                question_creator=cleaned['asked_by'])

        if cleaned['answered_by']:
            question_s = question_s.filter(
                answer_creator=cleaned['answered_by'])

        q_tags = [t.strip() for t in cleaned['q_tags'].split()]
        for t in q_tags:
            question_s = question_s.filter(tag=t)

    # Discussion forum specific filters
    if cleaned['w'] & constants.WHERE_DISCUSSION:
        if cleaned['author']:
            discussion_s = discussion_s.filter(author_ord=cleaned['author'])

        if cleaned['thread_type']:
            if constants.DISCUSSION_STICKY in cleaned['thread_type']:
                discussion_s = discussion_s.filter(is_sticky=1)

            if constants.DISCUSSION_LOCKED in cleaned['thread_type']:
                discussion_s = discussion_s.filter(is_locked=1)

        if cleaned['forum']:
            discussion_s = discussion_s.filter(forum_id__in=cleaned['forum'])

    # Filters common to support and discussion forums
    # Created filter
    unix_now = int(time.time())
    interval_filters = (
        ('created', cleaned['created'], cleaned['created_date']),
        ('updated', cleaned['updated'], cleaned['updated_date']),
        ('question_votes', cleaned['num_voted'], cleaned['num_votes']))
    for filter_name, filter_option, filter_date in interval_filters:
        if filter_option == constants.INTERVAL_BEFORE:
            before = {filter_name + '__gte': 0,
                      filter_name + '__lte': max(filter_date, 0)}
            if filter_name != 'question_votes':
                discussion_s = discussion_s.filter(**before)
            question_s = question_s.filter(**before)
        elif filter_option == constants.INTERVAL_AFTER:
            after = {filter_name + '__gte': min(filter_date, unix_now),
                     filter_name + '__lte': unix_now}
            if filter_name != 'question_votes':
                discussion_s = discussion_s.filter(**after)
            question_s = question_s.filter(**after)

    sortby = smart_int(request.GET.get('sortby'))
    try:
        max_results = settings.SEARCH_MAX_RESULTS
        cleaned_q = cleaned['q']

        if cleaned['w'] & constants.WHERE_WIKI:
            if cleaned_q:
                wiki_s = wiki_s.query(cleaned_q)
            wiki_s = wiki_s[:max_results]
            # Execute the query and append to documents
            documents += [('wiki', (pair[0], pair[1]))
                          for pair in enumerate(wiki_s.object_ids())]

        if cleaned['w'] & constants.WHERE_SUPPORT:
            # Sort results by
            try:
                question_s = question_s.order_by(
                    *constants.SORT_QUESTIONS[sortby])
            except IndexError:
                pass

            if engine == 'elastic':
                highlight_fields = ['title', 'question_content',
                                    'answer_content']
            else:
                highlight_fields = ['content']

            question_s = question_s.highlight(
                *highlight_fields,
                before_match='<b>',
                after_match='</b>',
                limit=settings.SEARCH_SUMMARY_LENGTH)

            if cleaned_q:
                question_s = question_s.query(cleaned_q)
            question_s = question_s[:max_results]

            documents += [('question', (pair[0], pair[1]))
                          for pair in enumerate(question_s.object_ids())]

        if cleaned['w'] & constants.WHERE_DISCUSSION:
            # Sort results by
            try:
                # Note that the first attribute needs to be the same
                # here and in forums/models.py discussion_search.
                discussion_s = discussion_s.group_by(
                    'thread_id', constants.GROUPSORT[sortby])
            except IndexError:
                pass

            discussion_s = discussion_s.highlight(
                'content',
                before_match='<b>',
                after_match='</b>',
                limit=settings.SEARCH_SUMMARY_LENGTH)

            if cleaned_q:
                discussion_s = discussion_s.query(cleaned_q)
            discussion_s = discussion_s[:max_results]

            documents += [('discussion', (pair[0], pair[1]))
                          for pair in enumerate(discussion_s.object_ids())]

        pages = paginate(request, documents,
                         settings.SEARCH_RESULTS_PER_PAGE)

        # Build a dict of { type_ -> list of indexes } for the specific
        # docs that we're going to display on this page. This makes it
        # easy for us to slice the appropriate search Ss so we're
        # limiting our db hits to just the items we're showing.
        documents_dict = {}
        for doc in documents[offset:
                             offset + settings.SEARCH_RESULTS_PER_PAGE]:
            documents_dict.setdefault(doc[0], []).append(doc[1][0])

        docs_for_page = []
        for kind, search_s in [('wiki', wiki_s),
                               ('question', question_s),
                               ('discussion', discussion_s)]:
            if kind not in documents_dict:
                continue

            # documents_dict[kind] is a list of indexes--one for each
            # object id search result for that kind. We use the values
            # at the beginning and end of the list for slice boundaries.
            begin = documents_dict[kind][0]
            end = documents_dict[kind][-1] + 1

            search_s = search_s[begin:end]

            if engine == 'elastic':
                # If we're doing elasticsearch, then we need to update
                # the _s variables to point to the sliced versions of
                # S so that, when we iterate over them in the
                # following list comp, we hang onto the version that
                # does the query, so we can call excerpt() on it
                # later.
                #
                # We only need to do this with elasticsearch. For
                # Sphinx, search_s at this point is an ObjectResults
                # and not an S because we've already acquired
                # object_ids on it. Thus if we update the _s
                # variables, we'd be pointing to the ObjectResults and
                # not the S and then excerpting breaks.
                #
                # Ugh.
                if kind == 'wiki':
                    wiki_s = search_s
                elif kind == 'question':
                    question_s = search_s
                elif kind == 'discussion':
                    discussion_s = search_s

            docs_for_page += [(kind, doc) for doc in search_s]

        results = []
        for i, docinfo in enumerate(docs_for_page):
            rank = i + offset
            type_, doc = docinfo
            try:
                if type_ == 'wiki':
                    summary = doc.current_revision.summary
                    result = {
                        'search_summary': summary,
                        'url': doc.get_absolute_url(),
                        'title': doc.title,
                        'type': 'document',
                        'rank': rank,
                        'object': doc,
                    }
                    results.append(result)
                elif type_ == 'question':
                    try:
                        excerpt = excerpt_joiner.join(
                            [m for m in chain(*question_s.excerpt(doc))
                             if m])
                    except ExcerptTimeoutError:
                        statsd.incr('search.excerpt.timeout')
                        excerpt = u''
                    except ExcerptSocketError:
                        statsd.incr('search.excerpt.socketerror')
                        excerpt = u''

                    summary = jinja2.Markup(clean_excerpt(excerpt))

                    result = {
                        'search_summary': summary,
                        'url': doc.get_absolute_url(),
                        'title': doc.title,
                        'type': 'question',
                        'rank': rank,
                        'object': doc,
                    }
                    results.append(result)
                else:
                    if engine == 'elastic':
                        thread = doc
                    else:
                        thread = Thread.objects.get(pk=doc.thread_id)

                    try:
                        excerpt = excerpt_joiner.join(
                            [m for m in chain(*discussion_s.excerpt(doc))])
                    except ExcerptTimeoutError:
                        statsd.incr('search.excerpt.timeout')
                        excerpt = u''
                    except ExcerptSocketError:
                        statsd.incr('search.excerpt.socketerror')
                        excerpt = u''

                    summary = jinja2.Markup(clean_excerpt(excerpt))

                    result = {
                        'search_summary': summary,
                        'url': thread.get_absolute_url(),
                        'title': thread.title,
                        'type': 'thread',
                        'rank': rank,
                        'object': thread,
                    }
                    results.append(result)
            except IndexError:
                break
            except ObjectDoesNotExist:
                continue

    except (SearchError, ESTimeoutError, ESMaxRetryError) as exc:
        # Handle timeout and all those other transient errors with a
        # "Search Unavailable" rather than a Django error page.
        if is_json:
            return HttpResponse(
                json.dumps({'error': _('Search Unavailable')}),
                mimetype=mimetype, status=503)

        if isinstance(exc, SearchError):
            statsd.incr('search.%s.searcherror' % engine)
        elif isinstance(exc, ESTimeoutError):
            statsd.incr('search.%s.timeouterror' % engine)
        elif isinstance(exc, ESMaxRetryError):
            statsd.incr('search.%s.maxretryerror' % engine)

        t = 'search/mobile/down.html' if request.MOBILE \
            else 'search/down.html'
        return jingo.render(request, t, {'q': cleaned['q']}, status=503)
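# Toy analogue (not real elasticutils) of why the rebinding above is
# needed: slicing an S returns a *new* object, and only the sliced
# object can excerpt the rows it fetched.
class FakeS(object):
    def __init__(self, bounds=None):
        self.bounds = bounds

    def __getitem__(self, k):
        # Like elasticutils S, slicing builds a new search object.
        return FakeS((k.start, k.stop))

question_s = FakeS()
sliced = question_s[0:3]
assert sliced is not question_s
# Calling excerpt() on the old, unsliced question_s would not see the
# query the slice executed; hence wiki_s/question_s/discussion_s are
# re-pointed at their sliced versions above.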
def _search_suggestions(request, query, locale, tags, product_slugs):
    """Return an iterable of the most relevant wiki pages and questions.

    query -- full text to search on
    locale -- locale to limit to
    tags -- list of tags to filter questions on
    product_slugs -- list of product slugs to filter articles on
        (["desktop", "mobile", ...])

    Items are dicts of:
        {
            'type':
            'search_summary':
            'title':
            'url':
            'object':
        }

    Returns up to 3 wiki pages, then up to 3 questions.

    """
    # TODO: this can be reworked to pull data from ES rather than
    # hit the db.
    question_s = Question.search()
    wiki_s = Document.search()

    # Max number of search results per type.
    WIKI_RESULTS = QUESTIONS_RESULTS = 3
    default_categories = settings.SEARCH_DEFAULT_CATEGORIES

    # Apply product filters
    if product_slugs:
        wiki_s = wiki_s.filter(document_product__in=product_slugs)
    if tags:
        question_s = question_s.filter(question_tag__in=tags)

    try:
        raw_results = (
            wiki_s.filter(document_locale=locale,
                          document_category__in=default_categories)
            .query(query)
            .values_dict("id")[:WIKI_RESULTS]
        )

        results = []
        for r in raw_results:
            try:
                doc = (Document.objects
                       .select_related("current_revision")
                       .get(pk=r["id"]))
                results.append(
                    {
                        "search_summary": clean_excerpt(
                            doc.current_revision.summary),
                        "url": doc.get_absolute_url(),
                        "title": doc.title,
                        "type": "document",
                        "object": doc,
                    }
                )
            except Document.DoesNotExist:
                pass

        # Note: Questions app is en-US only.
        raw_results = (question_s.query(query)
                       .values_dict("id")[:QUESTIONS_RESULTS])
        for r in raw_results:
            try:
                q = Question.objects.get(pk=r["id"])
                results.append(
                    {
                        "search_summary": clean_excerpt(q.content[0:500]),
                        "url": q.get_absolute_url(),
                        "title": q.title,
                        "type": "question",
                        "object": q,
                        "is_solved": q.is_solved,
                        "num_answers": q.num_answers,
                        "num_votes": q.num_votes,
                        "num_votes_past_week": q.num_votes_past_week,
                    }
                )
            except Question.DoesNotExist:
                pass
    except (ESTimeoutError, ESMaxRetryError, ESException) as exc:
        if isinstance(exc, ESTimeoutError):
            statsd.incr("questions.suggestions.timeouterror")
        elif isinstance(exc, ESMaxRetryError):
            statsd.incr("questions.suggestions.maxretryerror")
        elif isinstance(exc, ESException):
            statsd.incr("questions.suggestions.elasticsearchexception")
        return []

    return results
def _search_suggestions(request, query, locale, tags, product_slugs):
    """Return an iterable of the most relevant wiki pages and questions.

    query -- full text to search on
    locale -- locale to limit to
    tags -- list of tags to filter questions on
    product_slugs -- list of product slugs to filter articles on
        (["desktop", "mobile", ...])

    Items are dicts of:
        {
            'type':
            'search_summary':
            'title':
            'url':
            'object':
        }

    Returns up to 3 wiki pages, then up to 3 questions.

    """
    # TODO: this can be reworked to pull data from ES rather than
    # hit the db.
    question_s = Question.search()
    wiki_s = Document.search()

    # Max number of search results per type.
    WIKI_RESULTS = QUESTIONS_RESULTS = 3
    default_categories = settings.SEARCH_DEFAULT_CATEGORIES

    # Apply product filters
    if product_slugs:
        wiki_s = wiki_s.filter(document_product__in=product_slugs)
    if tags:
        question_s = question_s.filter(question_tag__in=tags)

    try:
        raw_results = (
            wiki_s.filter(document_locale=locale,
                          document_category__in=default_categories)
                  .query(query)
                  .values_dict('id')[:WIKI_RESULTS])

        results = []
        for r in raw_results:
            try:
                doc = (Document.objects.select_related('current_revision')
                                       .get(pk=r['id']))
                results.append({
                    'search_summary': clean_excerpt(
                        doc.current_revision.summary),
                    'url': doc.get_absolute_url(),
                    'title': doc.title,
                    'type': 'document',
                    'object': doc,
                })
            except Document.DoesNotExist:
                pass

        # Note: Questions app is en-US only.
        raw_results = (question_s.query(query)
                                 .values_dict('id')[:QUESTIONS_RESULTS])
        for r in raw_results:
            try:
                q = Question.objects.get(pk=r['id'])
                results.append({
                    'search_summary': clean_excerpt(q.content[0:500]),
                    'url': q.get_absolute_url(),
                    'title': q.title,
                    'type': 'question',
                    'object': q,
                    'is_solved': q.is_solved,
                    'num_answers': q.num_answers,
                    'num_votes': q.num_votes,
                    'num_votes_past_week': q.num_votes_past_week
                })
            except Question.DoesNotExist:
                pass
    except (ESTimeoutError, ESMaxRetryError, ESException) as exc:
        if isinstance(exc, ESTimeoutError):
            statsd.incr('questions.suggestions.timeouterror')
        elif isinstance(exc, ESMaxRetryError):
            statsd.incr('questions.suggestions.maxretryerror')
        elif isinstance(exc, ESException):
            statsd.incr('questions.suggestions.elasticsearchexception')
        return []

    return results
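# The id-then-hydrate pattern above in isolation: pull only ids from the
# search engine, then fetch full rows from the db, skipping ids whose
# rows have since been deleted. A hypothetical helper, not project code.
def hydrate(raw_results, manager):
    objs = []
    for r in raw_results:
        try:
            objs.append(manager.get(pk=r['id']))
        except manager.model.DoesNotExist:
            pass
    return objs

# e.g. hydrate(raw_results, Question.objects) mirrors the loops above.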
def search(request, template=None):
    """Performs search or displays the search form."""
    # JSON-specific variables
    is_json = (request.GET.get('format') == 'json')
    callback = request.GET.get('callback', '').strip()
    mimetype = 'application/x-javascript' if callback else 'application/json'

    if waffle.flag_is_active(request, 'elasticsearch'):
        engine = 'elastic'
    else:
        engine = 'sphinx'

    # Search "Expires" header format
    expires_fmt = '%A, %d %B %Y %H:%M:%S GMT'

    # Check callback is valid
    if is_json and callback and not jsonp_is_valid(callback):
        return HttpResponse(
            json.dumps({'error': _('Invalid callback function.')}),
            mimetype=mimetype, status=400)

    language = locale_or_default(request.GET.get('language', request.locale))
    r = request.GET.copy()
    a = request.GET.get('a', '0')

    # Search default values
    try:
        category = (map(int, r.getlist('category')) or
                    settings.SEARCH_DEFAULT_CATEGORIES)
    except ValueError:
        category = settings.SEARCH_DEFAULT_CATEGORIES
    r.setlist('category', category)

    # Basic form
    if a == '0':
        r['w'] = r.get('w', constants.WHERE_BASIC)
    # Advanced form
    if a == '2':
        r['language'] = language
        r['a'] = '1'

    # TODO: Rewrite so SearchForm is unbound initially and we can use
    # `initial` on the form fields.
    if 'include_archived' not in r:
        r['include_archived'] = False

    search_form = SearchForm(r)

    if not search_form.is_valid() or a == '2':
        if is_json:
            return HttpResponse(
                json.dumps({'error': _('Invalid search data.')}),
                mimetype=mimetype, status=400)

        t = template if request.MOBILE else 'search/form.html'
        search_ = jingo.render(request, t,
                               {'advanced': a, 'request': request,
                                'search_form': search_form})
        search_['Cache-Control'] = ('max-age=%s' %
                                    (settings.SEARCH_CACHE_PERIOD * 60))
        search_['Expires'] = (
            (datetime.utcnow() +
             timedelta(minutes=settings.SEARCH_CACHE_PERIOD))
            .strftime(expires_fmt))
        return search_

    cleaned = search_form.cleaned_data
    page = max(smart_int(request.GET.get('page')), 1)
    offset = (page - 1) * settings.SEARCH_RESULTS_PER_PAGE

    # get language name for display in template
    lang = language.lower()
    if settings.LANGUAGES.get(lang):
        lang_name = settings.LANGUAGES[lang]
    else:
        lang_name = ''

    wiki_s = wiki_searcher(request)
    question_s = question_searcher(request)
    discussion_s = discussion_searcher(request)

    documents = []

    # wiki filters
    # Category filter
    if cleaned['category']:
        wiki_s = wiki_s.filter(category__in=cleaned['category'])

    # Locale filter
    wiki_s = wiki_s.filter(locale=language)

    # Product filter
    products = cleaned['product']
    for p in products:
        wiki_s = wiki_s.filter(tag=p)

    # Tags filter
    tags = [t.strip() for t in cleaned['tags'].split()]
    for t in tags:
        wiki_s = wiki_s.filter(tag=t)

    # Archived bit
    if a == '0' and not cleaned['include_archived']:
        # Default to NO for basic search:
        cleaned['include_archived'] = False
    if not cleaned['include_archived']:
        wiki_s = wiki_s.filter(is_archived=False)
    # End of wiki filters

    # Support questions specific filters
    if cleaned['w'] & constants.WHERE_SUPPORT:
        # Solved is set by default if using basic search
        if a == '0' and not cleaned['has_helpful']:
            cleaned['has_helpful'] = constants.TERNARY_YES

        # These filters are ternary, they can be either YES, NO, or OFF
        ternary_filters = ('is_locked', 'is_solved', 'has_answers',
                           'has_helpful')
        d = dict((filter_name, _ternary_filter(cleaned[filter_name]))
                 for filter_name in ternary_filters
                 if cleaned[filter_name])
        if d:
            question_s = question_s.filter(**d)

        if cleaned['asked_by']:
            question_s = question_s.filter(
                question_creator=cleaned['asked_by'])

        if cleaned['answered_by']:
            question_s = question_s.filter(
                answer_creator=cleaned['answered_by'])

        q_tags = [t.strip() for t in cleaned['q_tags'].split()]
        for t in q_tags:
            question_s = question_s.filter(tag=t)

    # Discussion forum specific filters
    if cleaned['w'] & constants.WHERE_DISCUSSION:
        if cleaned['author']:
            discussion_s = discussion_s.filter(author_ord=cleaned['author'])

        if cleaned['thread_type']:
            if constants.DISCUSSION_STICKY in cleaned['thread_type']:
                discussion_s = discussion_s.filter(is_sticky=1)

            if constants.DISCUSSION_LOCKED in cleaned['thread_type']:
                discussion_s = discussion_s.filter(is_locked=1)

        if cleaned['forum']:
            discussion_s = discussion_s.filter(forum_id__in=cleaned['forum'])

    # Filters common to support and discussion forums
    # Created filter
    unix_now = int(time.time())
    interval_filters = (
        ('created', cleaned['created'], cleaned['created_date']),
        ('updated', cleaned['updated'], cleaned['updated_date']),
        ('question_votes', cleaned['num_voted'], cleaned['num_votes']))
    for filter_name, filter_option, filter_date in interval_filters:
        if filter_option == constants.INTERVAL_BEFORE:
            before = {filter_name + '__gte': 0,
                      filter_name + '__lte': max(filter_date, 0)}
            if filter_name != 'question_votes':
                discussion_s = discussion_s.filter(**before)
            question_s = question_s.filter(**before)
        elif filter_option == constants.INTERVAL_AFTER:
            after = {filter_name + '__gte': min(filter_date, unix_now),
                     filter_name + '__lte': unix_now}
            if filter_name != 'question_votes':
                discussion_s = discussion_s.filter(**after)
            question_s = question_s.filter(**after)

    sortby = smart_int(request.GET.get('sortby'))
    try:
        max_results = settings.SEARCH_MAX_RESULTS
        cleaned_q = cleaned['q']

        if cleaned['w'] & constants.WHERE_WIKI:
            wiki_s = wiki_s.query(cleaned_q)[:max_results]
            # Execute the query and append to documents
            documents += [('wiki', (pair[0], pair[1]))
                          for pair in enumerate(wiki_s.object_ids())]

        if cleaned['w'] & constants.WHERE_SUPPORT:
            # Sort results by
            try:
                question_s = question_s.order_by(
                    *constants.SORT_QUESTIONS[sortby])
            except IndexError:
                pass

            if engine == 'elastic':
                highlight_fields = ['title', 'question_content',
                                    'answer_content']
            else:
                highlight_fields = ['content']

            question_s = question_s.highlight(
                *highlight_fields,
                before_match='<b>',
                after_match='</b>',
                limit=settings.SEARCH_SUMMARY_LENGTH)

            question_s = question_s.query(cleaned_q)[:max_results]

            documents += [('question', (pair[0], pair[1]))
                          for pair in enumerate(question_s.object_ids())]

        if cleaned['w'] & constants.WHERE_DISCUSSION:
            # Sort results by
            try:
                # Note that the first attribute needs to be the same
                # here and in forums/models.py discussion_search.
                discussion_s = discussion_s.group_by(
                    'thread_id', constants.GROUPSORT[sortby])
            except IndexError:
                pass

            discussion_s = discussion_s.highlight(
                'content',
                before_match='<b>',
                after_match='</b>',
                limit=settings.SEARCH_SUMMARY_LENGTH)

            discussion_s = discussion_s.query(cleaned_q)[:max_results]

            documents += [('discussion', (pair[0], pair[1]))
                          for pair in enumerate(discussion_s.object_ids())]

        pages = paginate(request, documents,
                         settings.SEARCH_RESULTS_PER_PAGE)

        # Build a dict of { type_ -> list of indexes } for the specific
        # docs that we're going to display on this page. This makes it
        # easy for us to slice the appropriate search Ss so we're
        # limiting our db hits to just the items we're showing.
        documents_dict = {}
        for doc in documents[offset:
                             offset + settings.SEARCH_RESULTS_PER_PAGE]:
            documents_dict.setdefault(doc[0], []).append(doc[1][0])

        docs_for_page = []
        for kind, search_s in [('wiki', wiki_s),
                               ('question', question_s),
                               ('discussion', discussion_s)]:
            if kind not in documents_dict:
                continue

            # documents_dict[kind] is a list of indexes--one for each
            # object id search result for that kind. We use the values
            # at the beginning and end of the list for slice boundaries.
            begin = documents_dict[kind][0]
            end = documents_dict[kind][-1] + 1

            search_s = search_s[begin:end]

            if engine == 'elastic':
                # If we're doing elasticsearch, then we need to update
                # the _s variables to point to the sliced versions of
                # S so that, when we iterate over them in the
                # following list comp, we hang onto the version that
                # does the query, so we can call excerpt() on it
                # later.
                #
                # We only need to do this with elasticsearch. For
                # Sphinx, search_s at this point is an ObjectResults
                # and not an S because we've already acquired
                # object_ids on it. Thus if we update the _s
                # variables, we'd be pointing to the ObjectResults and
                # not the S and then excerpting breaks.
                #
                # Ugh.
                if kind == 'wiki':
                    wiki_s = search_s
                elif kind == 'question':
                    question_s = search_s
                elif kind == 'discussion':
                    discussion_s = search_s

            docs_for_page += [(kind, doc) for doc in search_s]

        results = []
        for i, docinfo in enumerate(docs_for_page):
            rank = i + offset
            type_, doc = docinfo
            try:
                if type_ == 'wiki':
                    summary = doc.current_revision.summary
                    result = {
                        'search_summary': summary,
                        'url': doc.get_absolute_url(),
                        'title': doc.title,
                        'type': 'document',
                        'rank': rank,
                        'object': doc,
                    }
                    results.append(result)
                elif type_ == 'question':
                    try:
                        excerpt = excerpt_joiner.join(
                            [m for m in chain(*question_s.excerpt(doc))
                             if m])
                    except ExcerptTimeoutError:
                        statsd.incr('search.excerpt.timeout')
                        excerpt = u''
                    except ExcerptSocketError:
                        statsd.incr('search.excerpt.socketerror')
                        excerpt = u''

                    summary = jinja2.Markup(clean_excerpt(excerpt))

                    result = {
                        'search_summary': summary,
                        'url': doc.get_absolute_url(),
                        'title': doc.title,
                        'type': 'question',
                        'rank': rank,
                        'object': doc,
                    }
                    results.append(result)
                else:
                    if engine == 'elastic':
                        thread = doc
                    else:
                        thread = Thread.objects.get(pk=doc.thread_id)

                    try:
                        excerpt = excerpt_joiner.join(
                            [m for m in chain(*discussion_s.excerpt(doc))])
                    except ExcerptTimeoutError:
                        statsd.incr('search.excerpt.timeout')
                        excerpt = u''
                    except ExcerptSocketError:
                        statsd.incr('search.excerpt.socketerror')
                        excerpt = u''

                    summary = jinja2.Markup(clean_excerpt(excerpt))

                    result = {
                        'search_summary': summary,
                        'url': thread.get_absolute_url(),
                        'title': thread.title,
                        'type': 'thread',
                        'rank': rank,
                        'object': thread,
                    }
                    results.append(result)
            except IndexError:
                break
            except ObjectDoesNotExist:
                continue

    except (SearchError, ESTimeoutError, ESMaxRetryError) as exc:
        # Handle timeout and all those other transient errors with a
        # "Search Unavailable" rather than a Django error page.
        if is_json:
            return HttpResponse(
                json.dumps({'error': _('Search Unavailable')}),
                mimetype=mimetype, status=503)

        if isinstance(exc, SearchError):
            statsd.incr('search.%s.searcherror' % engine)
        elif isinstance(exc, ESTimeoutError):
            statsd.incr('search.%s.timeouterror' % engine)
        elif isinstance(exc, ESMaxRetryError):
            statsd.incr('search.%s.maxretryerror' % engine)

        t = 'search/mobile/down.html' if request.MOBILE \
            else 'search/down.html'
        return jingo.render(request, t, {'q': cleaned['q']}, status=503)
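# The JSONP handling from the 'search' view in miniature: when a
# validated callback is supplied, the JSON payload is wrapped in a
# function call and served as JavaScript. Values are illustrative.
import json

data = {'results': [], 'total': 0, 'query': u'sync'}
callback = 'handleResults'
json_data = json.dumps(data)
if callback:
    json_data = callback + '(' + json_data + ');'
assert json_data.startswith('handleResults(') and json_data.endswith(');')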