def get_context_data(self, **kwargs):
    """Extend the parent context with the page tree and optional search results.

    Adds three keys to the context:

    * ``top_level_pages`` -- ``DocPage`` rows without a parent, ordered by
      ``order``, with two levels of ``children`` prefetched.
    * ``search_results`` -- ``None`` when the ``query`` GET parameter is
      missing or empty; otherwise a ``DocPage`` queryset annotated with
      ``headline``, ``rank``, ``similarity`` and ``combined_score``, ordered
      by descending ``combined_score``.
    * ``query`` -- the raw value of the ``query`` GET parameter.
    """
    context = super().get_context_data(**kwargs)

    root_pages = (
        DocPage.objects.filter(parent__isnull=True)
        .prefetch_related("children__children")
        .order_by("order")
        .all()
    )

    keywords = self.request.GET.get("query")
    results = None
    if keywords:
        ts_query = SearchQuery(keywords)
        ts_vector = SearchVector("title", "content")
        # Trigram similarity is summed over title and content.
        trigram_score = (
            TrigramSimilarity("title", keywords)
            + TrigramSimilarity("content", keywords)
        )
        results = (
            DocPage.objects.all()
            .annotate(headline=SearchHeadline("content", ts_query))
            .annotate(rank=SearchRank(ts_vector, ts_query))
            .annotate(similarity=trigram_score)
            # The ordering key is the mean of both scores.
            .annotate(combined_score=(F("similarity") + F("rank")) / 2)
            # A row qualifies if either score clears its threshold.
            .filter(Q(rank__gt=0.001) | Q(similarity__gt=0.1))
            .order_by("-combined_score")
        )

    context.update(
        top_level_pages=root_pages,
        search_results=results,
        query=keywords,
    )
    return context
def test_headline_untyped_args(self):
    # Expression and query are passed as plain strings rather than
    # F() / SearchQuery() objects.
    headline = SearchHeadline("dialogue", "killed", config="english")
    line = Line.objects.annotate(headline=headline).get(pk=self.verse0.pk)
    expected = (
        "Robin. He was not at all afraid to be <b>killed</b> in nasty "
        "ways. Brave, brave, brave, brave Sir Robin"
    )
    self.assertEqual(line.headline, expected)
def test_headline_untyped_args(self):
    # Expression and query are passed as plain strings rather than
    # F() / SearchQuery() objects.
    headline = SearchHeadline('dialogue', 'killed', config='english')
    line = Line.objects.annotate(headline=headline).get(pk=self.verse0.pk)
    expected = (
        'Robin. He was not at all afraid to be <b>killed</b> in nasty '
        'ways. Brave, brave, brave, brave Sir Robin!'
    )
    self.assertEqual(line.headline, expected)
def article_list(request):
    """Render the article list for the page that serves this app request.

    Honours two GET parameters: ``category`` (one or more category slugs)
    narrows the articles, and ``search`` ranks them with a weighted
    full-text search and annotates each hit with a ``headline`` snippet.
    """
    page = page_for_app_request(request)
    page.activate_language(request)

    articles = articles_for_page(page)
    # Built from the article set before any category/search narrowing.
    categories = Category.objects.filter(article__in=articles).distinct()

    if request.GET.get("category", None):
        selected_slugs = request.GET.getlist("category")
        articles = articles.filter(category__slug__in=selected_slugs)

    q = ""
    if request.GET.get("search", ""):
        # Weight "A": title, category name and rich-text bodies.
        # Weight "B": author names.
        vector = (
            SearchVector("title", weight="A")
            + SearchVector("category__name", weight="A")
            + SearchVector("blog_richtext_set__text", weight="A")
            + SearchVector("blog_glossaryrichtext_set__text", weight="A")
            + SearchVector("author__first_name", weight="B")
            + SearchVector("author__last_name", weight="B")
        )
        query = consume(request.GET["search"])
        q = request.GET["search"]

        relevance = SearchRank(vector, query, cover_density=True)
        snippet = SearchHeadline(
            'blog_richtext_set__text',
            query,
            max_words=25,
            min_words=20,
            max_fragments=2,
        )
        articles = articles.annotate(rank=relevance, headline=snippet)
        articles = articles.filter(rank__gt=0).order_by("-rank").distinct()

    ancestors = list(page.ancestors().reverse())

    template_context = {
        "page": page,
        "q": q,
        "header_image": page.get_header_image(),
        "vapid_public_key": settings.VAPID_PUBLIC_KEY,
        "category_list": categories,
        "meta_tags": meta_tags([page] + ancestors, request=request),
        "regions": Regions.from_item(
            page,
            renderer=pages.renderer.renderer,
            timeout=60,
        ),
    }
    return render_list(request, articles, template_context, paginate_by=12)
def test_headline_with_config_from_field(self):
    # Both the query and the headline read their search configuration
    # from the row's ``dialogue_config`` column.
    query = SearchQuery('cadeaux', config=F('dialogue_config'))
    headline = SearchHeadline(
        'dialogue',
        query,
        config=F('dialogue_config'),
    )
    line = Line.objects.annotate(headline=headline).get(pk=self.french.pk)
    expected = 'Oh. Un beau <b>cadeau</b>. Oui oui.'
    self.assertEqual(line.headline, expected)
def test_headline_short_word_option(self):
    # Only the tail of the headline is checked here.
    query = SearchQuery('brave sir robin', config='english')
    headline = SearchHeadline('dialogue', query, short_word=6)
    line = Line.objects.annotate(headline=headline).get(pk=self.verse0.pk)
    expected_tail = (
        '<b>Brave</b>, <b>brave</b>, <b>brave</b>, <b>brave</b> <b>Sir</b>'
    )
    self.assertIs(line.headline.endswith(expected_tail), True)
def test_headline_with_config_from_field(self):
    # Both the query and the headline read their search configuration
    # from the row's ``dialogue_config`` column.
    query = SearchQuery("cadeaux", config=F("dialogue_config"))
    headline = SearchHeadline(
        "dialogue",
        query,
        config=F("dialogue_config"),
    )
    line = Line.objects.annotate(headline=headline).get(pk=self.french.pk)
    expected = "Oh. Un beau <b>cadeau</b>. Oui oui."
    self.assertEqual(line.headline, expected)
def test_headline_short_word_option(self):
    # Exercises the ``short_word`` option together with ``min_words``.
    query = SearchQuery('Camelot', config='english')
    headline = SearchHeadline(
        'dialogue',
        query,
        short_word=5,
        min_words=8,
    )
    line = Line.objects.annotate(headline=headline).get(pk=self.verse0.pk)
    expected = (
        '<b>Camelot</b>. He was not afraid to die, o Brave Sir Robin. He '
        'was not at all afraid'
    )
    self.assertEqual(line.headline, expected)
def test_headline_highlight_all_option(self):
    # With highlight_all=True the assertion looks at the start of the
    # dialogue, checked as a substring of the headline.
    query = SearchQuery('brave sir robin', config='english')
    headline = SearchHeadline('dialogue', query, highlight_all=True)
    line = Line.objects.annotate(headline=headline).get(pk=self.verse0.pk)
    expected_fragment = (
        '<b>Bravely</b> bold <b>Sir</b> <b>Robin</b>, rode forth from '
        'Camelot. He was not afraid to die, o '
    )
    self.assertIn(expected_fragment, line.headline)
def test_headline_short_word_option(self):
    # Exercises the ``short_word`` option together with ``min_words``.
    query = SearchQuery("Camelot", config="english")
    headline = SearchHeadline(
        "dialogue",
        query,
        short_word=5,
        min_words=8,
    )
    line = Line.objects.annotate(headline=headline).get(pk=self.verse0.pk)
    expected = (
        "<b>Camelot</b>. He was not afraid to die, o Brave Sir Robin. He "
        "was not at all afraid"
    )
    self.assertEqual(line.headline, expected)
def test_headline(self):
    # Fully typed arguments: F() expression, SearchQuery and SearchConfig.
    headline = SearchHeadline(
        F("dialogue"),
        SearchQuery("brave sir robin"),
        config=SearchConfig("english"),
    )
    line = Line.objects.annotate(headline=headline).get(pk=self.verse0.pk)
    expected = (
        "<b>Robin</b>. He was not at all afraid to be killed in nasty "
        "ways. <b>Brave</b>, <b>brave</b>, <b>brave</b>, <b>brave</b> "
        "<b>Sir</b> <b>Robin</b>"
    )
    self.assertEqual(line.headline, expected)
def test_headline(self):
    # Fully typed arguments: F() expression, SearchQuery and SearchConfig.
    headline = SearchHeadline(
        F('dialogue'),
        SearchQuery('brave sir robin'),
        config=SearchConfig('english'),
    )
    line = Line.objects.annotate(headline=headline).get(pk=self.verse0.pk)
    expected = (
        '<b>Robin</b>. He was not at all afraid to be killed in nasty '
        'ways. <b>Brave</b>, <b>brave</b>, <b>brave</b>, <b>brave</b> '
        '<b>Sir</b> <b>Robin</b>'
    )
    self.assertEqual(line.headline, expected)
def test_headline_separator_options(self):
    # Custom start_sel/stop_sel replace the <b>…</b> markers seen in the
    # other headline tests.
    headline = SearchHeadline(
        'dialogue',
        'brave sir robin',
        start_sel='<span>',
        stop_sel='</span>',
    )
    line = Line.objects.annotate(headline=headline).get(pk=self.verse0.pk)
    expected = (
        '<span>Robin</span>. He was not at all afraid to be killed in '
        'nasty ways. <span>Brave</span>, <span>brave</span>, <span>brave'
        '</span>, <span>brave</span> <span>Sir</span> <span>Robin</span>'
    )
    self.assertEqual(line.headline, expected)
def search(
    self, search_text, rank=True, prefix=True, highlight=False, force_or=False
):
    """Filter this queryset by a German full-text search on ``search_vector``.

    Args:
        search_text: The user's search string. ``None`` disables searching
            and returns ``self`` unchanged.
        rank: When true, annotate ``rank`` (``SearchRank`` against the
            ``search_vector`` column) and order by it, best match first.
        prefix: When true, the text is split with ``split_proximity`` and
            the pieces are joined into a raw tsquery. When false, the text
            is handed over unchanged as a ``websearch`` query.
        highlight: When true, annotate ``description_highlighted`` and
            ``title_highlighted`` with ``SearchHeadline`` output
            (``highlight_all=True``).
        force_or: In prefix mode only, always join the pieces with ``|``.
            Otherwise ``|`` is used only if the text contains " or "
            (case-insensitive), and ``&`` in every other case.

    Returns:
        The filtered (and optionally annotated/ordered) queryset, or
        ``self`` when ``search_text`` is ``None``.
    """
    if search_text is None:
        return self

    if prefix:
        # Short-circuit keeps ``search_text.lower()`` from being evaluated
        # when ``force_or`` already decides the outcome.
        use_or = force_or or " or " in search_text.lower()
        conj = " | " if use_or else " & "
        search_text = conj.join(split_proximity(search_text))
        search_query = SearchQuery(search_text, config="german", search_type="raw")
    else:
        search_query = SearchQuery(
            search_text, config="german", search_type="websearch"
        )

    qs = self.filter(search_vector=search_query)

    if rank:
        qs = qs.annotate(
            rank=SearchRank(F("search_vector"), search_query)
        ).order_by("-rank")

    if highlight:
        qs = qs.annotate(
            description_highlighted=SearchHeadline(
                "description", search_query, config="german", highlight_all=True
            ),
            title_highlighted=SearchHeadline(
                "title", search_query, config="german", highlight_all=True
            ),
        )

    return qs
def test_headline_fragments_words_options(self):
    # Up to four short fragments, joined by a custom delimiter.
    query = SearchQuery('brave sir robin', config='english')
    headline = SearchHeadline(
        'dialogue',
        query,
        fragment_delimiter='...<br>',
        max_fragments=4,
        max_words=3,
        min_words=1,
    )
    line = Line.objects.annotate(headline=headline).get(pk=self.verse0.pk)
    expected = (
        '<b>Sir</b> <b>Robin</b>, rode...<br>'
        '<b>Brave</b> <b>Sir</b> <b>Robin</b>...<br>'
        '<b>Brave</b>, <b>brave</b>, <b>brave</b>...<br>'
        '<b>brave</b> <b>Sir</b> <b>Robin</b>'
    )
    self.assertEqual(line.headline, expected)
def test_headline_fragments_words_options(self):
    # Up to four short fragments, joined by a custom delimiter.
    query = SearchQuery("brave sir robin", config="english")
    headline = SearchHeadline(
        "dialogue",
        query,
        fragment_delimiter="...<br>",
        max_fragments=4,
        max_words=3,
        min_words=1,
    )
    line = Line.objects.annotate(headline=headline).get(pk=self.verse0.pk)
    expected = (
        "<b>Sir</b> <b>Robin</b>, rode...<br>"
        "<b>Brave</b> <b>Sir</b> <b>Robin</b>...<br>"
        "<b>Brave</b>, <b>brave</b>, <b>brave</b>...<br>"
        "<b>brave</b> <b>Sir</b> <b>Robin</b>"
    )
    self.assertEqual(line.headline, expected)
def get_queryset(self):
    """Return ranked, highlighted ``SearchIndex`` rows for the ``q`` parameter.

    Rows are restricted to parts returned by ``Part.objects.effective`` for
    today's date, matched against ``search_vector``, ordered by descending
    ``rank`` and returned as dictionaries (``.values(...)``).
    """
    q = self.request.query_params.get("q")
    ts_query = SearchQuery(q)

    effective_part_ids = models.Subquery(
        Part.objects.effective(date.today()).values("id")
    )
    highlighted = SearchHeadline(
        "content",
        ts_query,
        start_sel='<span class="search-highlight">',
        stop_sel='</span>',
    )

    # The two filters stay as separate .filter() calls, as in the original.
    matches = SearchIndex.objects.filter(part__in=effective_part_ids)
    matches = matches.filter(search_vector=ts_query)
    matches = matches.annotate(rank=SearchRank("search_vector", ts_query))
    matches = matches.annotate(headline=highlighted)
    matches = matches.order_by('-rank')
    return matches.values(
        "type",
        "content",
        "headline",
        "label",
        "parent",
        "part__document__title",
        "part__title",
        "part__date",
    )
def get(self, request, *args, **kwargs):
    """Return serialized posts matching the ``s`` search parameter.

    Responds with an empty list when the query parameters do not validate
    or when no post matches.
    """
    serialized = SearchSerializer(data=request.GET)
    if not serialized.is_valid():
        return Response([])

    needle = serialized.validated_data['s']
    snippet = SearchHeadline('text', SearchQuery(needle))
    search_posts = (
        Post.objects.filter(search_vector=needle)
        .annotate(headline=snippet)
        .all()
    )
    if not search_posts:
        return Response([])

    serialized_post = PostSerializer(search_posts, many=True)
    return Response(serialized_post.data)
def search_postgres(term, search_document=True, search_comments=True, content_types=None):
    """Run a ranked, highlighted full-text search over ``SearchIndex``.

    Args:
        term: The user's search string. It is passed through ``unaccent``
            and parsed as a ``websearch`` query using
            ``settings.LANGUAGE_CODE`` as the configuration.
        search_document: Whether document text takes part in the search.
        search_comments: Whether comment text takes part in the search.
        content_types: Optional collection used for a
            ``content_type__in`` filter; ``None`` means no restriction.

    Returns:
        A ``SearchIndex`` queryset ordered by descending ``rank`` and
        annotated with ``highlighted_title`` plus ``highlighted_document``
        / ``highlighted_comments`` for whichever sources are enabled. An
        empty queryset is returned when both sources are disabled.
    """
    try:
        from django.contrib.postgres.search import SearchHeadline
    except ImportError:
        # Fallback shim when the installed Django does not provide it.
        from web.compat import SearchHeadline

    # Nothing to search in. The original guard only caught ``None``, so
    # passing ``False`` for both flags fell through and searched the
    # document vector although documents were explicitly disabled.
    if not search_document and not search_comments:
        return SearchIndex.objects.none()

    term = unaccent(term)
    query = SearchQuery(term, config=settings.LANGUAGE_CODE, search_type='websearch')

    # At this point at least one source is enabled.
    if not search_document:
        field = 'comments_search_vector'
    elif not search_comments:
        field = 'document_search_vector'
    else:
        field = 'combined_search_vector'
    rank = SearchRank(F(field), query)

    # The title is always highlighted; the other columns only when searched.
    highlight_columns = ['title']
    if search_document:
        highlight_columns.append('document')
    if search_comments:
        highlight_columns.append('comments')
    highlights = {
        'highlighted_' + column: SearchHeadline(
            column,
            query,
            config=F('language_code'),
            start_sel=SearchIndex.HIGHLIGHT_START,
            stop_sel=SearchIndex.HIGHLIGHT_STOP,
            highlight_all=True,
        )
        for column in highlight_columns
    }

    fields = {field: query}
    if content_types is not None:
        fields['content_type__in'] = content_types

    return (
        SearchIndex.objects.filter(**fields)
        .annotate(rank=rank, **highlights)
        .only('content_type', 'object_id', 'language_code', 'created',
              'updated', 'author', 'authors_name', 'title')
        .order_by('-rank')
    )
def index(request):
    """Render the index page, with ranked video matches for ``?q=`` if given.

    ``videos`` in the template context is ``None`` when no query was
    submitted; otherwise it is a ``Video`` queryset annotated with ``rank``
    and ``headline``, limited to ``rank >= 0.001`` and ordered best first.
    """
    q = request.GET.get('q')

    videos = None
    if q:
        query = SearchQuery(q)
        vector = SearchVector('title', 'description')
        videos = (
            Video.objects
            .annotate(rank=SearchRank(vector, query))
            .annotate(headline=SearchHeadline('description', query))
            .filter(rank__gte=0.001)
            .order_by('-rank')
        )

    return render(request, 'example/index.html', {'videos': videos})
def search(self, query):
    """Rank this queryset against ``query`` and attach a highlighted snippet.

    Rows scoring below 0.2 are dropped; the rest are annotated with
    ``rank`` and ``headline``, ordered by descending rank, with ``part``
    prefetched.
    """
    ts_query = SearchQuery(query)

    # The first two label elements joined with a dot, searched alongside
    # the raw label.
    dotted_label = models.functions.Concat('label__0', models.Value('.'), 'label__1')
    weighted_vector = (
        SearchVector('label', weight='A')
        + SearchVector(dotted_label, weight='A')
        + SearchVector('parent__title', weight='A')
        + SearchVector('part__document__title', weight='B')
        + SearchVector('content', weight='B')
    )
    snippet = SearchHeadline(
        "content",
        ts_query,
        start_sel='<span class="search-highlight">',
        stop_sel='</span>',
    )

    ranked = self.annotate(rank=SearchRank(weighted_vector, ts_query))
    ranked = ranked.filter(rank__gte=0.2)
    ranked = ranked.annotate(headline=snippet)
    return ranked.order_by('-rank').prefetch_related('part')
def search(request):
    """Handle the search page: quick search, then three paginated result lists.

    The GET parameters are validated through ``SearchForm``; an invalid form
    re-renders ``search.html`` with only the form. Otherwise the view builds:

    * ``man_results`` -- man pages whose name is trigram-similar to the term,
      unioned with symbolic links matched the same way on ``from_name``.
    * ``apropos_results`` -- a raw SQL full-text search over the converted
      content's ``description`` column, with a highlighted snippet and rank.
    * ``pkg_results`` -- packages matched by trigram similarity on the name
      or by full-text search on the description.

    Each list is passed through ``paginate`` and rendered by ``search.html``.
    """
    search_form = SearchForm(request.GET)
    if not search_form.is_valid():
        return render(request, "search.html", {"search_form": search_form})

    term = search_form.cleaned_data["q"]
    filter_section = search_form.cleaned_data["section"]
    filter_lang = search_form.cleaned_data["lang"]
    filter_repo = search_form.cleaned_data["repo"]
    filter_pkgname = search_form.cleaned_data["pkgname"]

    # handle quick search
    go = search_form.cleaned_data["go"]
    if term and go == "Go" and len(filter_repo) <= 1:
        # NOTE(review): filter_section and filter_lang are asserted to be
        # lists further down; if they are lists here too, ``str += list``
        # raises TypeError -- confirm against SearchForm's field types.
        name_section_lang = term
        if filter_section:
            name_section_lang += filter_section
        if filter_lang:
            name_section_lang += filter_lang
        response = quick_search(repo=filter_repo[0] if len(filter_repo) == 1 else None,
                                pkgname=filter_pkgname or None,
                                name_section_lang=name_section_lang)
        # A falsy response means the quick search did not produce a result
        # to return, so fall through to the full search below.
        if response:
            return response

    # man_filter applies to man pages (and, after renaming, symlinks);
    # pkg_filter applies to the package search.
    man_filter = Q()
    pkg_filter = Q()

    if filter_section:
        assert isinstance(filter_section, list)
        section_parts = []
        for q in filter_section:
            # do prefix search only when given a single letter (e.g. "3p" should not match "3perl", "3python" etc.)
            if len(q) == 1:
                section_parts.append(Q(section__startswith=q))
            else:
                section_parts.append(Q(section__iexact=q))
        # Any of the requested sections may match.
        man_filter &= reduce(operator.__or__, section_parts)
    if filter_lang:
        assert isinstance(filter_lang, list)
        man_filter &= reduce(operator.__or__, (Q(lang__startswith=q) for q in filter_lang))
    if filter_repo:
        assert isinstance(filter_repo, list)
        man_filter &= Q(package__repo__in=filter_repo)
        pkg_filter &= Q(repo__in=filter_repo)
    if filter_pkgname:
        man_filter &= Q(package__name__iexact=filter_pkgname)
        pkg_filter &= Q(name__iexact=filter_pkgname)

    # this is only because we cannot use .annotate() inside the union (Django would add another column)
    symlink_filter = copy.deepcopy(man_filter)
    def build_symlink_filter(q):
        """Recursively rename ``section__*`` lookups to ``from_section__*`` in place."""
        for i in range(len(q.children)):
            if isinstance(q.children[i], Q):
                build_symlink_filter(q.children[i])
                continue
            key, value = q.children[i]
            if key.startswith("section__"):
                key = "from_" + key
            q.children[i] = (key, value)
    build_symlink_filter(symlink_filter)

    # Both halves of the union select the same number of columns in the
    # same order, so the symlink rows line up with the man page rows.
    man_results = ManPage.objects.values("name", "section", "lang", "package__repo", "package__name") \
                    .filter(name__trigram_similar=term).filter(man_filter) \
                    .annotate(similarity=TrigramSimilarity("name", term)) \
                    .union(SymbolicLink.objects.values("from_name", "from_section", "lang", "package__repo", "package__name")
                                .filter(from_name__trigram_similar=term).filter(symlink_filter)
                                .annotate(similarity=TrigramSimilarity("from_name", term)),
                           all=True) \
                    .order_by("-similarity", "name", "section", "lang", "package__name", "package__repo")

    # full-text search objects: https://docs.djangoproject.com/en/3.1/ref/contrib/postgres/search/
    ts_query = SearchQuery(term)
    ts_vector = SearchVector("description", config="english")
    ts_headline = SearchHeadline("description", ts_query, start_sel="<b>", stop_sel="</b>")
    #ts_rank = SearchRank(ts_vector, ts_query, normalization=32)
    # Package ranking: name similarity plus twice the full-text rank.
    ts_sim_rank = TrigramSimilarity("name", term) + 2 * SearchRank(ts_vector, ts_query, normalization=32)

    # get table names for the models (needed for raw SQL)
    package_table = Package.objects.model._meta.db_table
    content_table = Content.objects.model._meta.db_table
    manpage_table = ManPage.objects.model._meta.db_table

    # build the WHERE clause (ugh)
    apropos_filter_conditions, apropos_filter_values = build_apropos_filter(man_filter)
    if apropos_filter_conditions:
        apropos_filter = f"WHERE {apropos_filter_conditions}"
    else:
        apropos_filter = ""

    # For the search in man page descriptions ("apropos") we need to perform a raw SQL query,
    # because it is not possible to express the same query with Django ORM.
    # Notes:
    # - the subquery (i.e. INNER JOIN (...) AS subquery) is necessary for good performance
    # - INNER JOIN instead of LEFT OUTER JOIN is needed on the subquery, otherwise PostgreSQL
    #   will not use the GIN index
    # - WITH is used for convenience to avoid repeating the ts_rank expression in the WHERE clause
    #   https://www.postgresql.org/docs/current/queries-with.html
    # Only table names are interpolated with f-strings; the search term is
    # passed through the three %s placeholders.
    content_results = f"""
        WITH content_search AS (
            SELECT
                "{content_table}"."id",
                ts_headline("{content_table}"."description", plainto_tsquery(%s), 'StartSel=''<b>'', StopSel=''</b>''') AS "desc_snippet",
                ts_rank(to_tsvector('english'::regconfig, COALESCE("{content_table}"."description", '')), plainto_tsquery(%s), 32) AS "rank",
                to_tsvector('english'::regconfig, COALESCE("{content_table}"."description", '')) AS "search"
            FROM "{content_table}"
        )
        SELECT * FROM "content_search" WHERE "search" @@ plainto_tsquery(%s) AND "rank" > 0.001"""
    # Parameter order: the three ``term`` values fill the placeholders of
    # the embedded subquery, followed by the values for the WHERE clause.
    apropos_results = ManPage.objects.raw(f"""
        SELECT
            "{manpage_table}"."id",
            "{manpage_table}"."name",
            "{manpage_table}"."section",
            "{manpage_table}"."lang",
            "{package_table}"."repo" AS "package__repo",
            "{package_table}"."name" AS "package__name",
            "desc_snippet",
            "rank"
        FROM "{manpage_table}"
        INNER JOIN "{package_table}" ON ("{manpage_table}"."package_id" = "{package_table}"."id")
        INNER JOIN ({content_results}) AS subquery ON ("{manpage_table}"."converted_content_id" = "subquery"."id")
        {apropos_filter}
        ORDER BY "rank" DESC, "{manpage_table}"."name" ASC, "{manpage_table}"."section" ASC, "{manpage_table}"."lang" ASC, "package__name" ASC, "package__repo" ASC""",
        [term, term, term] + apropos_filter_values)

    # NOTE: Some other things that were tried with Django ORM (as of Django 3.1):
    # 1. We could do this if we did not need a subquery (this works, but is slow):
    #       apropos_results = ManPage.objects.values("name", "section", "lang", "package__repo", "package__name", "converted_content__description").extra(
    #           select={
    #               "desc_snippet": f"ts_headline('english', COALESCE({content_table}.description, ''), plainto_tsquery(%s))",
    #               "rank": f"ts_rank(to_tsvector('english', COALESCE({content_table}.description, '')), plainto_tsquery(%s), 32)",
    #           },
    #           where=[f"to_tsvector('english', COALESCE({content_table}.description, '')) @@ plainto_tsquery(%s)"],
    #           params=[term],
    #           select_params=[term, term],
    #           order_by=("-rank", "name", "section", "lang", "package__name", "package__repo"),
    #       )
    #
    # 2. A mostly equivalent query in pure Django ORM syntax (better parametrization, still no subquery, same performance):
    #       from django.db.models import F
    #       apropos_results = ManPage.objects.values("name", "section", "lang", "package__repo", "package__name", "converted_content__description") \
    #                           .annotate(description=F("converted_content__description")) \
    #                           .annotate(desc_snippet=ts_headline) \
    #                           .annotate(rank=ts_rank) \
    #                           .annotate(search=ts_vector) \
    #                           .filter(search=ts_query) \
    #                           .order_by("-rank", "name", "section", "lang", "package__name", "package__repo")
    # 3. We can define the subquery like this, but the real question is how to use it:
    #       content_results = Content.objects.only("id") \
    #                           .annotate(desc_snippet=ts_headline) \
    #                           .annotate(rank=ts_rank) \
    #                           .annotate(search=ts_vector) \
    #                           .filter(search=ts_query)
    #    Also note that we can't use the subquery even in the plain-text for a raw SQL query,
    #    because the ".query" attribute strips '' from the COALESCE function.
    #    3a) Django supports subqueries like this: https://docs.djangoproject.com/en/3.1/ref/models/expressions/#subquery-expressions
    #           SELECT "post"."id", (
    #               SELECT U0."email"
    #               FROM "comment" U0
    #               WHERE U0."post_id" = ("post"."id")
    #               ORDER BY U0."created_at" DESC LIMIT 1
    #           ) AS "newest_commenter_email" FROM "post"
    #        But this is not applicable here, because the subquery *must* return exactly one column
    #        (otherwise it is an SQL syntax error). Anyway, the code (which does not work) would
    #        be more or less like this:
    #           from django.db.models import OuterRef, Subquery
    #           content_results = Content.objects.only("id") \
    #                               .annotate(desc_snippet=ts_headline) \
    #                               .annotate(rank=ts_rank) \
    #                               .annotate(search=ts_vector) \
    #                               .filter(Q(search=ts_query) & Q(id=OuterRef("converted_content_id")))  # this is basically the join condition
    #           apropos_results = ManPage.objects.values("name", "section", "lang", "package__repo", "package__name") \
    #                               .annotate(content_subquery=Subquery(content_results)) \
    #                               .order_by("-rank", "name", "section", "lang", "package__name", "package__repo")
    #    3b) Django supports joins with simple subqueries via FilteredRelation objects, but it
    #        does not work with arbitrary subqueries, especially subqueries which add additional
    #        columns (like our "desc_snippet" and "rank").
    #        https://docs.djangoproject.com/en/3.1/ref/models/querysets/#filteredrelation-objects

    # Note: the "Q" objects allow more complicated expressions in the filter:
    # https://docs.djangoproject.com/en/3.1/topics/db/queries/#complex-lookups-with-q
    pkg_results = Package.objects.only("repo", "name") \
                    .annotate(desc_snippet=ts_headline) \
                    .annotate(rank=ts_sim_rank) \
                    .annotate(search=ts_vector) \
                    .filter(pkg_filter) \
                    .filter(Q(name__trigram_similar=term) | Q(search=ts_query)) \
                    .order_by("-rank", "name", "repo")

    # Each list gets its own page parameter name; the trailing 20 is
    # presumably the page size -- confirm against paginate().
    man_results = paginate(request, "page_man", man_results, 20)
    apropos_results = paginate(request, "page_apropos", apropos_results, 20)
    pkg_results = paginate(request, "page_pkg", pkg_results, 20)

    context = {
        "search_form": search_form,
        "man_results": man_results,
        "apropos_results": apropos_results,
        "pkg_results": pkg_results,
    }
    return render(request, "search.html", context)