def handle(self, *args, **options):
    now = utc_now()
    verbose = int(options['verbosity']) > 1
    qs = BlogItemHits.objects.filter(hits__gt=0)
    for hit in qs.values('oid', 'hits'):
        # This is totally arbitrary!
        # I'm using hits and number of comments as a measure of
        # how it should be ranked.
        # The thinking is that posts that are found and read are
        # likely to be more popular and should thus be ranked
        # higher.
        plogrank = hit['hits']
        comments = (
            BlogComment.objects
            .filter(blogitem__oid=hit['oid']).count()
        )
        # multiply by a factor to make this slightly more significant
        plogrank += comments * 10
        (
            BlogItem.objects
            .filter(oid=hit['oid'])
            .update(plogrank=plogrank)
        )
        if verbose:
            print str(plogrank).rjust(7), '\t', hit['oid']
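# A minimal sketch (not part of the original command) of the ranking
# arithmetic handle() above applies: one point per recorded hit, plus
# ten points per comment. The helper name and the example numbers are
# hypothetical, purely for illustration.
def compute_plogrank(hits, comment_count):
    # comments are weighted 10x: a comment is a stronger signal of
    # engagement than a page view
    return hits + comment_count * 10

assert compute_plogrank(hits=500, comment_count=3) == 530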
def test_blog_post_caching(self):
    blog = BlogItem.objects.create(
        oid="some-longish-test-post",
        title="TITLEX",
        text="BLABLABLA",
        display_format="structuredtext",
        pub_date=utc_now() - datetime.timedelta(seconds=10),
    )
    url = reverse("blog_post", args=[blog.oid])
    import peterbecom.plog.views
    old_render = peterbecom.plog.views.render
    from django.shortcuts import render as django_render

    render_counts = []

    def mocked_render(*a, **k):
        render_counts.append(1)
        return django_render(*a, **k)

    peterbecom.plog.views.render = mocked_render
    try:
        response = self.client.get(url)
        content = response.content.decode("utf-8")
        assert blog.title in content
        assert "0 comments" in content
        response = self.client.get(url)
        content = response.content.decode("utf-8")
        assert "0 comments" in content
        BlogComment.objects.create(
            comment="textext",
            blogitem=blog,
            approved=True,
            add_date=utc_now() + datetime.timedelta(seconds=1),
        )
        response = self.client.get(url)
        content = response.content.decode("utf-8")
        assert "1 comment" in content
    finally:
        peterbecom.plog.views.render = old_render
def test_blog_post_caching(self):
    blog = BlogItem.objects.create(
        oid='some-longish-test-post',
        title='TITLEX',
        text='BLABLABLA',
        display_format='structuredtext',
        pub_date=utc_now() - datetime.timedelta(seconds=10),
    )
    url = reverse('blog_post', args=[blog.oid])
    import peterbecom.plog.views
    old_render = peterbecom.plog.views.render
    from django.shortcuts import render as django_render

    render_counts = []

    def mocked_render(*a, **k):
        render_counts.append(1)
        return django_render(*a, **k)

    peterbecom.plog.views.render = mocked_render
    try:
        response = self.client.get(url)
        self.assertTrue(blog.title in response.content)
        assert '0 comments' in response.content
        response = self.client.get(url)
        assert '0 comments' in response.content
        BlogComment.objects.create(
            comment="textext",
            blogitem=blog,
            approved=True,
            add_date=utc_now() + datetime.timedelta(seconds=1),
        )
        response = self.client.get(url)
        assert '1 comment' in response.content
    finally:
        peterbecom.plog.views.render = old_render
    assert len(render_counts) == 2, render_counts
def test_old_redirects(self):
    blog = BlogItem.objects.create(
        oid='myoid',
        title='TITLEX',
        text="""
        ttest test
        """,
        display_format='structuredtext',
        pub_date=utc_now() - datetime.timedelta(seconds=10),
    )
    url = reverse('blog_post', args=[blog.oid])
    response = self.client.get(url)
    assert response.status_code == 200

    response = self.client.get(url, {'replypath': 'foo'})
    self.assertEqual(response.status_code, 301)
    self.assertEqual(urlparse(response['location']).path, url)
    self.assertTrue(not urlparse(response['location']).query)
def get_data(max_length=1000, pub_date_format=None, offset=0):
    items = []
    category_names = dict((x.id, x.name) for x in Category.objects.all())
    categories = defaultdict(list)
    for e in BlogItem.categories.through.objects.all():
        categories[e.blogitem_id].append(category_names[e.category_id])
    qs = BlogItem.objects.filter(pub_date__lt=utc_now()).order_by("-pub_date")
    for item in qs[offset:max_length]:
        pub_date = item.pub_date
        if pub_date_format:
            pub_date = pub_date_format(pub_date)
        items.append({
            "title": item.title,
            "slug": item.oid,
            "pub_date": pub_date,
            "keywords": [x for x in item.proper_keywords if x][:3],
            "categories": categories[item.id][:3],
        })
    return items
def test_old_redirects(self):
    blog = BlogItem.objects.create(
        oid="myoid",
        title="TITLEX",
        text="""
        ttest test
        """,
        display_format="structuredtext",
        pub_date=utc_now() - datetime.timedelta(seconds=10),
    )
    url = reverse("blog_post", args=[blog.oid])
    response = self.client.get(url)
    assert response.status_code == 200

    response = self.client.get(url, {"replypath": "foo"})
    assert response.status_code == 301
    assert urlparse(response["location"]).path == url
    assert not urlparse(response["location"]).query
def test_blog_post_ping(self):
    blog = BlogItem.objects.create(
        oid="myoid",
        title="TITLEX",
        text="""
        ttest test
        """,
        display_format="structuredtext",
        pub_date=utc_now() - datetime.timedelta(seconds=10),
    )
    url = reverse("blog_post_ping", args=[blog.oid])
    response = self.client.get(url)
    assert response.status_code == 405
    response = self.client.put(url)
    assert response.status_code == 200
    assert response.json()["ok"]
    hit, = BlogItemHit.objects.all()
    assert hit.blogitem == blog
def get_data(max_length=1000, pub_date_format=None, offset=0):
    items = []
    category_names = dict((x.id, x.name) for x in Category.objects.all())
    categories = defaultdict(list)
    for e in BlogItem.categories.through.objects.all():
        categories[e.blogitem_id].append(
            category_names[e.category_id]
        )
    qs = BlogItem.objects.filter(pub_date__lt=utc_now()).order_by('-pub_date')
    for item in qs[offset:max_length]:
        pub_date = item.pub_date
        if pub_date_format:
            pub_date = pub_date_format(pub_date)
        items.append({
            'title': item.title,
            'slug': item.oid,
            'pub_date': pub_date,
            'keywords': [x for x in item.proper_keywords if x][:3],
            'categories': categories[item.id][:3],
        })
    return items
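# Both get_data() versions above avoid an N+1 query by loading the
# BlogItem/Category through table once and grouping category names per
# blog item in a defaultdict. A self-contained sketch of that grouping
# pattern, using made-up rows instead of the ORM:
from collections import defaultdict

category_names = {1: 'Django', 2: 'Python'}
through_rows = [(10, 1), (10, 2), (11, 2)]  # (blogitem_id, category_id)

categories = defaultdict(list)
for blogitem_id, category_id in through_rows:
    categories[blogitem_id].append(category_names[category_id])

assert categories[10] == ['Django', 'Python']
assert categories[11] == ['Python']
assert categories[99] == []  # unknown ids safely yield an empty list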
def home(request, oc=None):
    context = {}
    qs = BlogItem.objects.filter(pub_date__lt=utc_now())
    if oc is not None:
        if not oc:  # empty string
            return redirect('/', permanent=True)
        categories = parse_ocs_to_categories(oc)
        cat_q = make_categories_q(categories)
        qs = qs.filter(cat_q)
        context['categories'] = categories

    # Reasons for not being here
    if request.method == 'HEAD':
        return http.HttpResponse('')

    BATCH_SIZE = 10
    try:
        page = max(1, int(request.GET.get('page', 1))) - 1
    except ValueError:
        raise http.Http404('invalid page value')
    n, m = page * BATCH_SIZE, (page + 1) * BATCH_SIZE
    max_count = qs.count()
    first_post, = qs.order_by('-pub_date')[:1]
    context['first_post_url'] = request.build_absolute_uri(
        reverse('blog_post', args=[first_post.oid])
    )
    if (page + 1) * BATCH_SIZE < max_count:
        context['next_page'] = page + 2
    context['previous_page'] = page

    context['blogitems'] = (
        qs
        .prefetch_related('categories')
        .order_by('-pub_date')
    )[n:m]

    if page > 0:  # page starts on 0
        context['page_title'] = 'Page {}'.format(page + 1)

    return render(request, 'homepage/home.html', context)
def test_blog_post_with_comment_approval(self):
    blog = BlogItem.objects.create(
        oid='some-longish-test-post',
        title='TITLEX',
        text='BLABLABLA',
        display_format='structuredtext',
        pub_date=utc_now() - datetime.timedelta(seconds=10),
    )
    url = reverse('blog_post', args=[blog.oid])

    self._login()
    loggedin = self.client
    anonymous = Client()
    assert len(loggedin.cookies)
    assert not len(anonymous.cookies)

    comment = BlogComment.objects.create(
        oid='a1000',
        blogitem=blog,
        comment='COMMENTX',
        name='Mr Anonymous',
    )
    # but it hasn't been approved yet
    response = anonymous.get(url)
    self.assertEqual(response.status_code, 200)
    self.assertTrue('COMMENTX' not in response.content)

    # let's approve it!
    approve_url = reverse('approve_comment', args=[blog.oid, comment.oid])
    response = loggedin.post(
        approve_url,
        HTTP_X_REQUESTED_WITH='XMLHttpRequest'
    )
    self.assertEqual(response.status_code, 200)
    self.assertEqual(response.content, 'OK')

    response = anonymous.get(url)
    self.assertEqual(response.status_code, 200)
    self.assertTrue('COMMENTX' in response.content)
def handle(self, *args, **options):
    if cache.get('nodomains-queued'):
        return
    queued_items = models.Queued.objects.filter(failed_attempts__lt=5)
    for queued in queued_items.order_by('add_date'):
        cache.set('nodomains-queued', True, 100)
        try:
            then = utc_now() - datetime.timedelta(days=1)
            models.Result.objects.get(
                url=queued.url,
                add_date__gt=then
            )
            print "Skipping", queued.url
        except models.Result.DoesNotExist:
            print queued.url
            try:
                run_url(queued.url)
            except Exception:
                queued.failed_attempts += 1
                queued.save()
                continue
        queued.delete()
    cache.delete('nodomains-queued')
def test_text_rendering_with_images(self):
    blog = BlogItem.objects.create(
        oid='myoid',
        title='TITLEX',
        text="""
        "image.png":/plog/myoid/image.png
        and *this*
        """,
        display_format='structuredtext',
        pub_date=utc_now() - datetime.timedelta(seconds=10),
    )
    url = reverse('blog_post', args=[blog.oid])
    response = self.client.get(url)
    content = response.content
    self.assertTrue('<em>this</em>' in content)
    regex_str = (
        r'/CONTENTCACHE-\d+%s' % (re.escape('/plog/myoid/image.png'),)
    )
    self.assertTrue(re.findall(regex_str, content))

    old = settings.STATIC_URL
    settings.STATIC_URL = '//some.cdn.com/'
    try:
        blog.text_rendered = ''
        blog.save()
        response = self.client.get(url)
        content = response.content
        regex_str = (
            r'%sCONTENTCACHE-\d+%s' % (
                settings.STATIC_URL,
                re.escape('/plog/myoid/image.png')
            )
        )
        self.assertTrue(re.findall(regex_str, content))
    finally:
        settings.STATIC_URL = old
def timesince(date):
    if date.tzinfo:
        return smartertimesince(date, utc_now())
    else:
        return smartertimesince(date)
def items(self, categories):
    qs = (
        BlogItem.objects
        .filter(pub_date__lt=utc_now())
    )
    if categories:
        qs = qs.filter(make_categories_q(categories))
    return qs.order_by('-pub_date')[:10]
def sitemap(request): base_url = "https://%s" % RequestSite(request).domain urls = [] urls.append('<?xml version="1.0" encoding="iso-8859-1"?>') urls.append('<urlset xmlns="http://www.google.com/schemas/sitemap/0.84">') def add(loc, lastmod=None, changefreq="monthly", priority=None): url = "<url><loc>%s%s</loc>" % (base_url, loc) if lastmod: url += "<lastmod>%s</lastmod>" % lastmod.strftime("%Y-%m-%d") if priority: url += "<priority>%s</priority>" % priority if changefreq: url += "<changefreq>%s</changefreq>" % changefreq url += "</url>" urls.append(url) now = utc_now() latest_blogitem, = BlogItem.objects.filter( pub_date__lt=now).order_by("-pub_date")[:1] add("/", priority=1.0, changefreq="daily", lastmod=latest_blogitem.pub_date) add(reverse("about"), changefreq="weekly", priority=0.5) add(reverse("contact"), changefreq="weekly", priority=0.5) # TODO: Instead of looping over BlogItem, loop over # BlogItemTotalHits and use the join to build this list. # Then we can sort by a scoring function. # This will only work once ALL blogitems have at least 1 hit. blogitems = BlogItem.objects.filter(pub_date__lt=now) for blogitem in blogitems.order_by("-pub_date"): if not blogitem.modify_date: # legacy! try: latest_comment, = BlogComment.objects.filter( approved=True, blogitem=blogitem).order_by("-add_date")[:1] blogitem.modify_date = latest_comment.add_date except ValueError: blogitem.modify_date = blogitem.pub_date blogitem._modify_date_set = True blogitem.save() age = (now - blogitem.modify_date).days if age < 14: changefreq = "daily" elif age < 60: changefreq = "weekly" elif age < 100: changefreq = "monthly" else: changefreq = None add( reverse("blog_post", args=[blogitem.oid]), lastmod=blogitem.modify_date, changefreq=changefreq, ) urls.append("</urlset>") return http.HttpResponse("\n".join(urls), content_type="text/xml")
def home(request, oc=None, page=1):
    context = {}
    qs = BlogItem.objects.filter(pub_date__lt=utc_now())
    if oc is not None:
        if not oc:  # empty string
            return redirect("/", permanent=True)
        categories = parse_ocs_to_categories(oc, strict_matching=True)
        cat_q = make_categories_q(categories)
        qs = qs.filter(cat_q)
        context["categories"] = categories

    # Reasons for not being here
    if request.method == "HEAD":
        return http.HttpResponse("")

    batch_size = settings.HOMEPAGE_BATCH_SIZE
    try:
        page = max(1, int(page)) - 1
    except ValueError:
        raise http.Http404("invalid page value")
    n, m = page * batch_size, (page + 1) * batch_size
    max_count = qs.count()
    if page * batch_size > max_count:
        return http.HttpResponse("Too far back in time\n", status=404)
    if (page + 1) * batch_size < max_count:
        context["next_page"] = page + 2
    context["previous_page"] = page

    # If you're going deep into the pagination with some really old
    # pages, it's not worth using the fs cache because if you have to
    # store a fs cache version for every single page from p5 to p55
    # it's too likely to get stale and old and it's too much work
    # on the mincss postprocess.
    if page > 6 or (context.get("categories") and page > 2):
        request._fscache_disable = True

    if context.get("categories"):
        oc_path = "/".join(
            ["oc-{}".format(c.name) for c in context["categories"]])
        oc_path = oc_path[3:]

    if context.get("next_page"):
        if context.get("categories"):
            next_page_url = reverse("only_category_paged",
                                    args=(oc_path, context["next_page"]))
        else:
            next_page_url = reverse("home_paged",
                                    args=(context["next_page"],))
        context["next_page_url"] = next_page_url

    if context["previous_page"] > 1:
        if context.get("categories"):
            previous_page_url = reverse(
                "only_category_paged",
                args=(oc_path, context["previous_page"]))
        else:
            previous_page_url = reverse(
                "home_paged", args=(context["previous_page"],))
        context["previous_page_url"] = previous_page_url
    elif context["previous_page"]:  # i.e. == 1
        if context.get("categories"):
            previous_page_url = reverse("only_category", args=(oc_path,))
        else:
            previous_page_url = "/"
        context["previous_page_url"] = previous_page_url

    context["blogitems"] = (
        qs.prefetch_related("categories").order_by("-pub_date"))[n:m]

    if page > 0:  # page starts on 0
        context["page_title"] = "Page {}".format(page + 1)

    approved_comments_count = {}
    blog_comments_count_qs = (BlogComment.objects.filter(
        blogitem__in=context["blogitems"], approved=True).values(
            "blogitem_id").annotate(count=Count("blogitem_id")))
    for count in blog_comments_count_qs:
        approved_comments_count[count["blogitem_id"]] = count["count"]
    context["approved_comments_count"] = approved_comments_count

    return render(request, "homepage/home.html", context)
def test_homepage_cache_rendering(self): url = reverse("home") blog1 = BlogItem.objects.create( title="TITLE1", text="BLABLABLA", display_format="structuredtext", pub_date=utc_now() - datetime.timedelta(seconds=10), ) BlogComment.objects.create(oid="c1", comment="textext", blogitem=blog1, approved=True) BlogComment.objects.create(oid="c2", comment="tuxtuxt", blogitem=blog1, approved=True) response = self.client.get(url) content = response.content.decode("utf-8") self.assertTrue("TITLE1" in content) self.assertTrue("2 comments" in content) blog1.title = "TUTLE1" blog1.save() response = self.client.get(url) content = response.content.decode("utf-8") self.assertTrue("TUTLE1" in content) blog2 = BlogItem.objects.create( oid="t2", title="TATLE2", text="BLEBLE", display_format="structuredtext", pub_date=utc_now() - datetime.timedelta(seconds=1), ) response = self.client.get(url) content = response.content.decode("utf-8") self.assertTrue("TATLE2" in content) self.assertTrue("0 comments" in content) self.assertTrue("TUTLE1" in content) self.assertTrue("2 comments" in content) # by categories only cat1 = Category.objects.create(name="CATEGORY1") cat2 = Category.objects.create(name="CATEGORY2") blog1.categories.add(cat1) blog1.save() blog2.categories.add(cat2) blog2.save() response = self.client.get(url) content = response.content.decode("utf-8") self.assertTrue("CATEGORY1" in content) self.assertTrue("CATEGORY2" in content) url = reverse("only_category", args=["CATEGORY2"]) response = self.client.get(url) content = response.content.decode("utf-8") self.assertTrue("CATEGORY1" not in content) self.assertTrue("CATEGORY2" in content) url = reverse("only_category", args=["CATEGORY1"]) response = self.client.get(url) content = response.content.decode("utf-8") self.assertTrue("CATEGORY1" in content) self.assertTrue("CATEGORY2" not in content) for i in range(2, 21): BlogItem.objects.create( oid="t-%s" % i, title="TITLE-%s" % i, text="BLEBLE", display_format="structuredtext", pub_date=utc_now() - datetime.timedelta(seconds=20 + i), ) url = reverse("home") response = self.client.get(url) content = response.content.decode("utf-8") assert "/p2" in content visible_titles = [] not_visible_titles = [] for item in BlogItem.objects.all(): if item.title in content: visible_titles.append(item.title) else: not_visible_titles.append(item.title) url = reverse("home_paged", args=(2, )) response = self.client.get(url) content = response.content.decode("utf-8") batch_size = settings.HOMEPAGE_BATCH_SIZE for each in visible_titles[:batch_size]: assert each not in content for each in not_visible_titles[:batch_size]: assert each in content assert "/p3" in content
def test_homepage_cache_rendering(self):
    url = reverse('home')

    blog1 = BlogItem.objects.create(
        title='TITLE1',
        text='BLABLABLA',
        display_format='structuredtext',
        pub_date=utc_now() - datetime.timedelta(seconds=10),
    )
    BlogComment.objects.create(
        oid='c1',
        comment="textext",
        blogitem=blog1,
        approved=True,
    )
    BlogComment.objects.create(
        oid='c2',
        comment="tuxtuxt",
        blogitem=blog1,
        approved=True,
    )

    response = self.client.get(url)
    self.assertTrue('TITLE1' in response.content)
    self.assertTrue('2 comments' in response.content)

    blog1.title = 'TUTLE1'
    blog1.save()
    response = self.client.get(url)
    self.assertTrue('TUTLE1' in response.content)

    blog2 = BlogItem.objects.create(
        oid='t2',
        title='TATLE2',
        text='BLEBLE',
        display_format='structuredtext',
        pub_date=utc_now() - datetime.timedelta(seconds=1),
    )
    response = self.client.get(url)
    self.assertTrue('TATLE2' in response.content)
    self.assertTrue('0 comments' in response.content)
    self.assertTrue('TUTLE1' in response.content)
    self.assertTrue('2 comments' in response.content)

    # by categories only
    cat1 = Category.objects.create(
        name='CATEGORY1',
    )
    cat2 = Category.objects.create(
        name='CATEGORY2',
    )
    blog1.categories.add(cat1)
    blog1.save()
    blog2.categories.add(cat2)
    blog2.save()
    response = self.client.get(url)
    self.assertTrue('CATEGORY1' in response.content)
    self.assertTrue('CATEGORY2' in response.content)

    url = reverse('only_category', args=['CATEGORY2'])
    response = self.client.get(url)
    self.assertTrue('CATEGORY1' not in response.content)
    self.assertTrue('CATEGORY2' in response.content)

    url = reverse('only_category', args=['CATEGORY1'])
    response = self.client.get(url)
    self.assertTrue('CATEGORY1' in response.content)
    self.assertTrue('CATEGORY2' not in response.content)

    for i in range(2, 21):
        BlogItem.objects.create(
            oid='t-%s' % i,
            title='TITLE-%s' % i,
            text='BLEBLE',
            display_format='structuredtext',
            pub_date=utc_now() - datetime.timedelta(seconds=20 + i),
        )

    url = reverse('home')
    response = self.client.get(url)
    assert '?page=2' in response.content

    visible_titles = []
    not_visible_titles = []
    for item in BlogItem.objects.all():
        if item.title in response.content:
            visible_titles.append(item.title)
        else:
            not_visible_titles.append(item.title)

    response = self.client.get(url, {'page': 2})
    for each in visible_titles[:10]:
        assert each not in response.content
    for each in not_visible_titles[:10]:
        assert each in response.content
    assert '?page=1' in response.content
    assert '?page=3' in response.content
def sitemap(request):
    base_url = 'https://%s' % RequestSite(request).domain

    urls = []
    urls.append('<?xml version="1.0" encoding="iso-8859-1"?>')
    urls.append('<urlset xmlns="http://www.google.com/schemas/sitemap/0.84">')

    def add(loc, lastmod=None, changefreq='monthly', priority=None):
        url = '<url><loc>%s%s</loc>' % (base_url, loc)
        if lastmod:
            url += '<lastmod>%s</lastmod>' % lastmod.strftime('%Y-%m-%d')
        if priority:
            url += '<priority>%s</priority>' % priority
        if changefreq:
            url += '<changefreq>%s</changefreq>' % changefreq
        url += '</url>'
        urls.append(url)

    now = utc_now()
    latest_blogitem, = (
        BlogItem.objects
        .filter(pub_date__lt=now)
        .order_by('-pub_date')[:1]
    )
    add(
        '/',
        priority=1.0,
        changefreq='daily',
        lastmod=latest_blogitem.pub_date
    )
    add(reverse('about'), changefreq='weekly', priority=0.5)
    add(reverse('contact'), changefreq='weekly', priority=0.5)

    for blogitem in (
        BlogItem.objects
        .filter(pub_date__lt=now)
        .order_by('-pub_date')[:1000]
    ):
        if not blogitem.modify_date:
            # legacy!
            try:
                latest_comment, = (
                    BlogComment.objects
                    .filter(approved=True, blogitem=blogitem)
                    .order_by('-add_date')[:1]
                )
                blogitem.modify_date = latest_comment.add_date
            except ValueError:
                blogitem.modify_date = blogitem.pub_date
            blogitem._modify_date_set = True
            blogitem.save()
        age = (now - blogitem.modify_date).days
        if age < 14:
            changefreq = 'daily'
        elif age < 60:
            changefreq = 'weekly'
        elif age < 100:
            changefreq = 'monthly'
        else:
            changefreq = None
        add(
            reverse('blog_post', args=[blogitem.oid]),
            lastmod=blogitem.modify_date,
            changefreq=changefreq,
        )

    urls.append('</urlset>')
    return http.HttpResponse('\n'.join(urls), content_type="text/xml")
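# A standalone sketch (hypothetical helper, not in the original views)
# of the changefreq bucketing both sitemap() versions above use: the
# more recently a post was modified, the more often crawlers are told
# to revisit it.
def changefreq_for_age(age_days):
    if age_days < 14:
        return 'daily'
    elif age_days < 60:
        return 'weekly'
    elif age_days < 100:
        return 'monthly'
    return None  # very old posts get no <changefreq> element at all

assert changefreq_for_age(3) == 'daily'
assert changefreq_for_age(90) == 'monthly'
assert changefreq_for_age(365) is None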
def home(request, oc=None, page=1):
    context = {}
    qs = BlogItem.objects.filter(pub_date__lt=utc_now())
    if oc is not None:
        if not oc:  # empty string
            return redirect("/", permanent=True)
        categories = parse_ocs_to_categories(oc, strict_matching=True)
        cat_q = make_categories_q(categories)
        qs = qs.filter(cat_q)
        context["categories"] = categories

    # Reasons for not being here
    if request.method == "HEAD":
        return http.HttpResponse("")

    batch_size = settings.HOMEPAGE_BATCH_SIZE
    try:
        page = max(1, int(page)) - 1
    except ValueError:
        raise http.Http404("invalid page value")
    n, m = page * batch_size, (page + 1) * batch_size
    max_count = qs.count()
    if page * batch_size > max_count:
        return http.HttpResponse("Too far back in time\n", status=404)
    if (page + 1) * batch_size < max_count:
        context["next_page"] = page + 2
    context["previous_page"] = page

    # If you're going deep into the pagination with some really old
    # pages, it's not worth using the fs cache because if you have to
    # store a fs cache version for every single page from p5 to p55
    # it's too likely to get stale and old and it's too much work
    # on the mincss postprocess.
    if page > 6 or (context.get("categories") and page > 2):
        request._fscache_disable = True

    if context.get("categories"):
        oc_path = "/".join(["oc-{}".format(c.name) for c in context["categories"]])
        oc_path = oc_path[3:]

    if context.get("next_page"):
        if context.get("categories"):
            next_page_url = reverse(
                "only_category_paged", args=(oc_path, context["next_page"])
            )
        else:
            next_page_url = reverse("home_paged", args=(context["next_page"],))
        context["next_page_url"] = next_page_url

    if context["previous_page"] > 1:
        if context.get("categories"):
            previous_page_url = reverse(
                "only_category_paged", args=(oc_path, context["previous_page"])
            )
        else:
            previous_page_url = reverse("home_paged", args=(context["previous_page"],))
        context["previous_page_url"] = previous_page_url
    elif context["previous_page"]:  # i.e. == 1
        if context.get("categories"):
            previous_page_url = reverse("only_category", args=(oc_path,))
        else:
            previous_page_url = "/"
        context["previous_page_url"] = previous_page_url

    context["blogitems"] = (qs.prefetch_related("categories").order_by("-pub_date"))[
        n:m
    ]

    if page > 0:  # page starts on 0
        context["page_title"] = "Page {}".format(page + 1)

    approved_comments_count = {}
    blog_comments_count_qs = (
        BlogComment.objects.filter(blogitem__in=context["blogitems"], approved=True)
        .values("blogitem_id")
        .annotate(count=Count("blogitem_id"))
    )
    for count in blog_comments_count_qs:
        approved_comments_count[count["blogitem_id"]] = count["count"]
    context["approved_comments_count"] = approved_comments_count

    return render(request, "homepage/home.html", context)
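# A small sketch of the pagination window arithmetic both home()
# versions above share (hypothetical standalone helper): the 1-based
# page number from the URL becomes a 0-based index, and the slice
# [n:m] picks out that page's batch of posts.
def pagination_window(page, batch_size):
    page = max(1, int(page)) - 1  # 1-based URL value -> 0-based index
    return page * batch_size, (page + 1) * batch_size

assert pagination_window(1, 10) == (0, 10)   # front page
assert pagination_window(3, 10) == (20, 30)  # /p3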
def search(request):
    data = {}
    search = request.GET.get('q', '')
    if len(search) > 90:
        return http.HttpResponse("Search too long")
    documents = []
    data['base_url'] = 'https://%s' % RequestSite(request).domain
    tag_strip = re.compile('<[^>]+>')

    def append_match(item, words):
        text = item.rendered
        text = tag_strip.sub(' ', text)
        sentences = []

        def matcher(match):
            return '<b>%s</b>' % match.group()

        if regex:
            for each in regex.finditer(text):
                sentence = text[max(each.start() - 35, 0):each.end() + 40]
                sentence = regex_ext.sub(matcher, sentence)
                sentence = sentence.strip()
                if each.start() > 0 and not sentence[0].isupper():
                    sentence = '...%s' % sentence
                if each.end() < len(text):
                    sentence = '%s...' % sentence
                sentences.append(sentence.strip())
                if len(sentences) > 3:
                    break

        if isinstance(item, BlogItem):
            title = html_escape(item.title)
            if regex_ext:
                title = regex_ext.sub(matcher, title)
            date = item.pub_date
            type_ = 'blog'
        else:
            if not item.blogitem:
                item.correct_blogitem_parent()
            title = (
                "Comment on <em>%s</em>"
                % html_escape(item.blogitem.title)
            )
            date = item.add_date
            type_ = 'comment'

        documents.append({
            'title': title,
            'summary': '<br>'.join(sentences),
            'date': date,
            'url': item.get_absolute_url(),
            'type': type_,
        })

    def create_search(s):
        words = re.findall(r'\w+', s)
        words_orig = words[:]

        if 'or' in words:
            which = words.index('or')
            words_orig.remove('or')
            if (which + 1) < len(words) and which > 0:
                before = words.pop(which - 1)
                words.pop(which - 1)
                after = words.pop(which - 1)
                words.insert(which - 1, '%s | %s' % (before, after))
        while 'and' in words_orig:
            words_orig.remove('and')
        while 'and' in words:
            words.remove('and')

        escaped = ' & '.join(words)
        return escaped, words_orig

    data['q'] = search

    keyword_search = {}
    if len(search) > 1:
        _keyword_keys = ('keyword', 'keywords', 'category', 'categories')
        search, keyword_search = split_search(search, _keyword_keys)

    not_ids = defaultdict(set)
    times = []
    search_times = []
    count_documents = []
    regex = regex_ext = None

    def append_queryset_search(queryset, order_by, words, model_name):
        count = queryset.count()
        count_documents.append(count)
        for item in queryset.order_by(order_by)[:20]:
            append_match(item, words)
            not_ids[model_name].add(item.pk)
        return count

    now = utc_now()
    if len(search) > 1:
        search_escaped, words = create_search(search)
        regex = re.compile(
            r'\b(%s)' % '|'.join(
                re.escape(word)
                for word in words
                if word.lower() not in STOPWORDS
            ),
            re.I | re.U
        )
        regex_ext = re.compile(
            r'\b(%s\w*)\b' % '|'.join(
                re.escape(word)
                for word in words
                if word.lower() not in STOPWORDS
            ),
            re.I | re.U
        )

        for model in (BlogItem, BlogComment):
            qs = model.objects
            model_name = model._meta.object_name
            if model == BlogItem:
                qs = qs.filter(pub_date__lte=now)
                fields = ('title', 'text')
                order_by = '-pub_date'
                if keyword_search.get('keyword'):
                    qs = qs.filter(
                        proper_keywords__contains=[keyword_search['keyword']]
                    )
                if keyword_search.get('keywords'):
                    keywords = keyword_search['keywords']
                    keywords = [
                        x.strip()
                        for x in keywords.split(
                            ',' in keywords and ',' or None
                        )
                        if x.strip()
                    ]
                    qs = qs.filter(
                        proper_keywords__overlap=keywords
                    )
            elif model == BlogComment:
                fields = ('comment',)
                order_by = '-add_date'
                _specials = ('keyword', 'keywords', 'category', 'categories')
                if any(keyword_search.get(k) for k in _specials):
                    # BlogComments don't have this keyword so it can
                    # never match
                    continue

            for field in fields:
                if not_ids[model_name]:
                    qs = qs.exclude(pk__in=not_ids[model_name])
                _sql = "to_tsvector('english'," + field + ") "
                if ' | ' in search_escaped or ' & ' in search_escaped:
                    _sql += "@@ to_tsquery('english', %s)"
                else:
                    _sql += "@@ plainto_tsquery('english', %s)"
                items = qs.extra(where=[_sql], params=[search_escaped])

                t0 = time.time()
                count = append_queryset_search(
                    items, order_by, words, model_name
                )
                t1 = time.time()
                times.append('%s to find %s %ss by field %s' % (
                    t1 - t0, count, model_name, field
                ))
                search_times.append(t1 - t0)

        logger.info('Searching for %r:\n%s' % (search, '\n'.join(times)))
    elif keyword_search and any(keyword_search.values()):
        t0 = time.time()
        if keyword_search.get('keyword') or keyword_search.get('keywords'):
            if keyword_search.get('keyword'):
                assert isinstance(keyword_search['keyword'], basestring)
                items = BlogItem.objects.filter(
                    pub_date__lt=timezone.now(),
                    proper_keywords__contains=[keyword_search['keyword']]
                ).order_by('-pub_date')
            elif keyword_search.get('keywords'):
                keywords = keyword_search['keywords']
                keywords = [
                    x.strip()
                    for x in keywords.split(
                        ',' in keywords and ',' or None
                    )
                    if x.strip()
                ]
                items = BlogItem.objects.filter(
                    pub_date__lt=timezone.now(),
                    proper_keywords__overlap=keywords
                ).order_by('-pub_date')
            model_name = BlogItem._meta.object_name
            append_queryset_search(items, '-pub_date', [], model_name)
        if keyword_search.get('category') or keyword_search.get('categories'):
            if keyword_search.get('category'):
                categories = Category.objects.filter(
                    name=keyword_search.get('category')
                )
            else:
                cats = [
                    x.strip()
                    for x in keyword_search.get('categories').split(',')
                    if x.strip()
                ]
                categories = Category.objects.filter(name__in=cats)
            if categories:
                cat_q = make_categories_q(categories)
                items = BlogItem.objects.filter(cat_q)
                model_name = BlogItem._meta.object_name
                append_queryset_search(items, '-pub_date', [], model_name)
        t1 = time.time()
        search_times.append(t1 - t0)

    data['search_time'] = sum(search_times)
    count_documents_shown = len(documents)
    data['documents'] = documents
    data['count_documents'] = sum(count_documents)
    data['count_documents_shown'] = count_documents_shown
    data['better'] = None
    if not data['count_documents']:
        _qterms = len(data['q'].split())
        if ' or ' not in data['q'] and _qterms > 1 and _qterms < 5:
            data['better'] = data['q'].replace(' ', ' or ')
    if data['better']:
        data['better_url'] = (
            reverse('search') + '?' +
            urllib.urlencode({'q': data['better'].encode('utf-8')})
        )

    if not data['q']:
        page_title = 'Search'
    elif data['count_documents'] == 1:
        page_title = '1 thing found'
    else:
        page_title = '%s things found' % data['count_documents']
    if count_documents_shown < data['count_documents']:
        if count_documents_shown == 1:
            page_title += ' (but only 1 thing shown)'
        else:
            page_title += ' (but only %s things shown)' % count_documents_shown
    data['page_title'] = page_title

    if (
        not data['count_documents'] and
        len(search.split()) == 1 and
        not keyword_search
    ):
        if BlogItem.objects.filter(
            proper_keywords__overlap=[search],
            pub_date__lt=timezone.now()
        ):
            url = reverse('search')
            url += '?' + urllib.urlencode({'q': 'keyword:%s' % search})
            return redirect(url)

    return render(request, 'homepage/search.html', data)
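# A self-contained sketch of the query translation create_search()
# inside search() above performs (reimplemented here so it can run on
# its own): the word "or" between two terms becomes the tsquery "|"
# operator, "and" tokens are dropped, and remaining terms are joined
# with "&" before being handed to Postgres' to_tsquery().
import re

def tsquery_for(s):
    words = re.findall(r'\w+', s)
    if 'or' in words:
        which = words.index('or')
        if (which + 1) < len(words) and which > 0:
            before = words.pop(which - 1)
            words.pop(which - 1)  # drop the 'or' token itself
            after = words.pop(which - 1)
            words.insert(which - 1, '%s | %s' % (before, after))
    while 'and' in words:
        words.remove('and')
    return ' & '.join(words)

assert tsquery_for('python or django') == 'python | django'
assert tsquery_for('python and django') == 'python & django'
assert tsquery_for('fast python web') == 'fast & python & web'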