def test_search_more_like(self):
    """A more_like search for d2 returns the two other documents, best match first."""
    d1 = Document.objects.create(
        title="invoice",
        content="the thing i bought at a shop and paid with bank account",
        checksum="A", pk=1)
    d2 = Document.objects.create(
        title="bank statement 1",
        content="things i paid for in august",
        pk=2, checksum="B")
    d3 = Document.objects.create(
        title="bank statement 3",
        content="things i paid for in september",
        pk=3, checksum="C")

    with AsyncWriter(index.open_index()) as writer:
        for doc in (d1, d2, d3):
            index.update_document(writer, doc)

    response = self.client.get(f"/api/search/?more_like={d2.id}")

    self.assertEqual(response.status_code, 200)

    hits = response.data['results']
    # The query document itself is excluded from the results.
    self.assertEqual(len(hits), 2)
    # d3 ranks above d1 — presumably because its content overlaps d2's more.
    self.assertEqual(hits[0]['id'], d3.id)
    self.assertEqual(hits[1]['id'], d1.id)
def test_search_invalid_page(self):
    """Out-of-range or non-numeric page parameters all fall back to page 1."""
    with index.open_index(False).writer() as writer:
        for i in range(15):
            doc = Document.objects.create(
                checksum=str(i), pk=i + 1,
                title=f"Document {i+1}", content="content")
            index.update_document(writer, doc)

    first_page = self.client.get(f"/api/search/?query=content&page=1").data
    second_page = self.client.get(f"/api/search/?query=content&page=2").data

    # Every one of these invalid values must behave exactly like page=1.
    for bad_value in ("0", "dgfd", "", "-7868"):
        fallback = self.client.get(
            f"/api/search/?query=content&page={bad_value}").data
        self.assertDictEqual(first_page, fallback)

    # Sanity check: page 2 really is different from page 1.
    self.assertNotEqual(
        len(first_page['results']), len(second_page['results']))
def test_auto_complete(self):
    """Autocomplete returns the expected completions for "tes" and honours limit."""
    doc1 = Document.objects.create(
        title="doc1", checksum="A", content="test test2 test3")
    doc2 = Document.objects.create(
        title="doc2", checksum="B", content="test test2")
    doc3 = Document.objects.create(
        title="doc3", checksum="C", content="test2")

    for doc in (doc1, doc2, doc3):
        index.add_or_update_document(doc)

    ix = index.open_index()

    expected = [b"test3", b"test", b"test2"]
    self.assertListEqual(index.autocomplete(ix, "tes"), expected)
    self.assertListEqual(index.autocomplete(ix, "tes", limit=3), expected)
    self.assertListEqual(index.autocomplete(ix, "tes", limit=1), [b"test3"])
    self.assertListEqual(index.autocomplete(ix, "tes", limit=0), [])
def index_reindex(progress_bar_disable=False):
    """Recreate the search index and add every document to it.

    Args:
        progress_bar_disable: when True, suppress the tqdm progress bar
            (useful for non-interactive/scripted runs). Defaults to False,
            preserving the original always-visible behavior, so existing
            zero-argument callers are unaffected.
    """
    documents = Document.objects.all()

    # recreate=True wipes the existing index before re-adding everything.
    ix = index.open_index(recreate=True)

    with AsyncWriter(ix) as writer:
        for document in tqdm.tqdm(documents, disable=progress_bar_disable):
            index.update_document(writer, document)
def index_reindex(progress_bar_disable=False):
    """Wipe and rebuild the full-text search index from all documents.

    Args:
        progress_bar_disable: pass True to hide the tqdm progress bar.
    """
    all_documents = Document.objects.all()
    fresh_index = index.open_index(recreate=True)

    with AsyncWriter(fresh_index) as writer:
        progress = tqdm.tqdm(all_documents, disable=progress_bar_disable)
        for document in progress:
            index.update_document(writer, document)
def delete(doc_ids):
    """Delete the given documents and remove them from the search index.

    Args:
        doc_ids: iterable of document primary keys to delete.

    Returns:
        The string "OK" once the database rows and index entries are gone.
    """
    Document.objects.filter(id__in=doc_ids).delete()

    ix = index.open_index()
    with AsyncWriter(ix) as writer:
        # Renamed from `id` so the loop variable no longer shadows the builtin.
        for doc_id in doc_ids:
            index.remove_document_by_id(writer, doc_id)

    return "OK"
def bulk_update_documents(document_ids):
    """Re-fire post_save for each given document, then refresh its index entry."""
    documents = Document.objects.filter(id__in=document_ids)
    ix = index.open_index()

    # Signal receivers run first, before the index is rewritten.
    for document in documents:
        post_save.send(Document, instance=document, created=False)

    with AsyncWriter(ix) as writer:
        for document in documents:
            index.update_document(writer, document)
def get(self, request, format=None):
    """Full-text search endpoint.

    Reads `query`, `more_like` and `page` from the query string; returns a
    paginated result payload, or an empty payload when neither `query` nor
    `more_like` is supplied. Query errors come back as HTTP 400.
    """
    from documents import index

    query = request.query_params.get('query')

    more_like_id = request.query_params.get('more_like')
    if more_like_id is not None:
        more_like_content = Document.objects.get(id=more_like_id).content
    else:
        more_like_content = None

    # Nothing to search for: return the canonical empty payload.
    if not query and not more_like_id:
        return Response({
            'count': 0,
            'page': 0,
            'page_count': 0,
            'corrected_query': None,
            'results': []
        })

    try:
        page = int(request.query_params.get('page', 1))
    except (ValueError, TypeError):
        page = 1
    # Zero or negative pages are clamped to the first page.
    page = max(page, 1)

    ix = index.open_index()

    try:
        with index.query_page(
                ix, page, query, more_like_id,
                more_like_content) as (result_page, corrected_query):
            return Response({
                'count': len(result_page),
                'page': result_page.pagenum,
                'page_count': result_page.pagecount,
                'corrected_query': corrected_query,
                'results': list(map(self.add_infos_to_hit, result_page))
            })
    except Exception as e:
        return HttpResponseBadRequest(str(e))
def test_search_spelling_correction(self):
    """A misspelled query yields a corrected_query; an exact term yields None."""
    with AsyncWriter(index.open_index()) as writer:
        for i in range(55):
            doc = Document.objects.create(
                checksum=str(i), pk=i+1, title=f"Document {i+1}",
                content=f"Things document {i+1}")
            index.update_document(writer, doc)

    # "thing" is close to the indexed term "things" -> correction suggested.
    data = self.client.get("/api/search/?query=thing").data
    self.assertEqual(data['corrected_query'], "things")

    # The exact term needs no correction.
    data = self.client.get("/api/search/?query=things").data
    self.assertEqual(data['corrected_query'], None)
def test_search(self):
    """Basic queries return the right hit counts and paging metadata."""
    d1 = Document.objects.create(
        title="invoice",
        content="the thing i bought at a shop and paid with bank account",
        checksum="A", pk=1)
    d2 = Document.objects.create(
        title="bank statement 1",
        content="things i paid for in august",
        pk=2, checksum="B")
    d3 = Document.objects.create(
        title="bank statement 3",
        content="things i paid for in september",
        pk=3, checksum="C")

    with index.open_index(False).writer() as writer:
        # Note to future self: there is a reason we dont use a model signal
        # handler to update the index: some operations edit many documents
        # at once (retagger, renamer) and we don't want to open a writer for
        # each of these, but rather perform the entire operation with one
        # writer. That's why we cant open the writer in a model on_save
        # handler or something.
        for doc in (d1, d2, d3):
            index.update_document(writer, doc)

    # (query term, expected number of hits); a hitless query reports
    # page 0 / page_count 0, everything else fits on a single page 1.
    cases = (("bank", 3), ("september", 1), ("statement", 2), ("sfegdfg", 0))
    for term, hits in cases:
        data = self.client.get(f"/api/search/?query={term}").data
        expected_page = 1 if hits else 0
        self.assertEqual(data['count'], hits)
        self.assertEqual(data['page'], expected_page)
        self.assertEqual(data['page_count'], expected_page)
        self.assertEqual(len(data['results']), hits)
def get(self, request, format=None):
    """Autocomplete endpoint: completions for `term`, capped at `limit`.

    `limit` defaults to 10 when absent. Responds with HTTP 400 when `term`
    is missing or `limit` is not a positive integer.
    """
    if 'term' in request.query_params:
        term = request.query_params['term']
    else:
        return HttpResponseBadRequest("Term required")

    if 'limit' in request.query_params:
        try:
            limit = int(request.query_params['limit'])
        except ValueError:
            # Previously a non-numeric limit raised an unhandled ValueError
            # (HTTP 500); report it as a client error instead.
            return HttpResponseBadRequest("Invalid limit")
        if limit <= 0:
            return HttpResponseBadRequest("Invalid limit")
    else:
        limit = 10

    from documents import index
    ix = index.open_index()

    return Response(index.autocomplete(ix, term, limit))
def test_search_multi_page(self):
    """Paging yields every document exactly once and clamps past-the-end pages."""
    with index.open_index(False).writer() as writer:
        for i in range(55):
            doc = Document.objects.create(
                checksum=str(i), pk=i + 1,
                title=f"Document {i+1}", content="content")
            index.update_document(writer, doc)

    # Collected across pages so we detect any document returned twice
    # (which could happen if the paging were broken).
    seen_ids = []

    def fetch(page):
        data = self.client.get(
            f"/api/search/?query=content&page={page}").data
        self.assertEqual(data['count'], 55)
        self.assertEqual(data['page_count'], 6)
        return data

    # Pages 1-5 are full pages of ten unique hits each.
    for page in range(1, 6):
        data = fetch(page)
        self.assertEqual(data['page'], page)
        self.assertEqual(len(data['results']), 10)
        for hit in data['results']:
            self.assertNotIn(hit['id'], seen_ids)
            seen_ids.append(hit['id'])

    # The last page carries the remaining five documents.
    data = fetch(6)
    self.assertEqual(data['page'], 6)
    self.assertEqual(len(data['results']), 5)
    for hit in data['results']:
        self.assertNotIn(hit['id'], seen_ids)
        seen_ids.append(hit['id'])

    # Requesting beyond the end clamps to the last page.
    data = fetch(7)
    self.assertEqual(data['page'], 6)
    self.assertEqual(len(data['results']), 5)
def __init__(self, *args, **kwargs):
    """Open the search index once when the view is instantiated."""
    super().__init__(*args, **kwargs)
    self.ix = index.open_index()
def index_optimize():
    """Run an optimizing commit on the search index through an AsyncWriter."""
    AsyncWriter(index.open_index()).commit(optimize=True)
def get_document_from_index(self, doc):
    """Return the stored index entry whose id matches *doc*'s id."""
    with index.open_index().searcher() as searcher:
        return searcher.document(id=doc.id)
def index_optimize():
    """Optimize the search index in place."""
    ix = index.open_index()
    ix.optimize()