def test_startswith_multiple_tokens(self):
    """
    A startswith search with two tokens is expected to return only the
    document whose name and type both match the given prefixes.
    """
    index = Index(name="test")

    documents = [
        CompanyDocument(company_name=name, company_type=kind)
        for name, kind in (
            ("Google", "LLC"),
            ("Potato", "Ltd."),
            ("Facebook", "Inc."),
            ("Awesome", "LLC"),
            ("Google", "Ltd."),
        )
    ]
    for document in documents:
        index.add(document)

    def name_and_type_for(query):
        # Reduce each hit to a comparable (name, type) pair
        hits = index.search(query, document_class=CompanyDocument, use_startswith=True)
        return [(hit.company_name, hit.company_type) for hit in hits]

    self.assertCountEqual(name_and_type_for("goo llc"), [("Google", "LLC")])
    self.assertCountEqual(name_and_type_for("pot ltd"), [("Potato", "Ltd.")])
def test_startswith_with_multiple_results_per_token(self):
    """
    A startswith match can yield several stored tokens for one input
    token: searching for "test" here matches both "testing" and "test".

    This used to cause a bug when document matching relied on token
    counts to decide whether a document satisfied the search string.
    """
    index = Index(name="test")

    noisy_doc = CompanyDocument(company_name="Internal testing test", company_type="LLC")
    wanted_doc = CompanyDocument(company_name="My test", company_type="Ltd")
    index.add(noisy_doc)
    index.add(wanted_doc)

    hits = index.search("test ltd", CompanyDocument, use_startswith=True)
    names = [hit.company_name for hit in hits]

    self.assertEqual(len(names), 1)
    self.assertEqual(names[0], "My test")
def test_or_queries(self):
    """
    An explicit OR in the query is expected to return documents matching
    either side, even when match_all=True is passed.
    """
    class Doc(Document):
        text = fields.TextField()

    index = Index(name="test")
    index.add(Doc(text="test string one"))
    index.add(Doc(text="test string two"))

    hits = list(index.search("one OR two", Doc, match_all=True))

    self.assertEqual(len(hits), 2)
def test_pipe_not_indexed(self):
    """
    The | symbol is used for TokenFieldIndex key generation, so it
    should never be indexed as a token.
    """
    class Doc(Document):
        name = fields.TextField()

    index = Index(name="test")
    index.add(Doc(name="|| Pipes"))

    self.assertEqual(index.document_count(), 1)
    # Only "pipes" should have produced an index entry
    self.assertEqual(TokenFieldIndex.objects.count(), 1)
def test_search_queue_add_to_index_does_not_defer(self, defer_mock):
    """
    Adding a brand new document should not trigger any deferred work.

    NOTE(review): defer_mock is presumably injected by a mock.patch
    decorator that is not visible in this chunk - confirm.
    """
    class Doc(Document):
        text = fields.TextField()

    index = Index("test")
    document = Doc(text="about")
    index.add(document)

    self.assertTrue(document.persisted)

    self.process_task_queues()
    self.assertFalse(defer_mock.called)
def test_null_validation(self):
    """
    Indexing None into a field declared with null=False should raise an
    IntegrityError, and a batch containing one invalid document should
    leave the index empty.
    """
    class Doc(Document):
        text = fields.TextField(null=False)

    index = Index("test")
    valid_doc = Doc(text="test")
    invalid_doc = Doc(text=None)

    with self.assertRaises(IntegrityError):
        index.add([valid_doc, invalid_doc])

    # Nothing should have been indexed, not even the valid document
    self.assertEqual(index.document_count(), 0)
def test_indexing_text_fields(self):
    """
    Text fields should be searchable by token, case-insensitively, via
    OR queries and via field-qualified terms.
    """
    class Doc(Document):
        text = fields.TextField()

    doc = Doc(text="This is a test. Cheese.")
    doc2 = Doc(text="This is also a test. Pickle.")

    index = Index(name="My Index")
    index.add(doc)
    index.add(doc2)

    # Adding to the index should have generated IDs
    self.assertTrue(doc.id)
    self.assertTrue(doc2.id)

    queries = (
        "test",
        "TEST",
        "cheese OR pickle",
        'cheese OR text:pickle',
    )
    for query in queries:
        hits = list(index.search(query, document_class=Doc))

        # Every query is expected to bring back both documents
        self.assertCountEqual(
            [doc.id, doc2.id],
            [hit.id for hit in hits]
        )
def index(cls):
    """
    Return the Index for this class.

    The name is the "index" attribute of whatever cls._meta() returns;
    when that is missing or empty, the class name is used instead.
    """
    meta = cls._meta()
    configured_name = getattr(meta, "index", "") if meta else None
    return Index(name=configured_name or cls.__name__)
def test_document_revision(self):
    """
    Revisions counter the problem that deleting from the index may take
    a while. The revision is copied onto the index entries so that new
    entries can be written while the old ones are still being removed.

    This does not protect against the eventual consistency of searching;
    it only means indexing does not have to happen inline.
    """
    class Doc(Document):
        text = fields.TextField()

    index = Index("test")
    document = Doc(text="about")

    def current_revision_entries():
        # Index entries belonging to the document's present revision
        return TokenFieldIndex.objects.filter(
            record_id=document.id, revision=document.revision
        ).count()

    index.add(document)
    self.assertTrue(document.persisted)

    previous_revision = document.revision
    self.assertIsNotNone(previous_revision)
    self.assertEqual(
        TokenFieldIndex.objects.filter(record_id=document.id).count(), 1
    )

    # Re-adding a document that already exists bumps the revision
    index.add(document)
    self.assertNotEqual(document.revision, previous_revision)
    previous_revision = document.revision

    self.assertEqual(TokenFieldIndex.objects.count(), 2)
    self.assertEqual(current_revision_entries(), 1)

    # Removing and then re-adding should give yet another revision
    self.assertEqual(index.remove(document), 1)
    index.add(document)
    self.assertNotEqual(document.revision, previous_revision)

    self.assertEqual(TokenFieldIndex.objects.count(), 3)
    self.assertEqual(current_revision_entries(), 1)

    # Processing the queues cleans up the stale entries
    self.process_task_queues()
    self.assertEqual(TokenFieldIndex.objects.count(), 1)
def test_trailing_period(self):
    """
    A token with a trailing period should be found whether or not the
    search term includes the period.
    """
    class Doc(Document):
        text = fields.TextField()

    index = Index(name="test")
    index.add(Doc(text="My company ltd."))
    index.add(Doc(text="Company co."))

    for query in ("co", "co.", "ltd", "ltd."):
        hits = list(index.search(query, Doc))
        self.assertEqual(len(hits), 1)
def test_indexing_atom_fields(self):
    """
    Atom fields are expected to match only on their complete value,
    never on an individual word inside a longer value.
    """
    class Doc(Document):
        atom = fields.AtomField()

    doc1 = Doc(atom="This is a test")
    doc2 = Doc(atom="This is also a test")
    doc3 = Doc(atom="This")

    index = Index(name="MyIndex")
    index.add(doc1)
    index.add(doc2)
    # doc3 has to be indexed as well, otherwise the 'This' search below
    # could never return it and the assertion would always fail
    index.add(doc3)

    # Exact match, or exact field match should return doc1
    self.assertIn(doc1, index.search('atom:"This is a test"'))
    self.assertNotIn(doc2, index.search('atom:"This is a test"'))
    self.assertIn(doc1, index.search('"This is a test"'))

    # Partial match should only return exact atom matches
    self.assertIn(doc3, index.search('This'))
    self.assertNotIn(doc1, index.search('This'))
    self.assertNotIn(doc2, index.search('This'))
def test_startswith_matching(self):
    """
    With use_startswith=True a search term should match every document
    containing a token that begins with it.
    """
    index = Index(name="test")

    documents = [
        CompanyDocument(company_name=name)
        for name in ("Google", "Potato", "Facebook", "Potential Company")
    ]
    for document in documents:
        index.add(document)

    def names_for(prefix):
        hits = index.search(prefix, document_class=CompanyDocument, use_startswith=True)
        return [hit.company_name for hit in hits]

    self.assertCountEqual(names_for("goo"), ["Google"])
    self.assertCountEqual(names_for("pot"), ["Potato", "Potential Company"])
    self.assertCountEqual(names_for("pota"), ["Potato"])
def test_datefield_querying(self):
    """
    Searching for a date string should return only the document whose
    DateField falls on that day.
    """
    class Doc(Document):
        datefield = fields.DateField()

    first_day = datetime(year=2020, month=1, day=1, hour=6, minute=15)
    following_day = first_day + timedelta(days=1)

    index = Index(name="test")
    index.add(Doc(datefield=first_day))
    index.add(Doc(datefield=following_day))

    hits = list(index.search("2020-01-01", document_class=Doc))

    self.assertEqual(len(hits), 1)
    self.assertEqual(hits[0].datefield, first_day)
def test_stopwords_indexed(self):
    """
    Stop words should still be indexed. They should rank lower, and be
    left out of searches when match_stopwords is False.
    """
    class Doc(Document):
        text = fields.TextField()

    index = Index("test")
    stopword_doc = Doc(text="about")
    index.add(stopword_doc)

    exact_hits = list(index.search("about", Doc))
    self.assertTrue(exact_hits)

    prefix_hits = list(index.search("abo", Doc, use_startswith=True))
    self.assertTrue(prefix_hits)

    filtered_hits = list(index.search("about", Doc, match_stopwords=False))
    self.assertFalse(filtered_hits)

    # Startswith matching takes precedence over stopword filtering,
    # because other tokens may begin with the stop word
    overridden_hits = list(
        index.search("about", Doc, use_startswith=True, match_stopwords=False)
    )
    self.assertTrue(overridden_hits)
def test_acronyms(self):
    """
    Punctuated acronyms should be found in their original form, in
    their joined form, and with a different separator.
    """
    class Doc(Document):
        text = fields.TextField()

    index = Index(name="test")
    letters_id = index.add(Doc(text="a.b.c"))
    digits_id = index.add(Doc(text="1-2-3"))
    index.add(Doc(text="do-re-mi"))

    hits = list(index.search("a.b.c", Doc))
    self.assertEqual(len(hits), 1)
    self.assertEqual(hits[0].id, letters_id)

    hits = list(index.search("abc", Doc))
    self.assertEqual(len(hits), 1)
    self.assertEqual(hits[0].id, letters_id)

    hits = list(index.search("a-b-c", Doc))
    self.assertEqual(len(hits), 1)
    self.assertEqual(hits[0].id, letters_id)

    hits = list(index.search("1-2-3", Doc))
    self.assertEqual(len(hits), 1)
    self.assertEqual(hits[0].id, digits_id)
def test_ordered_by_rank(self):
    """
    order_by="rank" should return the results in ascending order of
    the documents' rank field.
    """
    class Doc(Document):
        text = fields.TextField()
        rank = fields.NumberField()

    index = Index(name="test")
    middle_id = index.add(Doc(text="test", rank=100))
    lowest_id = index.add(Doc(text="test", rank=50))
    highest_id = index.add(Doc(text="test", rank=150))

    hits = list(index.search("test", Doc, order_by="rank"))

    self.assertEqual(hits[0].id, lowest_id)
    self.assertEqual(hits[1].id, middle_id)
    self.assertEqual(hits[2].id, highest_id)
def test_search_queue_reindex_queue_override(self, defer_mock):
    """
    Re-indexing an existing document should defer work onto the
    "default" queue with a single shard.

    NOTE(review): defer_mock and the queue setting presumably come from
    decorators that are not visible in this chunk - confirm.
    """
    class Doc(Document):
        text = fields.TextField()

    index = Index("test")
    document = Doc(text="about")
    index.add(document)

    # Adding the same document a second time updates its revision
    index.add(document)
    self.assertTrue(document.persisted)

    self.process_task_queues()
    defer_mock.assert_called_with(
        mock.ANY, mock.ANY, mock.ANY,
        _queue="default",
        _shards=1,
    )
def test_match_all_flag(self):
    """
    match_all=True should require every token to match, while
    match_all=False should accept a match on any token.
    """
    class Doc(Document):
        text = fields.TextField()

    index = Index(name="test")
    one_id = index.add(Doc(text="test string one"))
    two_id = index.add(Doc(text="test string two"))

    hits = list(index.search("test string", Doc, match_all=True))
    self.assertEqual(len(hits), 2)

    hits = list(index.search("string one", Doc, match_all=True))
    self.assertEqual(len(hits), 1)
    self.assertEqual(hits[0].id, one_id)

    hits = list(index.search("test two", Doc, match_all=True))
    self.assertEqual(len(hits), 1)
    self.assertEqual(hits[0].id, two_id)

    # With match_all disabled we fall back to OR behaviour, so both match
    hits = list(index.search("string one", Doc, match_all=False))
    self.assertEqual(len(hits), 2)
def test_default_ordering_is_sensible(self):
    """
    Expected ranking rules:

     - Matches on stopwords are the weakest
     - With startswith matching enabled, tokens closer to the searched
       term rank higher
    """
    class Doc(Document):
        text = fields.TextField()

        def __repr__(self):
            return "<Document %s>" % self.text

    index = Index(name="test")

    all_stopwords = Doc(text="all about you")      # All stopwords
    two_stopwords = Doc(text="ready to rumble")    # 2 stopwords
    no_stopwords = Doc(text="live forever")        # no stopwords
    one_stopword = Doc(text="live and let die")    # 1 stop word

    index.add([all_stopwords, two_stopwords, no_stopwords, one_stopword])

    hits = list(index.search("live to forever", Doc, match_all=False))
    self.assertEqual(
        hits,
        [
            no_stopwords,   # live forever
            one_stopword,   # live
            two_stopwords,  # to
        ],
    )

    hits = list(index.search("all about forever and", Doc, match_all=False))
    self.assertEqual(
        hits,
        [
            no_stopwords,   # live forever
            all_stopwords,  # all about
            one_stopword,   # and
        ],
    )
def test_field_index_flag_respected(self):
    """
    A field declared with index=False should be stored on the document
    but must not make the document findable by its content.
    """
    class Doc(Document):
        text = fields.TextField()
        other_text = fields.TextField(index=False)

    index = Index("test")
    foo_doc = Doc(text="foo", other_text="bar")
    bar_doc = Doc(text="bar", other_text="foo")
    index.add([foo_doc, bar_doc])

    hits = list(index.search("foo", Doc))
    self.assertEqual(len(hits), 1)
    only_hit = hits[0]
    self.assertEqual(only_hit.text, "foo")
    self.assertEqual(only_hit.other_text, "bar")

    hits = list(index.search("bar", Doc))
    self.assertEqual(len(hits), 1)
    only_hit = hits[0]
    self.assertEqual(only_hit.text, "bar")
    self.assertEqual(only_hit.other_text, "foo")
def test_number_field_querying(self):
    """
    Number fields should match only on the exact number, with or
    without the field name in the query.
    """
    class Doc(Document):
        number = fields.NumberField()

    index = Index(name="test")
    small_id = index.add(Doc(number=1))
    large_id = index.add(Doc(number=2341920))

    # A field-qualified query should return only the exact match
    hits = list(index.search("number:1", document_class=Doc))
    self.assertEqual(len(hits), 1)
    self.assertEqual(hits[0].id, small_id)

    # An unqualified query should also return only the exact match
    hits = list(index.search("1", document_class=Doc))
    self.assertEqual(len(hits), 1)
    self.assertEqual(hits[0].id, small_id)

    hits = list(index.search("2341920", document_class=Doc))
    self.assertEqual(len(hits), 1)
    self.assertEqual(hits[0].id, large_id)
def test_removing_document(self):
    """
    Removing a document should drop its index entries and lower the
    document count of its own index only, leaving other indexes alone.
    """
    class Doc(Document):
        text = fields.TextField()

    first_index = Index(name="index1")
    second_index = Index(name="index2")

    one_id = first_index.add(Doc(text="One"))

    # One document, one token
    self.assertEqual(TokenFieldIndex.objects.count(), 1)
    self.assertEqual(first_index.document_count(), 1)
    self.assertEqual(second_index.document_count(), 0)

    two_id = first_index.add(Doc(text="Two"))

    # Two documents, one token each
    self.assertEqual(TokenFieldIndex.objects.count(), 2)
    self.assertEqual(first_index.document_count(), 2)
    self.assertEqual(second_index.document_count(), 0)

    three_id = second_index.add(Doc(text="Three 3"))

    # Three documents, one token each except the last which has two
    self.assertEqual(TokenFieldIndex.objects.count(), 4)
    self.assertEqual(first_index.document_count(), 2)
    self.assertEqual(second_index.document_count(), 1)

    # Removing the same document a second time should report nothing removed
    self.assertTrue(first_index.remove(one_id))
    self.assertFalse(first_index.remove(one_id))

    self.assertEqual(first_index.document_count(), 1)
    self.assertEqual(second_index.document_count(), 1)
    self.assertEqual(TokenFieldIndex.objects.count(), 3)
    self.assertFalse(list(first_index.search("text:One", Doc)))

    self.assertTrue(first_index.remove(two_id))

    self.assertEqual(first_index.document_count(), 0)
    self.assertEqual(second_index.document_count(), 1)
    self.assertEqual(TokenFieldIndex.objects.count(), 2)
    self.assertFalse(list(first_index.search("text:Two", Doc)))

    self.assertTrue(second_index.remove(three_id))

    self.assertEqual(first_index.document_count(), 0)
    self.assertEqual(second_index.document_count(), 0)
    self.assertEqual(TokenFieldIndex.objects.count(), 0)
    self.assertFalse(list(second_index.search("text:Three", Doc)))
    self.assertFalse(list(second_index.search("text:3", Doc)))