Beispiel #1
0
    def test_startswith_with_multiple_results_per_token(self):
        """
            Regression test for startswith searches where a single query
            token resolves to several stored tokens, e.g. searching for
            "test" matches both "testing" and "test".

            Document matching previously relied on token counts to decide
            whether a document satisfied the search string, which went wrong
            in that situation.
        """

        company_index = Index(name="test")

        documents = (
            CompanyDocument(company_name="Internal testing test", company_type="LLC"),
            CompanyDocument(company_name="My test", company_type="Ltd"),
        )
        for document in documents:
            company_index.add(document)

        hits = company_index.search("test ltd", CompanyDocument, use_startswith=True)
        matched_names = [hit.company_name for hit in hits]

        # Only the "Ltd" company is expected to match both query tokens
        self.assertEqual(len(matched_names), 1)
        self.assertEqual(matched_names[0], "My test")
Beispiel #2
0
    def test_removing_document(self):
        """
            Walk two indexes through a series of adds and removals, checking
            the per-index document counts and the overall TokenFieldIndex
            object count after each step.
        """
        class Doc(Document):
            text = fields.TextField()

        first_index = Index(name="index1")
        second_index = Index(name="index2")

        handle_one = first_index.add(Doc(text="One"))

        # A single field holding a single token so far
        self.assertEqual(TokenFieldIndex.objects.count(), 1)

        self.assertEqual(first_index.document_count(), 1)
        self.assertEqual(second_index.document_count(), 0)

        handle_two = first_index.add(Doc(text="Two"))

        # Two fields now, each contributing one token
        self.assertEqual(TokenFieldIndex.objects.count(), 2)

        self.assertEqual(first_index.document_count(), 2)
        self.assertEqual(second_index.document_count(), 0)

        handle_three = second_index.add(Doc(text="Three 3"))

        # Third field contributes two tokens, the others one each
        self.assertEqual(TokenFieldIndex.objects.count(), 4)

        self.assertEqual(first_index.document_count(), 2)
        self.assertEqual(second_index.document_count(), 1)

        # The first removal succeeds; repeating it reports nothing removed
        self.assertTrue(first_index.remove(handle_one))
        self.assertFalse(first_index.remove(handle_one))

        self.assertEqual(first_index.document_count(), 1)
        self.assertEqual(second_index.document_count(), 1)

        self.assertEqual(TokenFieldIndex.objects.count(), 3)

        self.assertFalse(list(first_index.search("text:One", Doc)))

        self.assertTrue(first_index.remove(handle_two))

        self.assertEqual(first_index.document_count(), 0)
        self.assertEqual(second_index.document_count(), 1)

        self.assertEqual(TokenFieldIndex.objects.count(), 2)

        self.assertFalse(list(first_index.search("text:Two", Doc)))

        self.assertTrue(second_index.remove(handle_three))

        self.assertEqual(first_index.document_count(), 0)
        self.assertEqual(second_index.document_count(), 0)

        self.assertEqual(TokenFieldIndex.objects.count(), 0)

        # Neither token of the last document should be searchable any more
        self.assertFalse(list(second_index.search("text:Three", Doc)))
        self.assertFalse(list(second_index.search("text:3", Doc)))
Beispiel #3
0
    def test_or_queries(self):
        """An explicit OR query should return both documents even with match_all=True."""
        class Doc(Document):
            text = fields.TextField()

        or_index = Index(name="test")
        for body in ("test string one", "test string two"):
            or_index.add(Doc(text=body))

        matches = list(or_index.search("one OR two", Doc, match_all=True))
        self.assertEqual(len(matches), 2)
Beispiel #4
0
    def test_search_queue_add_to_index_does_not_defer(self, defer_mock):
        """Adding a document once should persist it without the defer mock being called."""
        class Doc(Document):
            text = fields.TextField()

        search_index = Index("test")
        document = Doc(text="about")
        search_index.add(document)

        self.assertTrue(document.persisted)

        # Run any queued tasks before inspecting the mock
        self.process_task_queues()

        self.assertFalse(defer_mock.called)
Beispiel #5
0
    def test_pipe_not_indexed(self):
        """
            The | symbol is used when generating TokenFieldIndex keys,
            so it must never be indexed as a token itself.
        """
        class Doc(Document):
            name = fields.TextField()

        pipe_index = Index(name="test")
        piped_document = Doc(name="|| Pipes")
        pipe_index.add(piped_document)

        self.assertEqual(pipe_index.document_count(), 1)

        # The only token expected is "pipes"
        token_total = TokenFieldIndex.objects.count()
        self.assertEqual(token_total, 1)
Beispiel #6
0
    def test_datefield_querying(self):
        """Searching for a YYYY-MM-DD string should return only the document stored with that date."""
        class Doc(Document):
            datefield = fields.DateField()

        first_day = datetime(year=2020, month=1, day=1, hour=6, minute=15)
        following_day = first_day + timedelta(days=1)

        date_index = Index(name="test")
        for value in (first_day, following_day):
            date_index.add(Doc(datefield=value))

        found = list(date_index.search("2020-01-01", document_class=Doc))

        self.assertEqual(len(found), 1)
        self.assertEqual(found[0].datefield, first_day)
Beispiel #7
0
    def test_ordered_by_rank(self):
        """Results ordered by the "rank" field should come back lowest rank first."""
        class Doc(Document):
            text = fields.TextField()
            rank = fields.NumberField()

        ranked_index = Index(name="test")

        # index.add() return values are compared against result ids below
        id_rank_100 = ranked_index.add(Doc(text="test", rank=100))
        id_rank_50 = ranked_index.add(Doc(text="test", rank=50))
        id_rank_150 = ranked_index.add(Doc(text="test", rank=150))

        ordered = list(ranked_index.search("test", Doc, order_by="rank"))

        expected_ids = (id_rank_50, id_rank_100, id_rank_150)
        for position, expected_id in enumerate(expected_ids):
            self.assertEqual(ordered[position].id, expected_id)
    def test_indexing_text_fields(self):
        """
            Both documents should be found via a shared word (in either
            case) and via OR queries, with or without a field prefix.
        """
        class Doc(Document):
            text = fields.TextField()

        cheese_doc = Doc(text="This is a test. Cheese.")
        pickle_doc = Doc(text="This is also a test. Pickle.")

        text_index = Index(name="My Index")
        text_index.add(cheese_doc)
        text_index.add(pickle_doc)

        # Adding to the index is expected to have populated the IDs
        self.assertTrue(cheese_doc.id)
        self.assertTrue(pickle_doc.id)

        queries = (
            "test",
            "TEST",
            "cheese OR pickle",
            'cheese OR text:pickle',
        )

        # Every query is expected to return both documents, in any order
        for query in queries:
            found = list(text_index.search(query, document_class=Doc))

            self.assertCountEqual(
                [cheese_doc.id, pickle_doc.id],
                [hit.id for hit in found]
            )
Beispiel #9
0
    def test_search_queue_reindex_queue_override(self, defer_mock):
        """Re-adding a persisted document should call defer with the "default" queue and one shard."""
        class Doc(Document):
            text = fields.TextField()

        search_index = Index("test")
        document = Doc(text="about")

        search_index.add(document)
        # A second add of the same document updates its revision
        search_index.add(document)
        self.assertTrue(document.persisted)

        self.process_task_queues()

        # The three positional arguments are not inspected, only the keywords
        any_positional = (mock.ANY,) * 3
        defer_mock.assert_called_with(
            *any_positional,
            _queue="default",
            _shards=1
        )
    def test_stopwords_indexed(self):
        """
            Stop words are still indexed; they are meant to rank lower and
            to be left out of searches when match_stopwords is False.
        """

        class Doc(Document):
            text = fields.TextField()

        stopword_index = Index("test")
        stopword_doc = Doc(text="about")
        stopword_index.add(stopword_doc)

        exact_hits = list(stopword_index.search("about", Doc))
        self.assertTrue(exact_hits)

        prefix_hits = list(stopword_index.search("abo", Doc, use_startswith=True))
        self.assertTrue(prefix_hits)

        filtered_hits = list(stopword_index.search("about", Doc, match_stopwords=False))
        self.assertFalse(filtered_hits)

        # With startswith enabled the stopword filter is overridden, because
        # other tokens could begin with the stop word
        combined_hits = list(
            stopword_index.search("about", Doc, use_startswith=True, match_stopwords=False)
        )
        self.assertTrue(combined_hits)
Beispiel #11
0
    def test_trailing_period(self):
        """An abbreviation should match exactly one document whether or not the query keeps its trailing period."""
        class Doc(Document):
            text = fields.TextField()

        period_index = Index(name="test")
        for body in ("My company ltd.", "Company co."):
            period_index.add(Doc(text=body))

        for query in ("co", "co.", "ltd", "ltd."):
            found = list(period_index.search(query, Doc))
            self.assertEqual(len(found), 1)
Beispiel #12
0
    def test_default_ordering_is_sensible(self):
        """
            Expected ranking rules:

             - Matches on stopwords are the weakest
             - With startswith matching enabled, tokens closer to the
               searched term rank higher
        """

        class Doc(Document):
            text = fields.TextField()

            def __repr__(self):
                return "<Document %s>" % self.text

        ordering_index = Index(name="test")

        only_stopwords = Doc(text="all about you")  # every word is a stopword
        two_stopwords = Doc(text="ready to rumble")  # two stopwords
        no_stopwords = Doc(text="live forever")  # no stopwords at all
        one_stopword = Doc(text="live and let die")  # a single stopword
        ordering_index.add(
            [only_stopwords, two_stopwords, no_stopwords, one_stopword]
        )

        found = list(
            ordering_index.search("live to forever", Doc, match_all=False)
        )
        self.assertEqual(
            found,
            [
                no_stopwords,   # matches "live" and "forever"
                one_stopword,   # matches "live"
                two_stopwords,  # matches "to"
            ],
        )

        found = list(
            ordering_index.search("all about forever and", Doc, match_all=False)
        )
        self.assertEqual(
            found,
            [
                no_stopwords,    # matches "forever"
                only_stopwords,  # matches "all" and "about"
                one_stopword,    # matches "and"
            ],
        )
    def test_field_index_flag_respected(self):
        """A field declared with index=False should be stored on the document but not be searchable."""
        class Doc(Document):
            text = fields.TextField()
            other_text = fields.TextField(index=False)

        flag_index = Index("test")
        foo_indexed = Doc(text="foo", other_text="bar")
        bar_indexed = Doc(text="bar", other_text="foo")

        flag_index.add([foo_indexed, bar_indexed])

        # (query / indexed value, value held by the unindexed field)
        cases = (
            ("foo", "bar"),
            ("bar", "foo"),
        )
        for indexed_value, unindexed_value in cases:
            found = list(flag_index.search(indexed_value, Doc))
            self.assertEqual(len(found), 1)
            self.assertEqual(found[0].text, indexed_value)
            self.assertEqual(found[0].other_text, unindexed_value)
    def test_indexing_atom_fields(self):
        """
            Atom field values are expected to match only as a whole: a
            quoted phrase finds the document holding exactly that value, and
            a single word finds only the document whose entire value is that
            word.
        """
        class Doc(Document):
            atom = fields.AtomField()

        doc1 = Doc(atom="This is a test")
        doc2 = Doc(atom="This is also a test")
        doc3 = Doc(atom="This")

        index = Index(name="MyIndex")
        index.add(doc1)
        index.add(doc2)
        # doc3 has to be indexed as well, otherwise the 'This' lookup below
        # could never return it
        index.add(doc3)

        def search(query):
            # Pass the document class like the rest of this suite does.
            # NOTE(review): assumes Index.search needs it - confirm against
            # the Index API.
            return list(index.search(query, document_class=Doc))

        # Exact match, or exact field match should return doc1
        exact_field_hits = search('atom:"This is a test"')
        self.assertIn(doc1, exact_field_hits)
        self.assertNotIn(doc2, exact_field_hits)
        self.assertIn(doc1, search('"This is a test"'))

        # Partial match should only return exact atom matches
        single_word_hits = search('This')
        self.assertIn(doc3, single_word_hits)
        self.assertNotIn(doc1, single_word_hits)
        self.assertNotIn(doc2, single_word_hits)
Beispiel #15
0
    def test_startswith_matching(self):
        """A prefix query with use_startswith=True should return every company whose name has a token starting with it."""
        company_index = Index(name="test")

        company_names = ("Google", "Potato", "Facebook", "Potential Company")
        companies = [CompanyDocument(company_name=name) for name in company_names]
        for company in companies:
            company_index.add(company)

        # (prefix searched for, company names expected back in any order)
        expectations = (
            ("goo", ["Google"]),
            ("pot", ["Potato", "Potential Company"]),
            ("pota", ["Potato"]),
        )
        for prefix, expected_names in expectations:
            hits = company_index.search(
                prefix, document_class=CompanyDocument, use_startswith=True
            )
            found_names = [hit.company_name for hit in hits]
            self.assertCountEqual(found_names, expected_names)
Beispiel #16
0
    def test_startswith_multiple_tokens(self):
        """A multi-token startswith query should return only the company matching every token."""
        company_index = Index(name="test")

        company_details = (
            ("Google", "LLC"),
            ("Potato", "Ltd."),
            ("Facebook", "Inc."),
            ("Awesome", "LLC"),
            ("Google", "Ltd."),
        )
        companies = [
            CompanyDocument(company_name=name, company_type=kind)
            for name, kind in company_details
        ]
        for company in companies:
            company_index.add(company)

        # (query, (name, type) pairs expected back in any order)
        expectations = (
            ("goo llc", [("Google", "LLC")]),
            ("pot ltd", [("Potato", "Ltd.")]),
        )
        for query, expected_pairs in expectations:
            hits = company_index.search(
                query, document_class=CompanyDocument, use_startswith=True
            )
            found_pairs = [(hit.company_name, hit.company_type) for hit in hits]

            self.assertCountEqual(found_pairs, expected_pairs)
Beispiel #17
0
    def test_number_field_querying(self):
        """Number fields should match whole values only, with or without the field prefix."""
        class Doc(Document):
            number = fields.NumberField()

        number_index = Index(name="test")

        small_id = number_index.add(Doc(number=1))
        large_id = number_index.add(Doc(number=2341920))

        # "1" must not match the larger number, prefixed or not
        for query in ("number:1", "1"):
            found = list(number_index.search(query, document_class=Doc))

            self.assertEqual(len(found), 1)
            self.assertEqual(found[0].id, small_id)

        found = list(number_index.search("2341920", document_class=Doc))
        self.assertEqual(len(found), 1)
        self.assertEqual(found[0].id, large_id)
Beispiel #18
0
    def test_acronyms(self):
        """Dotted and hyphenated terms should be found by their dotted, hyphenated and joined spellings."""
        class Doc(Document):
            text = fields.TextField()

        acronym_index = Index(name="test")
        dotted_id = acronym_index.add(Doc(text="a.b.c"))
        numeric_id = acronym_index.add(Doc(text="1-2-3"))
        acronym_index.add(Doc(text="do-re-mi"))

        # (query, id of the single document expected back)
        cases = (
            ("a.b.c", dotted_id),
            ("abc", dotted_id),
            ("a-b-c", dotted_id),
            ("1-2-3", numeric_id),
        )
        for query, expected_id in cases:
            found = list(acronym_index.search(query, Doc))
            self.assertEqual(len(found), 1)
            self.assertEqual(found[0].id, expected_id)
Beispiel #19
0
    def test_document_revision(self):
        """
            Revisions exist to counter the problem that deletion from
            the index may take some time. The revision is replicated onto
            index entries so that new indexes can be created while old ones
            are being deleted.

            It doesn't protect against the eventual consistency of searching,
            it just means that we don't need to index inline.
        """
        class Doc(Document):
            text = fields.TextField()

        index = Index("test")
        doc1 = Doc(text="about")

        index.add(doc1)
        self.assertTrue(doc1.persisted)

        # Remember the revision set by the first add for later comparison
        rev = doc1.revision

        self.assertIsNotNone(rev)
        self.assertEqual(
            TokenFieldIndex.objects.filter(record_id=doc1.id).count(), 1)

        # Adding an existing document again will update the revision
        index.add(doc1)
        self.assertNotEqual(doc1.revision, rev)
        rev = doc1.revision

        # The entry for the previous revision is still present (2 in total),
        # but only one entry carries the current revision
        self.assertEqual(TokenFieldIndex.objects.count(), 2)
        self.assertEqual(
            TokenFieldIndex.objects.filter(record_id=doc1.id,
                                           revision=doc1.revision).count(), 1)

        # Remove then re-add should reset the revision
        self.assertEqual(index.remove(doc1), 1)

        index.add(doc1)
        self.assertNotEqual(doc1.revision, rev)

        # Entries from all three adds exist at this point; again only one of
        # them carries the current revision
        self.assertEqual(TokenFieldIndex.objects.count(), 3)
        self.assertEqual(
            TokenFieldIndex.objects.filter(record_id=doc1.id,
                                           revision=doc1.revision).count(), 1)

        # Clean up everything
        self.process_task_queues()

        # After the queued tasks have run, a single entry is expected to remain
        self.assertEqual(TokenFieldIndex.objects.count(), 1)
Beispiel #20
0
    def test_match_all_flag(self):
        """match_all=True requires every query term to match; match_all=False accepts any of them."""

        class Doc(Document):
            text = fields.TextField()

        flag_index = Index(name="test")
        id_one = flag_index.add(Doc(text="test string one"))
        id_two = flag_index.add(Doc(text="test string two"))

        # Both documents contain both terms
        found = list(flag_index.search("test string", Doc, match_all=True))
        self.assertEqual(len(found), 2)

        # (query, id of the only document containing every term)
        single_match_cases = (
            ("string one", id_one),
            ("test two", id_two),
        )
        for query, expected_id in single_match_cases:
            found = list(flag_index.search(query, Doc, match_all=True))
            self.assertEqual(len(found), 1)
            self.assertEqual(found[0].id, expected_id)

        # With match_all disabled the terms are ORed, so both come back
        found = list(flag_index.search("string one", Doc, match_all=False))
        self.assertEqual(len(found), 2)