Beispiel #1
0
    def test_startswith_multiple_tokens(self):
        index = Index(name="test")

        doc1 = CompanyDocument(company_name="Google", company_type="LLC")
        doc2 = CompanyDocument(company_name="Potato", company_type="Ltd.")
        doc3 = CompanyDocument(company_name="Facebook", company_type="Inc.")
        doc4 = CompanyDocument(company_name="Awesome", company_type="LLC")
        doc5 = CompanyDocument(company_name="Google", company_type="Ltd.")

        index.add(doc1)
        index.add(doc2)
        index.add(doc3)
        index.add(doc4)
        index.add(doc5)

        results = [
            (x.company_name, x.company_type)
            for x in index.search("goo llc", document_class=CompanyDocument, use_startswith=True)
        ]

        self.assertCountEqual(results, [("Google", "LLC")])

        results = [
            (x.company_name, x.company_type)
            for x in index.search("pot ltd", document_class=CompanyDocument, use_startswith=True)
        ]

        self.assertCountEqual(results, [("Potato", "Ltd.")])
Beispiel #2
0
    def test_removing_document(self):
        class Doc(Document):
            text = fields.TextField()

        i0 = Index(name="index1")
        i1 = Index(name="index2")

        d0 = i0.add(Doc(text="One"))

        # One field, one token
        self.assertEqual(TokenFieldIndex.objects.count(), 1)

        self.assertEqual(i0.document_count(), 1)
        self.assertEqual(i1.document_count(), 0)

        d1 = i0.add(Doc(text="Two"))

        # Two fields, one token each
        self.assertEqual(TokenFieldIndex.objects.count(), 2)

        self.assertEqual(i0.document_count(), 2)
        self.assertEqual(i1.document_count(), 0)

        d2 = i1.add(Doc(text="Three 3"))

        # Three fields, one token each except last which has 2
        self.assertEqual(TokenFieldIndex.objects.count(), 4)

        self.assertEqual(i0.document_count(), 2)
        self.assertEqual(i1.document_count(), 1)

        self.assertTrue(i0.remove(d0))
        self.assertFalse(i0.remove(d0))

        self.assertEqual(i0.document_count(), 1)
        self.assertEqual(i1.document_count(), 1)

        self.assertEqual(TokenFieldIndex.objects.count(), 3)

        self.assertFalse([x for x in i0.search("text:One", Doc)])

        self.assertTrue(i0.remove(d1))

        self.assertEqual(i0.document_count(), 0)
        self.assertEqual(i1.document_count(), 1)

        self.assertEqual(TokenFieldIndex.objects.count(), 2)

        self.assertFalse([x for x in i0.search("text:Two", Doc)])

        self.assertTrue(i1.remove(d2))

        self.assertEqual(i0.document_count(), 0)
        self.assertEqual(i1.document_count(), 0)

        self.assertEqual(TokenFieldIndex.objects.count(), 0)

        self.assertFalse([x for x in i1.search("text:Three", Doc)])
        self.assertFalse([x for x in i1.search("text:3", Doc)])
Beispiel #3
0
    def test_startswith_with_multiple_results_per_token(self):
        """
            The problem here is that doing startswith matches can return multiple
            matching tokens from the database, for a single input token. e.g.
            in this example searching for "test" will return matches for "testing" and "test.

            This caused a bug when document matching would use token counts to determine
            if a document matched a search string.
        """

        index = Index(name="test")

        doc1 = CompanyDocument(company_name="Internal testing test", company_type="LLC")
        doc2 = CompanyDocument(company_name="My test", company_type="Ltd")

        index.add(doc1)
        index.add(doc2)

        results = [
            x.company_name
            for x in index.search("test ltd", CompanyDocument, use_startswith=True)
        ]

        self.assertEqual(len(results), 1)
        self.assertEqual(results[0], "My test")
    def test_indexing_text_fields(self):
        class Doc(Document):
            text = fields.TextField()

        doc = Doc(text="This is a test. Cheese.")
        doc2 = Doc(text="This is also a test. Pickle.")

        index = Index(name="My Index")
        index.add(doc)
        index.add(doc2)

        # We should have some generated IDs now
        self.assertTrue(doc.id)
        self.assertTrue(doc2.id)

        results = [x for x in index.search("test", document_class=Doc)]

        # Both documents should have come back
        self.assertCountEqual(
            [doc.id, doc2.id],
            [x.id for x in results]
        )

        results = [x for x in index.search("TEST", document_class=Doc)]

        # Both documents should have come back
        self.assertCountEqual(
            [doc.id, doc2.id],
            [x.id for x in results]
        )

        results = [x for x in index.search("cheese OR pickle", document_class=Doc)]

        # Both documents should have come back
        self.assertCountEqual(
            [doc.id, doc2.id],
            [x.id for x in results]
        )

        results = [x for x in index.search('cheese OR text:pickle', document_class=Doc)]

        # Both documents should have come back
        self.assertCountEqual(
            [doc.id, doc2.id],
            [x.id for x in results]
        )
    def test_stopwords_indexed(self):
        """
            Stop words should be indexed. They should be ranked lower
            and not included in searches if match_stopwords is False
        """

        class Doc(Document):
            text = fields.TextField()

        index = Index("test")
        doc1 = Doc(text="about")
        index.add(doc1)

        self.assertTrue(list(index.search("about", Doc)))
        self.assertTrue(list(index.search("abo", Doc, use_startswith=True)))
        self.assertFalse(list(index.search("about", Doc, match_stopwords=False)))

        # Startswith matching overrides matching of stopwords (as other tokens may start with the stop word)
        self.assertTrue(list(index.search("about", Doc, use_startswith=True, match_stopwords=False)))
Beispiel #6
0
    def test_trailing_period(self):
        class Doc(Document):
            text = fields.TextField()

        index = Index(name="test")
        index.add(Doc(text="My company ltd."))
        index.add(Doc(text="Company co."))

        results = list(index.search("co", Doc))
        self.assertEqual(len(results), 1)

        results = list(index.search("co.", Doc))
        self.assertEqual(len(results), 1)

        results = list(index.search("ltd", Doc))
        self.assertEqual(len(results), 1)

        results = list(index.search("ltd.", Doc))
        self.assertEqual(len(results), 1)
Beispiel #7
0
    def test_or_queries(self):
        class Doc(Document):
            text = fields.TextField()

        index = Index(name="test")
        index.add(Doc(text="test string one"))
        index.add(Doc(text="test string two"))

        results = list(index.search("one OR two", Doc, match_all=True))
        self.assertEqual(len(results), 2)
Beispiel #8
0
    def test_default_ordering_is_sensible(self):
        """
            Ranking should be as follows:

             - Stopwords match weakest
             - When startswith matching is enabled, closer matches to the
               searched term will be stronger
        """

        class Doc(Document):
            text = fields.TextField()

            def __repr__(self):
                return "<Document %s>" % self.text

        index = Index(name="test")

        doc1 = Doc(text="all about you")  # All stopwords
        doc2 = Doc(text="ready to rumble")  # 2 stopwords
        doc3 = Doc(text="live forever")  # no stopwords
        doc4 = Doc(text="live and let die")  # 1 stop word
        index.add([doc1, doc2, doc3, doc4])

        results = list(index.search("live to forever", Doc, match_all=False))

        expected_order = [
            doc3,  # live forever
            doc4,  # live
            doc2,  # to
        ]

        self.assertEqual(results, expected_order)

        results = list(index.search("all about forever and", Doc, match_all=False))

        expected_order = [
            doc3,  # live forever
            doc1,  # all about
            doc4,  # and
        ]

        self.assertEqual(results, expected_order)
    def test_field_index_flag_respected(self):
        class Doc(Document):
            text = fields.TextField()
            other_text = fields.TextField(index=False)

        index = Index("test")
        doc1 = Doc(text="foo", other_text="bar")
        doc2 = Doc(text="bar", other_text="foo")

        index.add([doc1, doc2])

        results = list(index.search("foo", Doc))
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0].text, "foo")
        self.assertEqual(results[0].other_text, "bar")

        results = list(index.search("bar", Doc))
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0].text, "bar")
        self.assertEqual(results[0].other_text, "foo")
    def test_indexing_atom_fields(self):
        class Doc(Document):
            atom = fields.AtomField()

        doc1 = Doc(atom="This is a test")
        doc2 = Doc(atom="This is also a test")
        doc3 = Doc(atom="This")

        index = Index(name="MyIndex")
        index.add(doc1)
        index.add(doc2)

        # Exact match, or exact field match should return doc1
        self.assertTrue(doc1 in index.search('atom:"This is a test"'))
        self.assertFalse(doc2 in index.search('atom:"This is a test"'))
        self.assertTrue(doc1 in index.search('"This is a test"'))

        # Partial match should only return exact atom matches
        self.assertTrue(doc3 in index.search('This'))
        self.assertFalse(doc1 in index.search('This'))
        self.assertFalse(doc2 in index.search('This'))
Beispiel #11
0
    def test_startswith_matching(self):
        index = Index(name="test")

        doc1 = CompanyDocument(company_name="Google")
        doc2 = CompanyDocument(company_name="Potato")
        doc3 = CompanyDocument(company_name="Facebook")
        doc4 = CompanyDocument(company_name="Potential Company")

        index.add(doc1)
        index.add(doc2)
        index.add(doc3)
        index.add(doc4)

        results = [x.company_name for x in index.search("goo", document_class=CompanyDocument, use_startswith=True)]
        self.assertCountEqual(results, ["Google"])

        results = [x.company_name for x in index.search("pot", document_class=CompanyDocument, use_startswith=True)]
        self.assertCountEqual(results, ["Potato", "Potential Company"])

        results = [x.company_name for x in index.search("pota", document_class=CompanyDocument, use_startswith=True)]
        self.assertCountEqual(results, ["Potato"])
Beispiel #12
0
    def test_match_all_flag(self):

        class Doc(Document):
            text = fields.TextField()

        index = Index(name="test")
        doc1 = index.add(Doc(text="test string one"))
        doc2 = index.add(Doc(text="test string two"))

        results = list(index.search("test string", Doc, match_all=True))
        self.assertEqual(len(results), 2)

        results = list(index.search("string one", Doc, match_all=True))
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0].id, doc1)

        results = list(index.search("test two", Doc, match_all=True))
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0].id, doc2)

        # Should return both as we're defaulting to OR behaviour
        results = list(index.search("string one", Doc, match_all=False))
        self.assertEqual(len(results), 2)
Beispiel #13
0
    def test_datefield_querying(self):
        class Doc(Document):
            datefield = fields.DateField()

        date = datetime(year=2020, month=1, day=1, hour=6, minute=15)
        tomorrow = date + timedelta(days=1)

        index = Index(name="test")
        index.add(Doc(datefield=date))
        index.add(Doc(datefield=tomorrow))

        results = [x for x in index.search("2020-01-01", document_class=Doc)]
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0].datefield, date)
Beispiel #14
0
    def test_ordered_by_rank(self):
        class Doc(Document):
            text = fields.TextField()
            rank = fields.NumberField()

        index = Index(name="test")
        doc1 = index.add(Doc(text="test", rank=100))
        doc2 = index.add(Doc(text="test", rank=50))
        doc3 = index.add(Doc(text="test", rank=150))

        results = list(index.search("test", Doc, order_by="rank"))

        self.assertEqual(results[0].id, doc2)
        self.assertEqual(results[1].id, doc1)
        self.assertEqual(results[2].id, doc3)
Beispiel #15
0
    def test_acronyms(self):
        class Doc(Document):
            text = fields.TextField()

        index = Index(name="test")
        doc1 = index.add(Doc(text="a.b.c"))
        doc2 = index.add(Doc(text="1-2-3"))
        index.add(Doc(text="do-re-mi"))

        results = list(index.search("a.b.c", Doc))
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0].id, doc1)

        results = list(index.search("abc", Doc))
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0].id, doc1)

        results = list(index.search("a-b-c", Doc))
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0].id, doc1)

        results = list(index.search("1-2-3", Doc))
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0].id, doc2)
Beispiel #16
0
    def test_number_field_querying(self):
        class Doc(Document):
            number = fields.NumberField()

        index = Index(name="test")

        doc1 = index.add(Doc(number=1))
        doc2 = index.add(Doc(number=2341920))

        results = [x for x in index.search("number:1", document_class=Doc)]

        # Should only return the exact match
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0].id, doc1)

        results = [x for x in index.search("1", document_class=Doc)]

        # Should only return the exact match
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0].id, doc1)

        results = [x for x in index.search("2341920", document_class=Doc)]
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0].id, doc2)