예제 #1
0
 def testThreeElementPipeline(self):
     """A three-element pipeline is applied in order.

     'and' is removed by the stop-word element, 'dogs' is rewritten to
     'fish' by the stupid element, and the wacky element reverses the
     term on lookup — so querying the reversed form 'hsif' finds it.
     """
     lexicon = Lexicon(Splitter(), StopWordPipelineElement({'and': 1}),
                       StupidPipelineElement('dogs', 'fish'),
                       WackyReversePipelineElement('fish'))
     # Populate the lexicon; the returned ids were unused, so the
     # pointless first binding of ``wids`` has been dropped.
     lexicon.sourceToWordIds('cats and dogs')
     wids = lexicon.termToWordIds('hsif')
     self.assertEqual(wids, [2])
예제 #2
0
 def testTwoElementPipeline(self):
     """Splitter output flows through both pipeline elements in order.

     'cats' is rewritten to 'fish', then reversed on lookup, so the
     query 'hsif' resolves to the wid that 'cats' received (1).
     """
     lexicon = Lexicon(Splitter(),
                       StupidPipelineElement('cats', 'fish'),
                       WackyReversePipelineElement('fish'))
     # Populate the lexicon; the returned ids were unused, so the
     # pointless first binding of ``wids`` has been dropped.
     lexicon.sourceToWordIds('cats and dogs')
     wids = lexicon.termToWordIds('hsif')
     self.assertEqual(wids, [1])
예제 #3
0
    def testTermToWordIdsWithProcess_post_glob(self):
        """This test is for added process_post_glob"""
        class AddedSplitter(Splitter):
            def process_post_glob(self, lst):
                # termToWordIds must route the query term through
                # process_post_glob before the wid lookup.
                assert lst == ['dogs']
                return ['dogs']

        lexicon = Lexicon(AddedSplitter())
        # Populate the lexicon; the returned ids were unused, so the
        # pointless first binding of ``wids`` has been dropped.
        lexicon.sourceToWordIds('cats and dogs')
        wids = lexicon.termToWordIds('dogs')
        self.assertEqual(wids, [3])
예제 #4
0
class IndexTest(TestCase):
    """Shared tests for text-index implementations.

    Subclasses must provide an ``IndexFactory`` class attribute naming
    the index constructor under test.

    Modernized for Python 3: ``self.assert_()`` (deprecated alias,
    removed in 3.12) is replaced by assertTrue/assertFalse, and
    ``dict.has_key()`` (removed in Python 3) by ``assertIn``, matching
    the style already used by the sibling ``IndexTest`` class.
    """

    def setUp(self):
        self.lexicon = Lexicon(Splitter())
        self.index = self.IndexFactory(self.lexicon)

    def test_index_document(self, DOCID=1):
        doc = "simple document contains five words"
        self.assertFalse(self.index.has_doc(DOCID))
        self.index.index_doc(DOCID, doc)
        self.assertTrue(self.index.has_doc(DOCID))
        self.assertTrue(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._docweight), 1)
        self.assertEqual(len(self.index._wordinfo), 5)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(DOCID)), 5)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.length())
        # ``mapping`` avoids shadowing the builtin ``map``.
        for mapping in self.index._wordinfo.values():
            self.assertEqual(len(mapping), 1)
            self.assertIn(DOCID, mapping)

    def test_unindex_document(self):
        DOCID = 1
        self.test_index_document(DOCID)
        self.index.unindex_doc(DOCID)
        self.assertEqual(len(self.index._docweight), 0)
        self.assertEqual(len(self.index._wordinfo), 0)
        self.assertEqual(len(self.index._docwords), 0)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.length())

    def test_index_two_documents(self):
        self.test_index_document()
        doc = "another document just four"
        DOCID = 2
        self.index.index_doc(DOCID, doc)
        self.assertTrue(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._docweight), 2)
        self.assertEqual(len(self.index._wordinfo), 8)
        self.assertEqual(len(self.index._docwords), 2)
        self.assertEqual(len(self.index.get_words(DOCID)), 4)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.length())
        wids = self.lexicon.termToWordIds("document")
        self.assertEqual(len(wids), 1)
        document_wid = wids[0]
        # Only the shared word "document" appears in both documents.
        for wid, mapping in self.index._wordinfo.items():
            if wid == document_wid:
                self.assertEqual(len(mapping), 2)
                self.assertIn(1, mapping)
                self.assertIn(DOCID, mapping)
            else:
                self.assertEqual(len(mapping), 1)

    def test_index_two_unindex_one(self):
        # index two documents, unindex one, and test the results
        self.test_index_two_documents()
        self.index.unindex_doc(1)
        DOCID = 2
        self.assertEqual(len(self.index._docweight), 1)
        self.assertTrue(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._wordinfo), 4)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(DOCID)), 4)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.length())
        for mapping in self.index._wordinfo.values():
            self.assertEqual(len(mapping), 1)
            self.assertIn(DOCID, mapping)

    def test_index_duplicated_words(self, DOCID=1):
        doc = "very simple repeat repeat repeat document test"
        self.index.index_doc(DOCID, doc)
        self.assertTrue(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._wordinfo), 5)
        self.assertEqual(len(self.index._docwords), 1)
        # 7 word positions but only 5 distinct words ("repeat" x3).
        self.assertEqual(len(self.index.get_words(DOCID)), 7)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.length())
        wids = self.lexicon.termToWordIds("repeat")
        self.assertEqual(len(wids), 1)
        # The unused ``repititive_wid`` local has been removed; every
        # word maps only to the single indexed document.
        for mapping in self.index._wordinfo.values():
            self.assertEqual(len(mapping), 1)
            self.assertIn(DOCID, mapping)

    def test_simple_query_oneresult(self):
        self.index.index_doc(1, 'not the same document')
        results = self.index.search("document")
        self.assertEqual(list(results.keys()), [1])

    def test_simple_query_noresults(self):
        self.index.index_doc(1, 'not the same document')
        results = self.index.search("frobnicate")
        self.assertEqual(list(results.keys()), [])

    def test_query_oneresult(self):
        self.index.index_doc(1, 'not the same document')
        self.index.index_doc(2, 'something about something else')
        results = self.index.search("document")
        self.assertEqual(list(results.keys()), [1])

    def test_search_phrase(self):
        # Doc 2 has the same words but not the contiguous phrase.
        self.index.index_doc(1, "the quick brown fox jumps over the lazy dog")
        self.index.index_doc(2, "the quick fox jumps lazy over the brown dog")
        results = self.index.search_phrase("quick brown fox")
        self.assertEqual(list(results.keys()), [1])

    def test_search_glob(self):
        self.index.index_doc(1, "how now brown cow")
        self.index.index_doc(2, "hough nough browne cough")
        self.index.index_doc(3, "bar brawl")
        results = self.index.search_glob("bro*")
        self.assertEqual(list(results.keys()), [1, 2])
        results = self.index.search_glob("b*")
        self.assertEqual(list(results.keys()), [1, 2, 3])
예제 #5
0
 def testMissingTermToWordIds(self):
     """A term absent from the lexicon maps to the sentinel wid 0."""
     lexicon = Lexicon(Splitter())
     # Populate the lexicon; the returned ids were unused, so the
     # pointless first binding of ``wids`` has been dropped.
     lexicon.sourceToWordIds('cats and dogs')
     wids = lexicon.termToWordIds('boxes')
     self.assertEqual(wids, [0])
예제 #6
0
 def testTermToWordIds(self):
     """A known term resolves to the wid it was assigned on indexing."""
     lexicon = Lexicon(Splitter())
     # Populate the lexicon; the returned ids were unused, so the
     # pointless first binding of ``wids`` has been dropped.
     lexicon.sourceToWordIds('cats and dogs')
     wids = lexicon.termToWordIds('dogs')
     self.assertEqual(wids, [3])
예제 #7
0
 def testSplitterAdaptorNofold(self):
     """Case is preserved: 'CATS' and 'cats' are distinct words.

     Lowercase 'cats' was never indexed so it maps to the sentinel 0,
     while 'and' and 'dogs' keep their assigned wids 2 and 3.
     """
     lexicon = Lexicon(Splitter())
     # Populate the lexicon; the returned ids were unused, so the
     # pointless first binding of ``wids`` has been dropped.
     lexicon.sourceToWordIds('CATS and dogs')
     wids = lexicon.termToWordIds('cats and dogs')
     self.assertEqual(wids, [0, 2, 3])
예제 #8
0
 def testOnePipelineElement(self):
     """A single pipeline element rewrites 'dogs' to 'fish' at both
     indexing and query time, so querying 'fish' finds the wid of 'dogs'.
     """
     lexicon = Lexicon(Splitter(), StupidPipelineElement('dogs', 'fish'))
     # Populate the lexicon; the returned ids were unused, so the
     # pointless first binding of ``wids`` has been dropped.
     lexicon.sourceToWordIds('cats and dogs')
     wids = lexicon.termToWordIds('fish')
     self.assertEqual(wids, [3])
예제 #9
0
class IndexTest(object):
    """Mix-in exercising the shared text-index contract.

    Concrete subclasses set the class attribute ``IndexFactory`` to the
    index constructor they want tested.
    """

    # Subclasses must override this with the index constructor.
    IndexFactory = None

    def setUp(self):
        self.lexicon = Lexicon(Splitter())
        self.index = self.IndexFactory(self.lexicon)

    def test_index_document(self, docid=1):
        text = 'simple document contains five words'
        self.assertFalse(self.index.has_doc(docid))
        self.index.index_doc(docid, text)
        self.assertTrue(self.index.has_doc(docid))
        self.assertTrue(self.index._docweight[docid])
        self.assertEqual(len(self.index._docweight), 1)
        self.assertEqual(
            len(self.index._docweight), self.index.document_count())
        self.assertEqual(len(self.index._wordinfo), 5)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(docid)), 5)
        self.assertEqual(len(self.index._wordinfo), self.index.length())
        # Each of the five distinct words occurs in exactly one doc.
        for entry in self.index._wordinfo.values():
            self.assertEqual(len(entry), 1)
            self.assertIn(docid, entry)

    def test_unindex_document(self):
        docid = 1
        self.test_index_document(docid)
        self.index.unindex_doc(docid)
        # All per-document bookkeeping must be gone again.
        self.assertEqual(len(self.index._docweight), 0)
        self.assertEqual(
            len(self.index._docweight), self.index.document_count())
        self.assertEqual(len(self.index._wordinfo), 0)
        self.assertEqual(len(self.index._docwords), 0)
        self.assertEqual(len(self.index._wordinfo), self.index.length())

    def test_index_two_documents(self):
        self.test_index_document()
        text = 'another document just four'
        docid = 2
        self.index.index_doc(docid, text)
        self.assertTrue(self.index._docweight[docid])
        self.assertEqual(len(self.index._docweight), 2)
        self.assertEqual(
            len(self.index._docweight), self.index.document_count())
        self.assertEqual(len(self.index._wordinfo), 8)
        self.assertEqual(len(self.index._docwords), 2)
        self.assertEqual(len(self.index.get_words(docid)), 4)
        self.assertEqual(len(self.index._wordinfo), self.index.length())
        wids = self.lexicon.termToWordIds('document')
        self.assertEqual(len(wids), 1)
        shared_wid = wids[0]
        # 'document' is the only word present in both documents.
        for wid, entry in self.index._wordinfo.items():
            if wid == shared_wid:
                self.assertEqual(len(entry), 2)
                self.assertIn(1, entry)
                self.assertIn(docid, entry)
            else:
                self.assertEqual(len(entry), 1)

    def test_index_two_unindex_one(self):
        # Index two documents, drop the first, then verify only the
        # second document's words remain.
        self.test_index_two_documents()
        self.index.unindex_doc(1)
        docid = 2
        self.assertEqual(len(self.index._docweight), 1)
        self.assertEqual(
            len(self.index._docweight), self.index.document_count())
        self.assertTrue(self.index._docweight[docid])
        self.assertEqual(len(self.index._wordinfo), 4)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(docid)), 4)
        self.assertEqual(len(self.index._wordinfo), self.index.length())
        for entry in self.index._wordinfo.values():
            self.assertEqual(len(entry), 1)
            self.assertIn(docid, entry)

    def test_index_duplicated_words(self, docid=1):
        text = 'very simple repeat repeat repeat document test'
        self.index.index_doc(docid, text)
        self.assertTrue(self.index._docweight[docid])
        # Five distinct words, seven word positions overall.
        self.assertEqual(len(self.index._wordinfo), 5)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(docid)), 7)
        self.assertEqual(len(self.index._wordinfo), self.index.length())
        self.assertEqual(
            len(self.index._docweight), self.index.document_count())
        wids = self.lexicon.termToWordIds('repeat')
        self.assertEqual(len(wids), 1)
        for entry in self.index._wordinfo.values():
            self.assertEqual(len(entry), 1)
            self.assertIn(docid, entry)

    def test_simple_query_oneresult(self):
        self.index.index_doc(1, 'not the same document')
        hits = self.index.search('document')
        self.assertEqual(list(hits.keys()), [1])

    def test_simple_query_noresults(self):
        self.index.index_doc(1, 'not the same document')
        hits = self.index.search('frobnicate')
        self.assertEqual(list(hits.keys()), [])

    def test_query_oneresult(self):
        self.index.index_doc(1, 'not the same document')
        self.index.index_doc(2, 'something about something else')
        hits = self.index.search('document')
        self.assertEqual(list(hits.keys()), [1])

    def test_search_phrase(self):
        # Doc 2 contains the same words but not the contiguous phrase.
        self.index.index_doc(1, 'the quick brown fox jumps over the lazy dog')
        self.index.index_doc(2, 'the quick fox jumps lazy over the brown dog')
        hits = self.index.search_phrase('quick brown fox')
        self.assertEqual(list(hits.keys()), [1])

    def test_search_glob(self):
        self.index.index_doc(1, 'how now brown cow')
        self.index.index_doc(2, 'hough nough browne cough')
        self.index.index_doc(3, 'bar brawl')
        hits = self.index.search_glob('bro*')
        self.assertEqual(list(hits.keys()), [1, 2])
        hits = self.index.search_glob('b*')
        self.assertEqual(list(hits.keys()), [1, 2, 3])
예제 #10
0
class IndexTest(TestCase):
    """Shared tests for text-index implementations.

    Subclasses must provide an ``IndexFactory`` class attribute naming
    the index constructor under test.

    Modernized for Python 3: ``self.assert_()`` (deprecated alias,
    removed in 3.12) is replaced by assertTrue/assertFalse, and
    ``dict.has_key()`` (removed in Python 3) by ``assertIn``.
    """

    def setUp(self):
        self.lexicon = Lexicon(Splitter())
        self.index = self.IndexFactory(self.lexicon)

    def test_index_document(self, DOCID=1):
        doc = "simple document contains five words"
        self.assertFalse(self.index.has_doc(DOCID))
        self.index.index_doc(DOCID, doc)
        self.assertTrue(self.index.has_doc(DOCID))
        self.assertTrue(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._docweight), 1)
        self.assertEqual(
            len(self.index._docweight), self.index.document_count())
        self.assertEqual(len(self.index._wordinfo), 5)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(DOCID)), 5)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.length())
        # ``mapping`` avoids shadowing the builtin ``map``.
        for mapping in self.index._wordinfo.values():
            self.assertEqual(len(mapping), 1)
            self.assertIn(DOCID, mapping)

    def test_unindex_document(self):
        DOCID = 1
        self.test_index_document(DOCID)
        self.index.unindex_doc(DOCID)
        self.assertEqual(len(self.index._docweight), 0)
        self.assertEqual(
            len(self.index._docweight), self.index.document_count())
        self.assertEqual(len(self.index._wordinfo), 0)
        self.assertEqual(len(self.index._docwords), 0)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.length())

    def test_index_two_documents(self):
        self.test_index_document()
        doc = "another document just four"
        DOCID = 2
        self.index.index_doc(DOCID, doc)
        self.assertTrue(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._docweight), 2)
        self.assertEqual(
            len(self.index._docweight), self.index.document_count())
        self.assertEqual(len(self.index._wordinfo), 8)
        self.assertEqual(len(self.index._docwords), 2)
        self.assertEqual(len(self.index.get_words(DOCID)), 4)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.length())
        wids = self.lexicon.termToWordIds("document")
        self.assertEqual(len(wids), 1)
        document_wid = wids[0]
        # Only the shared word "document" appears in both documents.
        for wid, mapping in self.index._wordinfo.items():
            if wid == document_wid:
                self.assertEqual(len(mapping), 2)
                self.assertIn(1, mapping)
                self.assertIn(DOCID, mapping)
            else:
                self.assertEqual(len(mapping), 1)

    def test_index_two_unindex_one(self):
        # index two documents, unindex one, and test the results
        self.test_index_two_documents()
        self.index.unindex_doc(1)
        DOCID = 2
        self.assertEqual(len(self.index._docweight), 1)
        self.assertEqual(
            len(self.index._docweight), self.index.document_count())
        self.assertTrue(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._wordinfo), 4)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(DOCID)), 4)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.length())
        for mapping in self.index._wordinfo.values():
            self.assertEqual(len(mapping), 1)
            self.assertIn(DOCID, mapping)

    def test_index_duplicated_words(self, DOCID=1):
        doc = "very simple repeat repeat repeat document test"
        self.index.index_doc(DOCID, doc)
        self.assertTrue(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._wordinfo), 5)
        self.assertEqual(len(self.index._docwords), 1)
        # 7 word positions but only 5 distinct words ("repeat" x3).
        self.assertEqual(len(self.index.get_words(DOCID)), 7)
        self.assertEqual(len(self.index._wordinfo),
                         self.index.length())
        self.assertEqual(
            len(self.index._docweight), self.index.document_count())
        wids = self.lexicon.termToWordIds("repeat")
        self.assertEqual(len(wids), 1)
        # The unused ``repititive_wid`` local has been removed; every
        # word maps only to the single indexed document.
        for mapping in self.index._wordinfo.values():
            self.assertEqual(len(mapping), 1)
            self.assertIn(DOCID, mapping)

    def test_simple_query_oneresult(self):
        self.index.index_doc(1, 'not the same document')
        results = self.index.search("document")
        self.assertEqual(list(results.keys()), [1])

    def test_simple_query_noresults(self):
        self.index.index_doc(1, 'not the same document')
        results = self.index.search("frobnicate")
        self.assertEqual(list(results.keys()), [])

    def test_query_oneresult(self):
        self.index.index_doc(1, 'not the same document')
        self.index.index_doc(2, 'something about something else')
        results = self.index.search("document")
        self.assertEqual(list(results.keys()), [1])

    def test_search_phrase(self):
        # Doc 2 has the same words but not the contiguous phrase.
        self.index.index_doc(1, "the quick brown fox jumps over the lazy dog")
        self.index.index_doc(2, "the quick fox jumps lazy over the brown dog")
        results = self.index.search_phrase("quick brown fox")
        self.assertEqual(list(results.keys()), [1])

    def test_search_glob(self):
        self.index.index_doc(1, "how now brown cow")
        self.index.index_doc(2, "hough nough browne cough")
        self.index.index_doc(3, "bar brawl")
        results = self.index.search_glob("bro*")
        self.assertEqual(list(results.keys()), [1, 2])
        results = self.index.search_glob("b*")
        self.assertEqual(list(results.keys()), [1, 2, 3])
예제 #11
0
 def testMissingTermToWordIds(self):
     """A term absent from the lexicon maps to the sentinel wid 0."""
     lexicon = Lexicon(Splitter())
     # Populate the lexicon; the returned ids were unused, so the
     # pointless first binding of ``wids`` has been dropped.
     lexicon.sourceToWordIds('cats and dogs')
     wids = lexicon.termToWordIds('boxes')
     self.assertEqual(wids, [0])
예제 #12
0
 def testTermToWordIds(self):
     """A known term resolves to the wid it was assigned on indexing."""
     lexicon = Lexicon(Splitter())
     # Populate the lexicon; the returned ids were unused, so the
     # pointless first binding of ``wids`` has been dropped.
     lexicon.sourceToWordIds('cats and dogs')
     wids = lexicon.termToWordIds('dogs')
     self.assertEqual(wids, [3])
예제 #13
0
 def testSplitterAdaptorNofold(self):
     """Case is preserved: 'CATS' and 'cats' are distinct words.

     Lowercase 'cats' was never indexed so it maps to the sentinel 0,
     while 'and' and 'dogs' keep their assigned wids 2 and 3.
     """
     lexicon = Lexicon(Splitter())
     # Populate the lexicon; the returned ids were unused, so the
     # pointless first binding of ``wids`` has been dropped.
     lexicon.sourceToWordIds('CATS and dogs')
     wids = lexicon.termToWordIds('cats and dogs')
     self.assertEqual(wids, [0, 2, 3])
예제 #14
0
 def testOnePipelineElement(self):
     """A single pipeline element rewrites 'dogs' to 'fish' at both
     indexing and query time, so querying 'fish' finds the wid of 'dogs'.
     """
     lexicon = Lexicon(Splitter(), StupidPipelineElement('dogs', 'fish'))
     # Populate the lexicon; the returned ids were unused, so the
     # pointless first binding of ``wids`` has been dropped.
     lexicon.sourceToWordIds('cats and dogs')
     wids = lexicon.termToWordIds('fish')
     self.assertEqual(wids, [3])