class LuceneTestCase(SeecrTestCase):
    """Base test case providing a Lucene index in the test's temp directory.

    setUp records a snapshot of VM._dumpRefs(classes=True) before the index
    is created; tearDown closes the index, forces a garbage collection and
    asserts that a fresh snapshot contains no entries missing from the
    recorded one.
    """

    # NOTE(review): the default FieldRegistry() is evaluated once, when this
    # method is defined, so every test relying on the default shares the same
    # instance. Confirm that sharing is intended.
    def setUp(self, fieldRegistry=FieldRegistry()):
        """Create the reactor stub, default settings, index and observer."""
        super(LuceneTestCase, self).setUp()
        self._javaObjects = self._getJavaObjects()
        self._reactor = CallTrace(
            'reactor',
            methods={'addTimer': lambda seconds, callback: CallTrace('timer')},
        )
        self._defaultSettings = LuceneSettings(
            commitCount=1,
            commitTimeout=1,
            fieldRegistry=fieldRegistry,
        )
        indexDirectory = join(self.tempdir, 'lucene')
        self.lucene = Lucene(
            indexDirectory,
            reactor=self._reactor,
            settings=self._defaultSettings,
        )
        self.observer = CallTrace()
        self.lucene.addObserver(self.observer)

    def tearDown(self):
        """Close the index and verify that no new Java references remain."""
        try:
            # don't keep any references.
            self._reactor.calledMethods.reset()
            self.lucene.close()
            self.lucene = None
            gc.collect()
            leaked = self._getJavaObjects() - self._javaObjects
            self.assertEquals(0, len(leaked), leaked)
        finally:
            # The base class cleanup runs even when the leak check fails.
            SeecrTestCase.tearDown(self)

    def _getJavaObjects(self):
        """Return the set of (key, value) pairs from VM._dumpRefs(classes=True).

        Entries for the class names listed in ``ignored`` are left out.
        """
        ignored = (
            'class java.lang.Class',
            # Fields are kept in FieldRegistry for reusing
            'class org.apache.lucene.document.Field',
            'class org.apache.lucene.document.NumericDocValuesField',
            'class org.apache.lucene.facet.FacetsConfig',
        )
        refs = VM._dumpRefs(classes=True)
        snapshot = set()
        for className in refs.keys():
            if className in ignored:
                continue
            snapshot.add((className, refs[className]))
        return snapshot
class MultiLuceneTest(SeecrTestCase):
    """Tests for querying and joining several Lucene cores through MultiLucene.

    Fixture created in setUp (k1..k11 are the integers 1..11):

    coreA -- eight documents, one for every true/false combination of the
        fields M, Q and U. The identifier names the fields that are 'true'
        ('A' has none, 'A-MQU' has all). Key 'A' runs from k1 ('A') to k8
        ('A-MQU'); field S holds '1'..'8'. 'A-M' additionally has key C=k5.

    coreB -- eleven documents with the fields N, O and P.
        'B-N>A-M', 'B-N>A-MU', 'B-N>A-MQ' and 'B-N>A-MQU' have N='true' and
        key B=k5..k8, i.e. the 'A' keys of the coreA documents with M='true'.
        'B-P>A-M' .. 'B-P>A-MQU' have P='true' and the same keys k5..k8.
        'B-N' (k9), 'B' (k10) and 'B-P' (k11) have keys that no coreA
        document uses. 'B-N>A-M' additionally has key D=k5.

    coreC -- three documents: 'C-R' (C=k5, R='true'), 'C-S' (C=k8, S='true')
        and 'C-S2' (C=k7, S='false'). This core uses TermFrequencySimilarity.
    """

    def __init__(self, *args, **kwargs):
        super(MultiLuceneTest, self).__init__(*args, **kwargs)
        self._multithreaded = True

    def setUp(self):
        """Create the three cores, wire them into MultiLucene and fill them."""
        SeecrTestCase.setUp(self)
        settings = LuceneSettings(multithreaded=self._multithreaded, verbose=False)
        settingsLuceneC = LuceneSettings(multithreaded=self._multithreaded, verbose=False, similarity=TermFrequencySimilarity())

        self.luceneA = Lucene(join(self.tempdir, 'a'), name='coreA', reactor=CallTrace(), settings=settings)
        self.luceneB = Lucene(join(self.tempdir, 'b'), name='coreB', reactor=CallTrace(), settings=settings)
        self.luceneC = Lucene(join(self.tempdir, 'c'), name='coreC', reactor=CallTrace(), settings=settingsLuceneC)
        self.dna = be(
            (Observable(),
                (MultiLucene(defaultCore='coreA', multithreaded=self._multithreaded),
                    (self.luceneA,),
                    (self.luceneB,),
                    (self.luceneC,),
                )
            ))

        k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11 = range(1, 12)

        # coreA: every combination of M, Q and U.
        self.addDocument(self.luceneA, identifier='A', keys=[('A', k1)], fields=[('M', 'false'), ('Q', 'false'), ('U', 'false'), ('S', '1')])
        self.addDocument(self.luceneA, identifier='A-U', keys=[('A', k2)], fields=[('M', 'false'), ('Q', 'false'), ('U', 'true'), ('S', '2')])
        self.addDocument(self.luceneA, identifier='A-Q', keys=[('A', k3)], fields=[('M', 'false'), ('Q', 'true'), ('U', 'false'), ('S', '3')])
        self.addDocument(self.luceneA, identifier='A-QU', keys=[('A', k4)], fields=[('M', 'false'), ('Q', 'true'), ('U', 'true'), ('S', '4')])
        self.addDocument(self.luceneA, identifier='A-M', keys=[('A', k5), ('C', k5)], fields=[('M', 'true'), ('Q', 'false'), ('U', 'false'), ('S', '5')])
        self.addDocument(self.luceneA, identifier='A-MU', keys=[('A', k6)], fields=[('M', 'true'), ('Q', 'false'), ('U', 'true'), ('S', '6')])
        self.addDocument(self.luceneA, identifier='A-MQ', keys=[('A', k7)], fields=[('M', 'true'), ('Q', 'true'), ('U', 'false'), ('S', '7')])
        self.addDocument(self.luceneA, identifier='A-MQU', keys=[('A', k8)], fields=[('M', 'true'), ('Q', 'true'), ('U', 'true'), ('S', '8')])

        # coreB: keys k5..k8 coincide with the 'A' keys of the M='true' documents.
        self.addDocument(self.luceneB, identifier='B-N>A-M', keys=[('B', k5), ('D', k5)], fields=[('N', 'true'), ('O', 'true'), ('P', 'false')])
        self.addDocument(self.luceneB, identifier='B-N>A-MU', keys=[('B', k6)], fields=[('N', 'true'), ('O', 'false'), ('P', 'false')])
        self.addDocument(self.luceneB, identifier='B-N>A-MQ', keys=[('B', k7)], fields=[('N', 'true'), ('O', 'true'), ('P', 'false')])
        self.addDocument(self.luceneB, identifier='B-N>A-MQU', keys=[('B', k8)], fields=[('N', 'true'), ('O', 'false'), ('P', 'false')])
        self.addDocument(self.luceneB, identifier='B-N', keys=[('B', k9)], fields=[('N', 'true'), ('O', 'true'), ('P', 'false')])
        self.addDocument(self.luceneB, identifier='B', keys=[('B', k10)], fields=[('N', 'false'), ('O', 'false'), ('P', 'false')])
        self.addDocument(self.luceneB, identifier='B-P>A-M', keys=[('B', k5)], fields=[('N', 'false'), ('O', 'true'), ('P', 'true')])
        self.addDocument(self.luceneB, identifier='B-P>A-MU', keys=[('B', k6)], fields=[('N', 'false'), ('O', 'false'), ('P', 'true')])
        self.addDocument(self.luceneB, identifier='B-P>A-MQ', keys=[('B', k7)], fields=[('N', 'false'), ('O', 'false'), ('P', 'true')])
        self.addDocument(self.luceneB, identifier='B-P>A-MQU', keys=[('B', k8)], fields=[('N', 'false'), ('O', 'false'), ('P', 'true')])
        self.addDocument(self.luceneB, identifier='B-P', keys=[('B', k11)], fields=[('N', 'false'), ('O', 'true'), ('P', 'true')])

        # coreC
        self.addDocument(self.luceneC, identifier='C-R', keys=[('C', k5)], fields=[('R', 'true')])
        self.addDocument(self.luceneC, identifier='C-S', keys=[('C', k8)], fields=[('S', 'true')])
        self.addDocument(self.luceneC, identifier='C-S2', keys=[('C', k7)], fields=[('S', 'false')])

        self.luceneA._realCommit()
        self.luceneB._realCommit()
        self.luceneC._realCommit()
        # Only after the fixture has been committed is commitCount set to 1
        # on both settings objects.
        settings.commitCount = 1
        settingsLuceneC.commitCount = 1

    def tearDown(self):
        """Close every core opened in setUp, then run the base class cleanup."""
        try:
            # luceneC is opened in setUp as well, so it must be closed here too.
            for core in (self.luceneA, self.luceneB, self.luceneC):
                core.close()
        finally:
            # Always clean up the base class, also when closing a core fails.
            SeecrTestCase.tearDown(self)

    def hitIds(self, hits):
        """Return the set of ``id`` attributes of the given hits."""
        return set([hit.id for hit in hits])

    def testQueryOneIndex(self):
        """executeQuery without a core argument queries the default core (coreA)."""
        result = returnValueFromGenerator(self.dna.any.executeQuery(luceneQuery=luceneQueryFromCql('Q=true')))
        self.assertEquals(set(['A-Q', 'A-QU', 'A-MQ', 'A-MQU']), self.hitIds(result.hits))
        result = returnValueFromGenerator(self.dna.any.executeQuery(luceneQuery=luceneQueryFromCql('Q=true AND M=true')))
        self.assertEquals(set(['A-MQ', 'A-MQU']), self.hitIds(result.hits))

    def testQueryOneIndexWithComposedQuery(self):
        """A ComposedQuery on a single core, without and with a filter query."""
        cq = ComposedQuery('coreA')
        cq.setCoreQuery(core='coreA', query=luceneQueryFromCql('Q=true'))
        result = returnValueFromGenerator(self.dna.any.executeComposedQuery(cq))
        self.assertEquals(set(['A-Q', 'A-QU', 'A-MQ', 'A-MQU']), self.hitIds(result.hits))

        cq = ComposedQuery('coreA')
        cq.setCoreQuery(core='coreA', query=luceneQueryFromCql('Q=true'), filterQueries=[luceneQueryFromCql('M=true')])
        result = returnValueFromGenerator(self.dna.any.executeComposedQuery(cq))
        self.assertEquals(set(['A-MQ', 'A-MQU']), self.hitIds(result.hits))

    def testB_N_is_true(self):
        """Querying coreB directly for N=true yields its five N documents."""
        result = returnValueFromGenerator(self.dna.any.executeQuery(core='coreB', luceneQuery=luceneQueryFromCql('N=true')))
        self.assertEquals(5, result.total)
        self.assertEquals(set(['B-N', 'B-N>A-M', 'B-N>A-MU', 'B-N>A-MQ', 'B-N>A-MQU']), self.hitIds(result.hits))

    def testJoinQuery(self):
        """All coreA documents joined with the coreB documents matching N=true."""
        q = ComposedQuery('coreA', query=MatchAllDocsQuery())
        q.setCoreQuery(core='coreB', query=luceneQueryFromCql('N=true'))
        q.addMatch(dict(core='coreA', uniqueKey=KEY_PREFIX+'A'), dict(core='coreB', key=KEY_PREFIX+'B'))
        result = returnValueFromGenerator(self.dna.any.executeComposedQuery(q))
        self.assertEquals(4, result.total)
        self.assertEquals(set(['A-M', 'A-MU', 'A-MQ', 'A-MQU']), self.hitIds(result.hits))

    def testMultipleJoinQueriesKeepsCachesWithinMaxSize(self):
        """Run 25 different join queries; the test only requires that none raises."""
        for i in xrange(25):
            self.addDocument(self.luceneB, identifier=str(i), keys=[('X', i)], fields=[('Y', str(i))])
        for i in xrange(25):
            q = ComposedQuery('coreA', query=MatchAllDocsQuery())
            q.setCoreQuery(core='coreB', query=luceneQueryFromCql('Y=%s' % i))
            q.addMatch(dict(core='coreA', uniqueKey=KEY_PREFIX+'A'), dict(core='coreB', key=KEY_PREFIX+'X'))
            # The result itself is deliberately not inspected.
            ignoredResult = returnValueFromGenerator(self.dna.any.executeComposedQuery(q))

    def testJoinQueryWithFilters(self):
        """A filter query on coreB restricts the coreA result through the join."""
        q = ComposedQuery('coreA')
        q.addFilterQuery('coreB', query=luceneQueryFromCql('N=true'))
        q.addMatch(dict(core='coreA', uniqueKey=KEY_PREFIX+'A'), dict(core='coreB', key=KEY_PREFIX+'B'))
        result = returnValueFromGenerator(self.dna.any.executeComposedQuery(q))
        self.assertEquals(4, result.total)
        self.assertEquals(set(['A-M', 'A-MU', 'A-MQ', 'A-MQU']), self.hitIds(result.hits))

    def testJoinFacet(self):
        """Facets requested on coreB for a query on coreA."""
        q = ComposedQuery('coreA', query=luceneQueryFromCql('Q=true'))
        q.addFacet('coreB', dict(fieldname='cat_N', maxTerms=10))
        q.addFacet('coreB', dict(fieldname='cat_O', maxTerms=10))
        q.addMatch(dict(core='coreA', uniqueKey=KEY_PREFIX + 'A'), dict(core='coreB', key=KEY_PREFIX + 'B'))
        result = returnValueFromGenerator(self.dna.any.executeComposedQuery(query=q))
        self.assertEquals(4, result.total)
        self.assertEquals([{
                'terms': [
                    {'count': 2, 'term': u'true'},
                    {'count': 2, 'term': u'false'},
                ],
                'path': [],
                'fieldname': u'cat_N'
            }, {
                'terms': [
                    {'count': 3, 'term': u'false'},
                    {'count': 1, 'term': u'true'},
                ],
                'path': [],
                'fieldname': u'cat_O'
            }], result.drilldownData)

    def testJoinFacetWithDrilldownQueryFilters(self):
        """A drilldown query on coreA combined with a facet on coreB."""
        q = ComposedQuery('coreA', query=luceneQueryFromCql('M=true'))
        q.addDrilldownQuery('coreA', drilldownQuery=('cat_Q', ['true']))
        q.addFacet('coreB', dict(fieldname='cat_O', maxTerms=10))
        q.addMatch(dict(core='coreA', uniqueKey=KEY_PREFIX + 'A'), dict(core='coreB', key=KEY_PREFIX + 'B'))
        result = returnValueFromGenerator(self.dna.any.executeComposedQuery(query=q))
        self.assertEquals(2, result.total)
        self.assertEquals([{
                'terms': [
                    {'count': 3, 'term': u'false'},
                    {'count': 1, 'term': u'true'},
                ],
                'path': [],
                'fieldname': u'cat_O'
            }], result.drilldownData)

    def testJoinFacetWithJoinDrilldownQueryFilters(self):
        """A drilldown query and a facet that are both on the joined core (coreB)."""
        q = ComposedQuery('coreA', query=luceneQueryFromCql('M=true'))
        q.addDrilldownQuery('coreB', drilldownQuery=('cat_O', ['true']))
        q.addFacet('coreB', dict(fieldname='cat_O', maxTerms=10))
        q.addMatch(dict(core='coreA', uniqueKey=KEY_PREFIX + 'A'), dict(core='coreB', key=KEY_PREFIX + 'B'))
        result = returnValueFromGenerator(self.dna.any.executeComposedQuery(query=q))
        self.assertEquals(2, result.total)
        self.assertEquals([{
                'terms': [
                    {'count': 3, 'term': u'true'},
                ],
                'path': [],
                'fieldname': u'cat_O'
            }], result.drilldownData)

    def testJoinDrilldownQueryFilters(self):
        """A drilldown query on coreB restricts the coreA result through the join."""
        q = ComposedQuery('coreA', query=luceneQueryFromCql('M=true'))
        q.addDrilldownQuery('coreB', drilldownQuery=('cat_O', ['true']))
        q.addMatch(dict(core='coreA', uniqueKey=KEY_PREFIX + 'A'), dict(core='coreB', key=KEY_PREFIX + 'B'))
        result = returnValueFromGenerator(self.dna.any.executeComposedQuery(query=q))
        self.assertEquals(2, result.total)

    def testJoinFacetWithFilter(self):
        """A filter query on coreA combined with a facet on coreB."""
        q = ComposedQuery('coreA', query=luceneQueryFromCql('M=true'))
        q.addFilterQuery('coreA', query=luceneQueryFromCql('Q=true'))
        q.addFacet('coreB', dict(fieldname='cat_O', maxTerms=10))
        q.addMatch(dict(core='coreA', uniqueKey=KEY_PREFIX + 'A'), dict(core='coreB', key=KEY_PREFIX + 'B'))
        result = returnValueFromGenerator(self.dna.any.executeComposedQuery(query=q))
        self.assertEquals(2, result.total)
        self.assertEquals([{
                'terms': [
                    {'count': 3, 'term': u'false'},
                    {'count': 1, 'term': u'true'},
                ],
                'path': [],
                'fieldname': u'cat_O'
            }], result.drilldownData)

    def testJoinFacetFromBPointOfView(self):
        """addMatch raises ValueError when the result core has no uniqueKey."""
        q = ComposedQuery('coreB')
        q.setCoreQuery(core='coreA', query=luceneQueryFromCql('Q=true'))
        q.setCoreQuery(core='coreB', query=None, facets=[
                dict(fieldname='cat_N', maxTerms=10),
                dict(fieldname='cat_O', maxTerms=10),
            ])
        try:
            q.addMatch(dict(core='coreA', uniqueKey=KEY_PREFIX + 'A'), dict(core='coreB', key=KEY_PREFIX + 'B'))
        except ValueError as e:
            self.assertEquals("Match for result core 'coreB' must have a uniqueKey specification.", str(e))
            return

        # for future reference
        # NOTE(review): this part only runs when addMatch does not raise, and
        # 'result' is never assigned in this method, so it would currently end
        # in a NameError.
        self.assertEquals(4, result.total)
        self.assertEquals(set(['B-N>A-MQ', 'B-N>A-MQU', 'B-P>A-MQ', 'B-P>A-MQU']), self.hitIds(result.hits))
        self.assertEquals([{
                'terms': [
                    {'count': 2, 'term': u'false'},
                    {'count': 2, 'term': u'true'},
                ],
                'fieldname': u'cat_N'
            }, {
                'terms': [
                    {'count': 2, 'term': u'false'},
                    {'count': 2, 'term': u'true'},
                ],
                'fieldname': u'cat_O'
            }], result.drilldownData)
class DeDupFilterCollectorTest(SeecrTestCase):
    """Tests for DeDupFilterSuperCollector wrapped around a TopScoreDocSuperCollector."""

    def setUp(self):
        super(DeDupFilterCollectorTest, self).setUp()
        self._reactor = CallTrace('reactor')
        self.lucene = Lucene(
            self.tempdir,
            reactor=self._reactor,
            settings=LuceneSettings(commitCount=1, verbose=False),
        )

    def tearDown(self):
        self.lucene.close()
        super(DeDupFilterCollectorTest, self).tearDown()

    def _addDocument(self, identifier, isformatof, sort=None):
        """Add and commit a document; falsy isformatof/sort values add no field."""
        document = Document()
        for fieldname, value in (("__isformatof__", isformatof), ("__sort__", sort)):
            if value:
                document.add(NumericDocValuesField(fieldname, long(value)))
        consume(self.lucene.addDocument(identifier, document))
        # Explicitly, not required: since commitCount=1.
        self.lucene.commit()

    def _searchAllDeDuplicated(self, keyFieldName):
        """Run a match-all search through a de-dup collector keyed on keyFieldName.

        Returns the pair (top docs collector, de-dup collector).
        """
        topCollector = TopScoreDocSuperCollector(100, True)
        deDupCollector = DeDupFilterSuperCollector(keyFieldName, "__sort__", topCollector)
        self.lucene.search(query=MatchAllDocsQuery(), collector=deDupCollector)
        return topCollector, deDupCollector

    def testCollectorTransparentlyDelegatesToNextCollector(self):
        self._addDocument("urn:1", 2)
        topCollector, _ = self._searchAllDeDuplicated("__isformatof__")
        self.assertEquals(1, topCollector.topDocs(0).totalHits)

    def testCollectorFiltersTwoSimilar(self):
        """Two documents with the same __isformatof__ value give a single hit."""
        self._addDocument("urn:1", 2, 1)
        self._addDocument("urn:2", 2, 2)
        topCollector, deDupCollector = self._searchAllDeDuplicated("__isformatof__")
        found = topCollector.topDocs(0)
        self.assertEquals(1, found.totalHits)
        self.assertEquals(1, len(found.scoreDocs))
        key = deDupCollector.keyForDocId(found.scoreDocs[0].doc)
        document = self.lucene._index.getDocument(key.getDocId())
        self.assertEquals('urn:2', document.get(IDFIELD))
        self.assertEquals(2, key.count)

    def testCollectorFiltersTwoTimesTwoSimilarOneNot(self):
        self._addDocument("urn:1", 1, 2001)
        self._addDocument("urn:2", 3, 2009)  # result 2x
        self._addDocument("urn:3", 50, 2010)  # result 1x
        self._addDocument("urn:4", 3, 2001)
        self._addDocument("urn:5", 1, 2009)  # result 2x
        # expected: "urn:2", "urn:3" and "urn:5" in no particular order
        topCollector, deDupCollector = self._searchAllDeDuplicated("__isformatof__")
        found = topCollector.topDocs(0)
        self.assertEquals(3, found.totalHits)
        self.assertEquals(3, len(found.scoreDocs))
        rawDocIds = [scoreDoc.doc for scoreDoc in found.scoreDocs]
        netDocIds = [deDupCollector.keyForDocId(rawDocId).docId for rawDocId in rawDocIds]
        identifiers = set([
            self.lucene._index.getDocument(netDocId).get(IDFIELD)
            for netDocId in netDocIds
        ])
        self.assertEquals(set(["urn:2", "urn:3", "urn:5"]), identifiers)
        counts = sorted(deDupCollector.keyForDocId(netDocId).count for netDocId in netDocIds)
        self.assertEquals([1, 2, 2], counts)

    def testSilentyYieldsWrongResultWhenFieldNameDoesNotMatch(self):
        self._addDocument("urn:1", 2)
        topCollector, _ = self._searchAllDeDuplicated("__wrong_field__")
        self.assertEquals(1, topCollector.topDocs(0).totalHits)

    def testShouldAddResultsWithoutIsFormatOf(self):
        """Documents without __isformatof__ must all be present in the result."""
        # urn:1 and urn:3 share an __isformatof__ value; the other nine
        # documents have none. Ten hits are expected.
        self._addDocument("urn:1", 2)
        self._addDocument("urn:2", None)
        self._addDocument("urn:3", 2)
        for suffix in ("4", "5", "6", "7", "8", "9", "A"):
            self._addDocument("urn:" + suffix, None)
        self._addDocument("urn:B", None)  # trigger a merge
        topCollector, _ = self._searchAllDeDuplicated("__isformatof__")
        self.assertEquals(10, topCollector.topDocs(0).totalHits)
# NOTE(review): a class named DeDupFilterCollectorTest is defined more than
# once in this file; within a single module the later definition replaces the
# earlier one. Confirm whether the duplication is intended.
class DeDupFilterCollectorTest(SeecrTestCase):
    """Exercises DeDupFilterSuperCollector against a real Lucene index in tempdir."""

    def setUp(self):
        super(DeDupFilterCollectorTest, self).setUp()
        self._reactor = CallTrace('reactor')
        luceneSettings = LuceneSettings(commitCount=1, verbose=False)
        self.lucene = Lucene(self.tempdir, settings=luceneSettings, reactor=self._reactor)

    def tearDown(self):
        self.lucene.close()
        super(DeDupFilterCollectorTest, self).tearDown()

    def testCollectorTransparentlyDelegatesToNextCollector(self):
        self._addDocument("urn:1", 2)
        topDocsCollector = TopScoreDocSuperCollector(100, True)
        dedup = DeDupFilterSuperCollector("__isformatof__", "__sort__", topDocsCollector)
        self.lucene.search(query=MatchAllDocsQuery(), collector=dedup)
        totalHits = topDocsCollector.topDocs(0).totalHits
        self.assertEquals(1, totalHits)

    def _addDocument(self, identifier, isformatof, sort=None):
        """Index one document and commit.

        The __isformatof__ and __sort__ doc-values fields are only added
        when the corresponding argument is truthy.
        """
        newDocument = Document()
        if isformatof:
            formatOfField = NumericDocValuesField("__isformatof__", long(isformatof))
            newDocument.add(formatOfField)
        if sort:
            sortField = NumericDocValuesField("__sort__", long(sort))
            newDocument.add(sortField)
        consume(self.lucene.addDocument(identifier, newDocument))
        self.lucene.commit()  # Explicitly, not required: since commitCount=1.

    def testCollectorFiltersTwoSimilar(self):
        self._addDocument("urn:1", 2, 1)
        self._addDocument("urn:2", 2, 2)
        topDocsCollector = TopScoreDocSuperCollector(100, True)
        dedup = DeDupFilterSuperCollector("__isformatof__", "__sort__", topDocsCollector)
        self.lucene.search(query=MatchAllDocsQuery(), collector=dedup)
        hits = topDocsCollector.topDocs(0)
        self.assertEquals(1, hits.totalHits)
        self.assertEquals(1, len(hits.scoreDocs))
        # Only one of the two documents sharing __isformatof__ is reported;
        # its key is expected to count both.
        firstHit = hits.scoreDocs[0]
        key = dedup.keyForDocId(firstHit.doc)
        storedDocument = self.lucene._index.getDocument(key.getDocId())
        identifier = storedDocument.get(IDFIELD)
        self.assertEquals('urn:2', identifier)
        self.assertEquals(2, key.count)

    def testCollectorFiltersTwoTimesTwoSimilarOneNot(self):
        self._addDocument("urn:1", 1, 2001)
        self._addDocument("urn:2", 3, 2009)  # result 2x
        self._addDocument("urn:3", 50, 2010)  # result 1x
        self._addDocument("urn:4", 3, 2001)
        self._addDocument("urn:5", 1, 2009)  # result 2x
        # expected: "urn:2", "urn:3" and "urn:5" in no particular order
        topDocsCollector = TopScoreDocSuperCollector(100, True)
        dedup = DeDupFilterSuperCollector("__isformatof__", "__sort__", topDocsCollector)
        self.lucene.search(query=MatchAllDocsQuery(), collector=dedup)
        hits = topDocsCollector.topDocs(0)
        self.assertEquals(3, hits.totalHits)
        self.assertEquals(3, len(hits.scoreDocs))

        rawDocIds = []
        for scoreDoc in hits.scoreDocs:
            rawDocIds.append(scoreDoc.doc)
        netDocIds = []
        for rawDocId in rawDocIds:
            netDocIds.append(dedup.keyForDocId(rawDocId).docId)

        identifiers = set()
        for netDocId in netDocIds:
            identifiers.add(self.lucene._index.getDocument(netDocId).get(IDFIELD))
        self.assertEquals(set(["urn:2", "urn:3", "urn:5"]), identifiers)

        counts = [dedup.keyForDocId(netDocId).count for netDocId in netDocIds]
        counts.sort()
        self.assertEquals([1, 2, 2], counts)

    def testSilentyYieldsWrongResultWhenFieldNameDoesNotMatch(self):
        self._addDocument("urn:1", 2)
        topDocsCollector = TopScoreDocSuperCollector(100, True)
        dedup = DeDupFilterSuperCollector("__wrong_field__", "__sort__", topDocsCollector)
        self.lucene.search(query=MatchAllDocsQuery(), collector=dedup)
        totalHits = topDocsCollector.topDocs(0).totalHits
        self.assertEquals(1, totalHits)

    def testShouldAddResultsWithoutIsFormatOf(self):
        # Eleven documents: urn:1 and urn:3 share __isformatof__ = 2, the
        # remaining nine carry no __isformatof__ field at all.
        documents = [
            ("urn:1", 2),
            ("urn:2", None),
            ("urn:3", 2),
            ("urn:4", None),
            ("urn:5", None),
            ("urn:6", None),
            ("urn:7", None),
            ("urn:8", None),
            ("urn:9", None),
            ("urn:A", None),
            ("urn:B", None),  # trigger a merge
        ]
        for identifier, isformatof in documents:
            self._addDocument(identifier, isformatof)
        topDocsCollector = TopScoreDocSuperCollector(100, True)
        dedup = DeDupFilterSuperCollector("__isformatof__", "__sort__", topDocsCollector)
        self.lucene.search(query=MatchAllDocsQuery(), collector=dedup)
        totalHits = topDocsCollector.topDocs(0).totalHits
        self.assertEquals(10, totalHits)