def testScore(self):
    """Score is proportional to the query boost when TermFrequencySimilarity is used.

    Indexes one document containing 100 occurrences of 'x', then queries it
    twice: once unboosted (score ~0.1) and once with boost 10 (score ~1).
    """
    fakeReactor = CallTrace('reactor')
    luceneSettings = LuceneSettings(commitCount=1, similarity=TermFrequencySimilarity(), verbose=False)
    index = Lucene(join(self.tempdir, 'lucene'), reactor=fakeReactor, settings=luceneSettings)
    doc = Document()
    doc.add(TextField('field', 'x '*100, Field.Store.NO))
    returnValueFromGenerator(index.addDocument(identifier="identifier", document=doc))
    query = TermQuery(Term("field", 'x'))
    unboosted = returnValueFromGenerator(index.executeQuery(query))
    self.assertAlmostEqual(0.1, unboosted.hits[0].score)
    query.setBoost(10.0)
    boosted = returnValueFromGenerator(index.executeQuery(query))
    self.assertAlmostEqual(1, boosted.hits[0].score)
class DeDupFilterCollectorTest(SeecrTestCase):
    """Tests for DeDupFilterSuperCollector: hits sharing the same
    "__isformatof__" doc-value are collapsed into one result, and the
    collector keeps a per-key duplicate count.

    NOTE(review): an identical DeDupFilterCollectorTest class is defined
    again near the end of this file; at import time that later definition
    shadows this one, so these tests may not actually run — verify.
    """

    def setUp(self):
        super(DeDupFilterCollectorTest, self).setUp()
        self._reactor = CallTrace('reactor')
        # commitCount=1: every addDocument is committed (visible) immediately.
        settings = LuceneSettings(commitCount=1, verbose=False)
        self.lucene = Lucene(self.tempdir, reactor=self._reactor, settings=settings)

    def tearDown(self):
        self.lucene.close()
        super(DeDupFilterCollectorTest, self).tearDown()

    def testCollectorTransparentlyDelegatesToNextCollector(self):
        # A single document must reach the wrapped collector unchanged.
        self._addDocument("urn:1", 2)
        tc = TopScoreDocSuperCollector(100, True)
        c = DeDupFilterSuperCollector("__isformatof__", "__sort__", tc)
        self.lucene.search(query=MatchAllDocsQuery(), collector=c)
        self.assertEquals(1, tc.topDocs(0).totalHits)

    def _addDocument(self, identifier, isformatof, sort=None):
        # Helper: index one document with optional numeric doc-values used
        # by the de-dup collector; falsy values omit the field entirely.
        doc = Document()
        if isformatof:
            doc.add(NumericDocValuesField("__isformatof__", long(isformatof)))
        if sort:
            doc.add(NumericDocValuesField("__sort__", long(sort)))
        consume(self.lucene.addDocument(identifier, doc))
        self.lucene.commit()  # Explicitly, not required: since commitCount=1.

    def testCollectorFiltersTwoSimilar(self):
        # Two docs share isformatof=2; only one survives, its key points at
        # the doc with the highest "__sort__" value ("urn:2"), count == 2.
        self._addDocument("urn:1", 2, 1)
        self._addDocument("urn:2", 2, 2)
        tc = TopScoreDocSuperCollector(100, True)
        c = DeDupFilterSuperCollector("__isformatof__", "__sort__", tc)
        self.lucene.search(query=MatchAllDocsQuery(), collector=c)
        topDocsResult = tc.topDocs(0)
        self.assertEquals(1, topDocsResult.totalHits)
        self.assertEquals(1, len(topDocsResult.scoreDocs))
        docId = topDocsResult.scoreDocs[0].doc
        key = c.keyForDocId(docId)
        identifier = self.lucene._index.getDocument(key.getDocId()).get(IDFIELD)
        self.assertEquals('urn:2', identifier)
        self.assertEquals(2, key.count)

    def testCollectorFiltersTwoTimesTwoSimilarOneNot(self):
        self._addDocument("urn:1", 1, 2001)
        self._addDocument("urn:2", 3, 2009)   # result 2x
        self._addDocument("urn:3", 50, 2010)  # result 1x
        self._addDocument("urn:4", 3, 2001)
        self._addDocument("urn:5", 1, 2009)   # result 2x
        # expected: "urn:2", "urn:3" and "urn:5" in no particular order
        tc = TopScoreDocSuperCollector(100, True)
        c = DeDupFilterSuperCollector("__isformatof__", "__sort__", tc)
        self.lucene.search(query=MatchAllDocsQuery(), collector=c)
        topDocsResult = tc.topDocs(0)
        self.assertEquals(3, topDocsResult.totalHits)
        self.assertEquals(3, len(topDocsResult.scoreDocs))
        rawDocIds = [scoreDoc.doc for scoreDoc in topDocsResult.scoreDocs]
        # Map each raw hit to the representative docId recorded on its key.
        netDocIds = [c.keyForDocId(rawDocId).docId for rawDocId in rawDocIds]
        identifiers = set(self.lucene._index.getDocument(doc).get(IDFIELD) for doc in netDocIds)
        self.assertEquals(set(["urn:2", "urn:3", "urn:5"]), identifiers)
        self.assertEquals([1, 2, 2], list(sorted(c.keyForDocId(d).count for d in netDocIds)))

    def testSilentyYieldsWrongResultWhenFieldNameDoesNotMatch(self):
        # A key field that does not exist means nothing gets de-duplicated;
        # the collector silently passes everything through (hence the name).
        self._addDocument("urn:1", 2)
        tc = TopScoreDocSuperCollector(100, True)
        c = DeDupFilterSuperCollector("__wrong_field__", "__sort__", tc)
        self.lucene.search(query=MatchAllDocsQuery(), collector=c)
        self.assertEquals(1, tc.topDocs(0).totalHits)

    def testShouldAddResultsWithoutIsFormatOf(self):
        # Docs without the isformatof doc-value are never collapsed:
        # 11 docs, two of which share isformatof=2, yield 10 results.
        self._addDocument("urn:1", 2)
        self._addDocument("urn:2", None)
        self._addDocument("urn:3", 2)
        self._addDocument("urn:4", None)
        self._addDocument("urn:5", None)
        self._addDocument("urn:6", None)
        self._addDocument("urn:7", None)
        self._addDocument("urn:8", None)
        self._addDocument("urn:9", None)
        self._addDocument("urn:A", None)
        self._addDocument("urn:B", None)  # trigger a merge
        tc = TopScoreDocSuperCollector(100, True)
        c = DeDupFilterSuperCollector("__isformatof__", "__sort__", tc)
        self.lucene.search(query=MatchAllDocsQuery(), collector=c)
        self.assertEquals(10, tc.topDocs(0).totalHits)
class LuceneTest(SeecrTestCase):
    """Tests for the remote (HTTP) Lucene client: each operation should
    result in a POST/read against the lucene server.  The connection is
    replaced by mocks that record every request in self.post / self.read
    and answer with self.response.

    NOTE(review): LuceneTest is defined a second time further down in this
    file; that later definition shadows this one at import time, so these
    tests may not actually run — verify which copy is intended.
    """

    def setUp(self):
        SeecrTestCase.setUp(self)
        self.setUpLucene()

    def setUpLucene(self, **kwargs):
        # Build the client, then swap the connection's _post/read for
        # recording mocks.  The mocks are generator functions (dead `yield`)
        # that signal their "return value" by raising StopIteration, as the
        # weightless-style callers here expect.
        self._lucene = Lucene(host="localhost", port=1234, name='lucene', settings=LuceneSettings(), **kwargs)
        self.post = []
        self.response = ""
        connect = self._lucene._connect()
        def mockPost(data, path, **kwargs):
            self.post.append(dict(data=data, path=path))
            raise StopIteration(self.response)
            yield  # unreachable; makes this a generator function
        connect._post = mockPost
        self.read = []
        self.response = ""
        def mockRead(path, **kwargs):
            self.read.append(path)
            raise StopIteration(self.response)
            yield  # unreachable; makes this a generator function
        connect.read = mockRead
        self._lucene._connect = lambda: connect

    def testPostSettingsAddObserverInit(self):
        # observer_init pushes the default settings to the server.
        self.assertEqual([], self.post)
        self._lucene.observer_init()
        self.assertEqual(1, len(self.post))
        self.assertEqual('/lucene/settings/', self.post[0]['path'])
        self.assertEquals(DEFAULTS, loads(self.post[0]['data']))

    def testInitialize(self):
        # initialize() posts the same defaults as observer_init.
        self.assertEqual([], self.post)
        consume(self._lucene.initialize())
        self.assertEqual(1, len(self.post))
        self.assertEqual('/lucene/settings/', self.post[0]['path'])
        self.assertEquals(DEFAULTS, loads(self.post[0]['data']))

    def testAdd(self):
        registry = FieldRegistry()
        fields = [registry.createField("id", "id1")]
        consume(self._lucene.addDocument(identifier='id1', fields=fields))
        self.assertEqual(1, len(self.post))
        self.assertEqual('/lucene/update/?identifier=id1', self.post[0]['path'])
        self.assertEqual('[{"type": "TextField", "name": "id", "value": "id1"}]', self.post[0]['data'])

    def testAddWithoutIdentifier(self):
        # Without an identifier the querystring is empty but still present.
        registry = FieldRegistry()
        fields = [registry.createField("id", "id1")]
        consume(self._lucene.addDocument(fields=fields))
        self.assertEqual(1, len(self.post))
        self.assertEqual('/lucene/update/?', self.post[0]['path'])
        self.assertEqual('[{"type": "TextField", "name": "id", "value": "id1"}]', self.post[0]['data'])

    def testDelete(self):
        consume(self._lucene.delete(identifier='id1'))
        self.assertEqual(1, len(self.post))
        self.assertEqual('/lucene/delete/?identifier=id1', self.post[0]['path'])
        self.assertEqual(None, self.post[0]['data'])

    def testDeleteByQuery(self):
        # Delete by query posts the serialized lucene query as the body.
        query = QueryExpressionToLuceneQueryDict([], LuceneSettings()).convert(cqlToExpression("field=value"))
        consume(self._lucene.delete(luceneQuery=query))
        self.assertEqual(1, len(self.post))
        self.assertEqual('/lucene/delete/', self.post[0]['path'])
        self.assertEqual('{"query": {"term": {"field": "field", "value": "value"}, "type": "TermQuery"}}', self.post[0]['data'])

    def testExecuteQuery(self):
        # Round-trip: the request body must contain all query options and the
        # JSON response must be exposed as attributes on the response object.
        self.response = JsonDict({
            "total": 887,
            "queryTime": 6,
            "times": {"searchTime": 3},
            "hits": [{
                "id": "record:1", "score": 0.1234,
                "duplicateCount": {"__key__": 2},
                "duplicates": {"__grouping_key__": [{"id": 'record:1'}, {"id": 'record:2'}]}
            }],
            "drilldownData": [
                {"fieldname": "facet", "path": [], "terms": [{"term": "term", "count": 1}]}
            ],
            "suggestions": {"valeu": ["value"]}
        }).dumps()
        query = QueryExpressionToLuceneQueryDict([], LuceneSettings()).convert(cqlToExpression("field=value"))
        response = retval(self._lucene.executeQuery(
            luceneQuery=query, start=1, stop=5,
            facets=[dict(maxTerms=10, fieldname='facet')],
            sortKeys=[dict(sortBy='field', sortDescending=False)],
            suggestionRequest=dict(suggests=['valeu'], count=2, field='field1'),
            dedupField="__key__",
            clustering=True,
            storedFields=["field"]
        ))
        self.assertEqual(1, len(self.post))
        self.assertEqual('/lucene/query/', self.post[0]['path'])
        self.assertEqual({
            "start": 1, "stop": 5,
            "storedFields": ["field"],
            "query": {"term": {"field": "field", "value": "value"}, "type": "TermQuery"},
            "facets": [{"fieldname": "facet", "maxTerms": 10}],
            "sortKeys": [{"sortBy": "field", "sortDescending": False, "type": "String", 'missingValue': 'STRING_LAST'}],
            "suggestionRequest": dict(suggests=['valeu'], count=2, field='field1'),
            "dedupField": "__key__",
            "dedupSortField": None,
            "clustering": True,
        }, loads(self.post[0]['data']))
        self.assertEqual(887, response.total)
        self.assertEqual(6, response.queryTime)
        self.assertEqual({'searchTime': 3}, response.times)
        self.assertEqual(1, len(response.hits))
        self.assertEqual("record:1", response.hits[0].id)
        self.assertEqual(0.1234, response.hits[0].score)
        self.assertEqual(dict(__key__=2), response.hits[0].duplicateCount)
        self.assertEqual([
            {"fieldname": "facet", "path": [], "terms": [{"term": "term", "count": 1}]}
        ], response.drilldownData)
        self.assertEqual({'valeu': ['value']}, response.suggestions)

    def testPrefixSearch(self):
        # Results come back ordered by count, highest first.
        self.response = JsonList([["value0", 1], ["value1", 2]]).dumps()
        response = retval(self._lucene.prefixSearch(fieldname='field1', prefix='valu'))
        self.assertEquals(['value1', 'value0'], response.hits)
        response = retval(self._lucene.prefixSearch(fieldname='field1', prefix='valu', showCount=True))
        self.assertEquals([('value1', 2), ('value0', 1)], response.hits)

    def testNumDocs(self):
        self.response = "150"
        result = retval(self._lucene.numDocs())
        self.assertEqual(150, result)
        self.assertEqual([{'data': None, 'path': '/lucene/numDocs/'}], self.post)

    def testFieldnames(self):
        self.response = '["field1", "field2"]'
        result = retval(self._lucene.fieldnames())
        self.assertEqual(["field1", "field2"], result.hits)
        self.assertEqual([{"data": None, "path": "/lucene/fieldnames/"}], self.post)

    def testDrilldownFieldnames(self):
        # The path segments become dim=<first> plus repeated path= params.
        self.response = '["field1", "field2"]'
        result = retval(self._lucene.drilldownFieldnames())
        self.assertEqual(["field1", "field2"], result.hits)
        self.assertEqual([{"data": None, "path": "/lucene/drilldownFieldnames/?limit=50"}], self.post)
        result = retval(self._lucene.drilldownFieldnames(limit=1, path=['field']))
        self.assertEqual(["field1", "field2"], result.hits)
        self.assertEqual({"data": None, "path": "/lucene/drilldownFieldnames/?dim=field&limit=1"}, self.post[-1])
        result = retval(self._lucene.drilldownFieldnames(limit=1, path=['xyz', 'abc', 'field']))
        self.assertEqual(["field1", "field2"], result.hits)
        self.assertEqual({"data": None, "path": "/lucene/drilldownFieldnames/?dim=xyz&limit=1&path=abc&path=field"}, self.post[-1])

    def testUpdateSettings(self):
        # NOTE(review): unlike the other tests, the mocked response here is a
        # JsonDict object, not a dumps()'d string — presumably getSettings
        # tolerates that; verify against the implementation.
        self.response = JsonDict(numberOfConcurrentTasks=6, similarity="BM25(k1=1.2,b=0.75)", clustering=JsonDict(clusterMoreRecords=100, clusteringEps=0.4, clusteringMinPoints=1))
        settings = retval(self._lucene.getSettings())
        self.assertEqual(['/settings/'], self.read)
        self.assertEquals({'numberOfConcurrentTasks': 6, 'similarity': u'BM25(k1=1.2,b=0.75)', 'clustering': {'clusterMoreRecords': 100, 'clusteringEps': 0.4, 'clusteringMinPoints': 1}}, settings)
        clusterFields = [
            {"filterValue": None, "fieldname": "untokenized.dcterms:isFormatOf.uri", "weight": 0}
        ]
        self.response = ""
        consume(self._lucene.setSettings(similarity=dict(name="bm25", k1=1.0, b=2.0), numberOfConcurrentTasks=10, clustering=dict(clusterMoreRecords=200, clusteringEps=1.0, clusteringMinPoints=2, fields=clusterFields)))
        self.assertEqual(1, len(self.post))
        self.assertEqual('/lucene/settings/', self.post[0]['path'])
        self.assertEqual({
            "numberOfConcurrentTasks": 10,
            "similarity": dict(type="BM25Similarity", k1=1.0, b=2.0),
            "clustering": {
                "clusterMoreRecords": 200,
                "clusteringEps": 1.0,
                "clusteringMinPoints": 2,
                "fields": [
                    {"filterValue": None, "fieldname": "untokenized.dcterms:isFormatOf.uri", "weight": 0}
                ]
            }
        }, loads(self.post[0]['data']))
        # None-valued settings must be omitted from the posted body.
        consume(self._lucene.setSettings(numberOfConcurrentTasks=5, similarity=None, clustering=None))
        self.assertEqual(2, len(self.post))
        self.assertEqual('/lucene/settings/', self.post[1]['path'])
        self.assertEqual({
            "numberOfConcurrentTasks": 5,
        }, loads(self.post[1]['data']))

    def testSimilarDocs(self):
        self.response = JsonDict({
            "total": 887,
            "queryTime": 6,
            "times": {"searchTime": 3},
            "hits": [
                {"id": "record:1", "score": 0.1234},
                {"id": "record:2", "score": 0.1234},
            ],
        }).dumps()
        response = retval(self._lucene.similarDocuments(identifier='record:3'))
        self.assertEqual(887, response.total)
        self.assertEqual(2, len(response.hits))

    def testLuceneReadonly(self):
        # A readonly client neither posts settings nor allows mutations.
        self.setUpLucene(readonly=True)
        self._lucene.observer_init()
        self.assertEqual([], self.post)
        self.assertRaises(RuntimeError, lambda: consume(self._lucene.setSettings()))
        self.assertRaises(RuntimeError, lambda: consume(self._lucene.addDocument(fields=[])))
        self.assertRaises(RuntimeError, lambda: consume(self._lucene.delete('identifier')))

    def testLuceneServerHostPortDynamic(self):
        # Without host/port the client asks an observer ('luceneServer') for
        # the server address and issues the HTTP request itself.
        lucene = Lucene(name='lucene', settings=LuceneSettings(), readonly=True)
        def httprequest1_1Mock(**kwargs):
            raise StopIteration(parseResponse(HTTP_RESPONSE))
            yield  # unreachable; makes this a generator function
        observer = CallTrace(
            'observer',
            returnValues=dict(luceneServer=('example.org', 1234)),
            methods=dict(httprequest1_1=httprequest1_1Mock))
        lucene.addObserver(observer)
        query = QueryExpressionToLuceneQueryDict([], LuceneSettings()).convert(cqlToExpression("field=value"))
        response = retval(lucene.executeQuery(
            luceneQuery=query, start=1, stop=5,
        ))
        self.assertEquals(887, response.total)
        self.assertEquals(['luceneServer', 'httprequest1_1'], observer.calledMethodNames())
class LuceneTest(SeecrTestCase):
    """Tests for the remote (HTTP) Lucene client; requests are recorded in
    self.post / self.read by mocks and answered with self.response.

    NOTE(review): this redefines the LuceneTest class declared earlier in
    this file — this later definition is the one that takes effect at import
    time.  Unlike the earlier copy, the mocks here are attached directly to
    self._lucene._connect (as attributes), and there is no setUpLucene
    helper, no testDeleteByQuery, and no readonly/dynamic-server tests.
    Verify which copy is intended to survive.
    """

    def setUp(self):
        SeecrTestCase.setUp(self)
        self._lucene = Lucene(host="localhost", port=1234, name='lucene', settings=LuceneSettings())
        self.post = []
        self.response = ""
        # Generator-function mocks: the dead `yield` makes them generators;
        # the "return value" is signalled by raising StopIteration, as the
        # weightless-style callers expect.
        def mockPost(data, path, **kwargs):
            self.post.append(dict(data=data, path=path))
            raise StopIteration(self.response)
            yield  # unreachable
        self._lucene._connect._post = mockPost
        self.read = []
        self.response = ""
        def mockRead(path, **kwargs):
            self.read.append(path)
            raise StopIteration(self.response)
            yield  # unreachable
        self._lucene._connect.read = mockRead

    def testPostSettingsAddObserverInit(self):
        # observer_init pushes the default settings to the server.
        self.assertEqual([], self.post)
        self._lucene.observer_init()
        self.assertEqual(1, len(self.post))
        self.assertEqual('/lucene/settings/', self.post[0]['path'])
        self.assertEquals(DEFAULTS, loads(self.post[0]['data']))

    def testInitialize(self):
        # initialize() posts the same defaults as observer_init.
        self.assertEqual([], self.post)
        consume(self._lucene.initialize())
        self.assertEqual(1, len(self.post))
        self.assertEqual('/lucene/settings/', self.post[0]['path'])
        self.assertEquals(DEFAULTS, loads(self.post[0]['data']))

    def testAdd(self):
        registry = FieldRegistry()
        fields = [registry.createField("id", "id1")]
        consume(self._lucene.addDocument(identifier='id1', fields=fields))
        self.assertEqual(1, len(self.post))
        self.assertEqual('/lucene/update/?identifier=id1', self.post[0]['path'])
        self.assertEqual('[{"type": "TextField", "name": "id", "value": "id1"}]', self.post[0]['data'])

    def testAddWithoutIdentifier(self):
        # Without an identifier the querystring is empty but still present.
        registry = FieldRegistry()
        fields = [registry.createField("id", "id1")]
        consume(self._lucene.addDocument(fields=fields))
        self.assertEqual(1, len(self.post))
        self.assertEqual('/lucene/update/?', self.post[0]['path'])
        self.assertEqual('[{"type": "TextField", "name": "id", "value": "id1"}]', self.post[0]['data'])

    def testDelete(self):
        consume(self._lucene.delete(identifier='id1'))
        self.assertEqual(1, len(self.post))
        self.assertEqual('/lucene/delete/?identifier=id1', self.post[0]['path'])
        self.assertEqual(None, self.post[0]['data'])

    def testExecuteQuery(self):
        # Round-trip: the request body must contain all query options and the
        # JSON response must be exposed as attributes on the response object.
        self.response = JsonDict({
            "total": 887,
            "queryTime": 6,
            "times": {"searchTime": 3},
            "hits": [{
                "id": "record:1", "score": 0.1234,
                "duplicateCount": {"__key__": 2},
                "duplicates": {"__grouping_key__": [{"id": 'record:1'}, {"id": 'record:2'}]}
            }],
            "drilldownData": [
                {"fieldname": "facet", "path": [], "terms": [{"term": "term", "count": 1}]}
            ],
            "suggestions": {"valeu": ["value"]}
        }).dumps()
        query = QueryExpressionToLuceneQueryDict([], LuceneSettings()).convert(cqlToExpression("field=value"))
        response = retval(self._lucene.executeQuery(
            luceneQuery=query, start=1, stop=5,
            facets=[dict(maxTerms=10, fieldname='facet')],
            sortKeys=[dict(sortBy='field', sortDescending=False)],
            suggestionRequest=dict(suggests=['valeu'], count=2, field='field1'),
            dedupField="__key__",
            clustering=True,
            storedFields=["field"]
        ))
        self.assertEqual(1, len(self.post))
        self.assertEqual('/lucene/query/', self.post[0]['path'])
        self.assertEqual({
            "start": 1, "stop": 5,
            "storedFields": ["field"],
            "query": {"term": {"field": "field", "value": "value"}, "type": "TermQuery"},
            "facets": [{"fieldname": "facet", "maxTerms": 10}],
            "sortKeys": [{"sortBy": "field", "sortDescending": False, "type": "String", 'missingValue': 'STRING_LAST'}],
            "suggestionRequest": dict(suggests=['valeu'], count=2, field='field1'),
            "dedupField": "__key__",
            "dedupSortField": None,
            "clustering": True,
        }, loads(self.post[0]['data']))
        self.assertEqual(887, response.total)
        self.assertEqual(6, response.queryTime)
        self.assertEqual({'searchTime': 3}, response.times)
        self.assertEqual(1, len(response.hits))
        self.assertEqual("record:1", response.hits[0].id)
        self.assertEqual(0.1234, response.hits[0].score)
        self.assertEqual(dict(__key__=2), response.hits[0].duplicateCount)
        self.assertEqual([
            {"fieldname": "facet", "path": [], "terms": [{"term": "term", "count": 1}]}
        ], response.drilldownData)
        self.assertEqual({'valeu': ['value']}, response.suggestions)

    def testPrefixSearch(self):
        # Results come back ordered by count, highest first.
        self.response = JsonList([["value0", 1], ["value1", 2]]).dumps()
        response = retval(self._lucene.prefixSearch(fieldname='field1', prefix='valu'))
        self.assertEquals(['value1', 'value0'], response.hits)
        response = retval(self._lucene.prefixSearch(fieldname='field1', prefix='valu', showCount=True))
        self.assertEquals([('value1', 2), ('value0', 1)], response.hits)

    def testNumDocs(self):
        self.response = "150"
        result = retval(self._lucene.numDocs())
        self.assertEqual(150, result)
        self.assertEqual([{'data': None, 'path': '/lucene/numDocs/'}], self.post)

    def testFieldnames(self):
        self.response = '["field1", "field2"]'
        result = retval(self._lucene.fieldnames())
        self.assertEqual(["field1", "field2"], result.hits)
        self.assertEqual([{"data": None, "path": "/lucene/fieldnames/"}], self.post)

    def testDrilldownFieldnames(self):
        # The path segments become dim=<first> plus repeated path= params.
        self.response = '["field1", "field2"]'
        result = retval(self._lucene.drilldownFieldnames())
        self.assertEqual(["field1", "field2"], result.hits)
        self.assertEqual([{"data": None, "path": "/lucene/drilldownFieldnames/?limit=50"}], self.post)
        result = retval(self._lucene.drilldownFieldnames(limit=1, path=['field']))
        self.assertEqual(["field1", "field2"], result.hits)
        self.assertEqual({"data": None, "path": "/lucene/drilldownFieldnames/?dim=field&limit=1"}, self.post[-1])
        result = retval(self._lucene.drilldownFieldnames(limit=1, path=['xyz', 'abc', 'field']))
        self.assertEqual(["field1", "field2"], result.hits)
        self.assertEqual({"data": None, "path": "/lucene/drilldownFieldnames/?dim=xyz&limit=1&path=abc&path=field"}, self.post[-1])

    def testUpdateSettings(self):
        # NOTE(review): unlike the other tests, the mocked response here is a
        # JsonDict object, not a dumps()'d string — presumably getSettings
        # tolerates that; verify against the implementation.
        self.response = JsonDict(numberOfConcurrentTasks=6, similarity="BM25(k1=1.2,b=0.75)", clustering=JsonDict(clusterMoreRecords=100, clusteringEps=0.4, clusteringMinPoints=1))
        settings = retval(self._lucene.getSettings())
        self.assertEqual(['/settings/'], self.read)
        self.assertEquals({'numberOfConcurrentTasks': 6, 'similarity': u'BM25(k1=1.2,b=0.75)', 'clustering': {'clusterMoreRecords': 100, 'clusteringEps': 0.4, 'clusteringMinPoints': 1}}, settings)
        clusterFields = [
            {"filterValue": None, "fieldname": "untokenized.dcterms:isFormatOf.uri", "weight": 0}
        ]
        self.response = ""
        consume(self._lucene.setSettings(similarity=dict(name="bm25", k1=1.0, b=2.0), numberOfConcurrentTasks=10, clustering=dict(clusterMoreRecords=200, clusteringEps=1.0, clusteringMinPoints=2, fields=clusterFields)))
        self.assertEqual(1, len(self.post))
        self.assertEqual('/lucene/settings/', self.post[0]['path'])
        self.assertEqual({
            "numberOfConcurrentTasks": 10,
            "similarity": dict(type="BM25Similarity", k1=1.0, b=2.0),
            "clustering": {
                "clusterMoreRecords": 200,
                "clusteringEps": 1.0,
                "clusteringMinPoints": 2,
                "fields": [
                    {"filterValue": None, "fieldname": "untokenized.dcterms:isFormatOf.uri", "weight": 0}
                ]
            }
        }, loads(self.post[0]['data']))
        # None-valued settings must be omitted from the posted body.
        consume(self._lucene.setSettings(numberOfConcurrentTasks=5, similarity=None, clustering=None))
        self.assertEqual(2, len(self.post))
        self.assertEqual('/lucene/settings/', self.post[1]['path'])
        self.assertEqual({
            "numberOfConcurrentTasks": 5,
        }, loads(self.post[1]['data']))

    def testSimilarDocs(self):
        self.response = JsonDict({
            "total": 887,
            "queryTime": 6,
            "times": {"searchTime": 3},
            "hits": [
                {"id": "record:1", "score": 0.1234},
                {"id": "record:2", "score": 0.1234},
            ],
        }).dumps()
        response = retval(self._lucene.similarDocuments(identifier='record:3'))
        self.assertEqual(887, response.total)
        self.assertEqual(2, len(response.hits))
class DeDupFilterCollectorTest(SeecrTestCase):
    """Tests for DeDupFilterSuperCollector: hits sharing the same
    "__isformatof__" doc-value are collapsed into one result, and the
    collector keeps a per-key duplicate count.

    NOTE(review): this redefines the DeDupFilterCollectorTest class declared
    earlier in this file; this later definition shadows the earlier one at
    import time.  The two copies look identical — one is presumably a
    duplicate that should be removed; verify.
    """

    def setUp(self):
        super(DeDupFilterCollectorTest, self).setUp()
        self._reactor = CallTrace('reactor')
        # commitCount=1: every addDocument is committed (visible) immediately.
        settings = LuceneSettings(commitCount=1, verbose=False)
        self.lucene = Lucene(self.tempdir, reactor=self._reactor, settings=settings)

    def tearDown(self):
        self.lucene.close()
        super(DeDupFilterCollectorTest, self).tearDown()

    def testCollectorTransparentlyDelegatesToNextCollector(self):
        # A single document must reach the wrapped collector unchanged.
        self._addDocument("urn:1", 2)
        tc = TopScoreDocSuperCollector(100, True)
        c = DeDupFilterSuperCollector("__isformatof__", "__sort__", tc)
        self.lucene.search(query=MatchAllDocsQuery(), collector=c)
        self.assertEquals(1, tc.topDocs(0).totalHits)

    def _addDocument(self, identifier, isformatof, sort=None):
        # Helper: index one document with optional numeric doc-values used
        # by the de-dup collector; falsy values omit the field entirely.
        doc = Document()
        if isformatof:
            doc.add(NumericDocValuesField("__isformatof__", long(isformatof)))
        if sort:
            doc.add(NumericDocValuesField("__sort__", long(sort)))
        consume(self.lucene.addDocument(identifier, doc))
        self.lucene.commit()  # Explicitly, not required: since commitCount=1.

    def testCollectorFiltersTwoSimilar(self):
        # Two docs share isformatof=2; only one survives, its key points at
        # the doc with the highest "__sort__" value ("urn:2"), count == 2.
        self._addDocument("urn:1", 2, 1)
        self._addDocument("urn:2", 2, 2)
        tc = TopScoreDocSuperCollector(100, True)
        c = DeDupFilterSuperCollector("__isformatof__", "__sort__", tc)
        self.lucene.search(query=MatchAllDocsQuery(), collector=c)
        topDocsResult = tc.topDocs(0)
        self.assertEquals(1, topDocsResult.totalHits)
        self.assertEquals(1, len(topDocsResult.scoreDocs))
        docId = topDocsResult.scoreDocs[0].doc
        key = c.keyForDocId(docId)
        identifier = self.lucene._index.getDocument(key.getDocId()).get(IDFIELD)
        self.assertEquals('urn:2', identifier)
        self.assertEquals(2, key.count)

    def testCollectorFiltersTwoTimesTwoSimilarOneNot(self):
        self._addDocument("urn:1", 1, 2001)
        self._addDocument("urn:2", 3, 2009)   # result 2x
        self._addDocument("urn:3", 50, 2010)  # result 1x
        self._addDocument("urn:4", 3, 2001)
        self._addDocument("urn:5", 1, 2009)   # result 2x
        # expected: "urn:2", "urn:3" and "urn:5" in no particular order
        tc = TopScoreDocSuperCollector(100, True)
        c = DeDupFilterSuperCollector("__isformatof__", "__sort__", tc)
        self.lucene.search(query=MatchAllDocsQuery(), collector=c)
        topDocsResult = tc.topDocs(0)
        self.assertEquals(3, topDocsResult.totalHits)
        self.assertEquals(3, len(topDocsResult.scoreDocs))
        rawDocIds = [scoreDoc.doc for scoreDoc in topDocsResult.scoreDocs]
        # Map each raw hit to the representative docId recorded on its key.
        netDocIds = [c.keyForDocId(rawDocId).docId for rawDocId in rawDocIds]
        identifiers = set(self.lucene._index.getDocument(doc).get(IDFIELD) for doc in netDocIds)
        self.assertEquals(set(["urn:2", "urn:3", "urn:5"]), identifiers)
        self.assertEquals([1,2,2], list(sorted(c.keyForDocId(d).count for d in netDocIds)))

    def testSilentyYieldsWrongResultWhenFieldNameDoesNotMatch(self):
        # A key field that does not exist means nothing gets de-duplicated;
        # the collector silently passes everything through (hence the name).
        self._addDocument("urn:1", 2)
        tc = TopScoreDocSuperCollector(100, True)
        c = DeDupFilterSuperCollector("__wrong_field__", "__sort__", tc)
        self.lucene.search(query=MatchAllDocsQuery(), collector=c)
        self.assertEquals(1, tc.topDocs(0).totalHits)

    def testShouldAddResultsWithoutIsFormatOf(self):
        # Docs without the isformatof doc-value are never collapsed:
        # 11 docs, two of which share isformatof=2, yield 10 results.
        self._addDocument("urn:1", 2)
        self._addDocument("urn:2", None)
        self._addDocument("urn:3", 2)
        self._addDocument("urn:4", None)
        self._addDocument("urn:5", None)
        self._addDocument("urn:6", None)
        self._addDocument("urn:7", None)
        self._addDocument("urn:8", None)
        self._addDocument("urn:9", None)
        self._addDocument("urn:A", None)
        self._addDocument("urn:B", None)  # trigger a merge
        tc = TopScoreDocSuperCollector(100, True)
        c = DeDupFilterSuperCollector("__isformatof__", "__sort__", tc)
        self.lucene.search(query=MatchAllDocsQuery(), collector=c)
        self.assertEquals(10, tc.topDocs(0).totalHits)