Example No. 1
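Scores a query against a document indexed with TermFrequencySimilarity and checks that boosting the query scales the hit score.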
    def testScore(self):
        reactor = CallTrace('reactor')
        settings = LuceneSettings(commitCount=1, similarity=TermFrequencySimilarity(), verbose=False)
        lucene = Lucene(join(self.tempdir, 'lucene'), reactor=reactor, settings=settings)
        document = Document()
        document.add(TextField('field', 'x '*100, Field.Store.NO))
        returnValueFromGenerator(lucene.addDocument(identifier="identifier", document=document))

        q = TermQuery(Term("field", 'x'))
        result = returnValueFromGenerator(lucene.executeQuery(q))
        self.assertAlmostEqual(0.1, result.hits[0].score)

        q.setBoost(10.0)
        result = returnValueFromGenerator(lucene.executeQuery(q))
        self.assertAlmostEqual(1, result.hits[0].score)
Example No. 2
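Exercises DeDupFilterSuperCollector: documents that share the same __isformatof__ value collapse into a single hit, the document with the highest __sort__ value is kept, and documents without the field pass through unchanged.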
class DeDupFilterCollectorTest(SeecrTestCase):
    def setUp(self):
        super(DeDupFilterCollectorTest, self).setUp()
        self._reactor = CallTrace('reactor')
        settings = LuceneSettings(commitCount=1, verbose=False)
        self.lucene = Lucene(self.tempdir,
                             reactor=self._reactor,
                             settings=settings)

    def tearDown(self):
        self.lucene.close()
        super(DeDupFilterCollectorTest, self).tearDown()

    def testCollectorTransparentlyDelegatesToNextCollector(self):
        self._addDocument("urn:1", 2)
        tc = TopScoreDocSuperCollector(100, True)
        c = DeDupFilterSuperCollector("__isformatof__", "__sort__", tc)
        self.lucene.search(query=MatchAllDocsQuery(), collector=c)
        self.assertEquals(1, tc.topDocs(0).totalHits)

    def _addDocument(self, identifier, isformatof, sort=None):
        doc = Document()
        if isformatof:
            doc.add(NumericDocValuesField("__isformatof__", long(isformatof)))
        if sort:
            doc.add(NumericDocValuesField("__sort__", long(sort)))
        consume(self.lucene.addDocument(identifier, doc))
        self.lucene.commit()  # Explicit commit; not strictly needed since commitCount=1.

    def testCollectorFiltersTwoSimilar(self):
        self._addDocument("urn:1", 2, 1)
        self._addDocument("urn:2", 2, 2)
        tc = TopScoreDocSuperCollector(100, True)
        c = DeDupFilterSuperCollector("__isformatof__", "__sort__", tc)
        self.lucene.search(query=MatchAllDocsQuery(), collector=c)
        topDocsResult = tc.topDocs(0)
        self.assertEquals(1, topDocsResult.totalHits)
        self.assertEquals(1, len(topDocsResult.scoreDocs))

        docId = topDocsResult.scoreDocs[0].doc
        key = c.keyForDocId(docId)
        identifier = self.lucene._index.getDocument(
            key.getDocId()).get(IDFIELD)
        self.assertEquals('urn:2', identifier)
        self.assertEquals(2, key.count)

    def testCollectorFiltersTwoTimesTwoSimilarOneNot(self):
        self._addDocument("urn:1", 1, 2001)
        self._addDocument("urn:2", 3, 2009)  # result 2x
        self._addDocument("urn:3", 50, 2010)  # result 1x
        self._addDocument("urn:4", 3, 2001)
        self._addDocument("urn:5", 1, 2009)  # result 2x
        # expected: "urn:2", "urn:3" and "urn:5" in no particular order
        tc = TopScoreDocSuperCollector(100, True)
        c = DeDupFilterSuperCollector("__isformatof__", "__sort__", tc)
        self.lucene.search(query=MatchAllDocsQuery(), collector=c)
        topDocsResult = tc.topDocs(0)
        self.assertEquals(3, topDocsResult.totalHits)
        self.assertEquals(3, len(topDocsResult.scoreDocs))
        rawDocIds = [scoreDoc.doc for scoreDoc in topDocsResult.scoreDocs]
        netDocIds = [c.keyForDocId(rawDocId).docId for rawDocId in rawDocIds]
        identifiers = set(
            self.lucene._index.getDocument(doc).get(IDFIELD)
            for doc in netDocIds)
        self.assertEquals(set(["urn:2", "urn:3", "urn:5"]), identifiers)
        self.assertEquals(
            [1, 2, 2], list(sorted(c.keyForDocId(d).count for d in netDocIds)))

    def testSilentyYieldsWrongResultWhenFieldNameDoesNotMatch(self):
        self._addDocument("urn:1", 2)
        tc = TopScoreDocSuperCollector(100, True)
        c = DeDupFilterSuperCollector("__wrong_field__", "__sort__", tc)
        self.lucene.search(query=MatchAllDocsQuery(), collector=c)
        self.assertEquals(1, tc.topDocs(0).totalHits)

    def testShouldAddResultsWithoutIsFormatOf(self):
        self._addDocument("urn:1", 2)
        self._addDocument("urn:2", None)
        self._addDocument("urn:3", 2)
        self._addDocument("urn:4", None)
        self._addDocument("urn:5", None)
        self._addDocument("urn:6", None)
        self._addDocument("urn:7", None)
        self._addDocument("urn:8", None)
        self._addDocument("urn:9", None)
        self._addDocument("urn:A", None)
        self._addDocument("urn:B", None)  # trigger a merge
        tc = TopScoreDocSuperCollector(100, True)
        c = DeDupFilterSuperCollector("__isformatof__", "__sort__", tc)
        self.lucene.search(query=MatchAllDocsQuery(), collector=c)
        self.assertEquals(10, tc.topDocs(0).totalHits)
Example No. 3
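Tests the HTTP client variant of Lucene. The _connect() object is replaced with mocked _post and read generators so that each request's path and payload can be asserted, covering settings, document updates, deletes, queries, prefix search, fieldnames and similar-document requests.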
class LuceneTest(SeecrTestCase):
    def setUp(self):
        SeecrTestCase.setUp(self)
        self.setUpLucene()

    def setUpLucene(self, **kwargs):
        self._lucene = Lucene(host="localhost", port=1234, name='lucene', settings=LuceneSettings(), **kwargs)
        self.post = []
        self.response = ""
        connect = self._lucene._connect()
        def mockPost(data, path, **kwargs):
            self.post.append(dict(data=data, path=path))
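            # The return value is delivered by raising StopIteration (Python 2
            # generator convention); the unreachable yield below makes this a generator.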
            raise StopIteration(self.response)
            yield
        connect._post = mockPost

        self.read = []
        self.response = ""
        def mockRead(path, **kwargs):
            self.read.append(path)
            raise StopIteration(self.response)
            yield
        connect.read = mockRead
        self._lucene._connect = lambda: connect

    def testPostSettingsAddObserverInit(self):
        self.assertEqual([], self.post)
        self._lucene.observer_init()
        self.assertEqual(1, len(self.post))
        self.assertEqual('/lucene/settings/', self.post[0]['path'])
        self.assertEquals(DEFAULTS, loads(self.post[0]['data']))

    def testInitialize(self):
        self.assertEqual([], self.post)
        consume(self._lucene.initialize())
        self.assertEqual(1, len(self.post))
        self.assertEqual('/lucene/settings/', self.post[0]['path'])
        self.assertEquals(DEFAULTS, loads(self.post[0]['data']))

    def testAdd(self):
        registry = FieldRegistry()
        fields = [registry.createField("id", "id1")]
        consume(self._lucene.addDocument(identifier='id1', fields=fields))
        self.assertEqual(1, len(self.post))
        self.assertEqual('/lucene/update/?identifier=id1', self.post[0]['path'])
        self.assertEqual('[{"type": "TextField", "name": "id", "value": "id1"}]', self.post[0]['data'])

    def testAddWithoutIdentifier(self):
        registry = FieldRegistry()
        fields = [registry.createField("id", "id1")]
        consume(self._lucene.addDocument(fields=fields))
        self.assertEqual(1, len(self.post))
        self.assertEqual('/lucene/update/?', self.post[0]['path'])
        self.assertEqual('[{"type": "TextField", "name": "id", "value": "id1"}]', self.post[0]['data'])

    def testDelete(self):
        consume(self._lucene.delete(identifier='id1'))
        self.assertEqual(1, len(self.post))
        self.assertEqual('/lucene/delete/?identifier=id1', self.post[0]['path'])
        self.assertEqual(None, self.post[0]['data'])

    def testDeleteByQuery(self):
        query = QueryExpressionToLuceneQueryDict([], LuceneSettings()).convert(cqlToExpression("field=value"))
        consume(self._lucene.delete(luceneQuery=query))
        self.assertEqual(1, len(self.post))
        self.assertEqual('/lucene/delete/', self.post[0]['path'])
        self.assertEqual('{"query": {"term": {"field": "field", "value": "value"}, "type": "TermQuery"}}', self.post[0]['data'])

    def testExecuteQuery(self):
        self.response = JsonDict({
                "total": 887,
                "queryTime": 6,
                "times": {"searchTime": 3},
                "hits": [{
                        "id": "record:1", "score": 0.1234,
                        "duplicateCount": {"__key__": 2},
                        "duplicates": {"__grouping_key__": [{"id": 'record:1'}, {"id": 'record:2'}]}
                    }],
                "drilldownData": [
                    {"fieldname": "facet", "path": [], "terms": [{"term": "term", "count": 1}]}
                ],
                "suggestions": {
                    "valeu": ["value"]
                }
            }).dumps()
        query = QueryExpressionToLuceneQueryDict([], LuceneSettings()).convert(cqlToExpression("field=value"))
        response = retval(self._lucene.executeQuery(
                    luceneQuery=query, start=1, stop=5,
                    facets=[dict(maxTerms=10, fieldname='facet')],
                    sortKeys=[dict(sortBy='field', sortDescending=False)],
                    suggestionRequest=dict(suggests=['valeu'], count=2, field='field1'),
                    dedupField="__key__",
                    clustering=True,
                    storedFields=["field"]
                ))
        self.assertEqual(1, len(self.post))
        self.assertEqual('/lucene/query/', self.post[0]['path'])
        self.assertEqual({
                    "start": 1, "stop": 5,
                    "storedFields": ["field"],
                    "query": {"term": {"field": "field", "value": "value"}, "type": "TermQuery"},
                    "facets": [{"fieldname": "facet", "maxTerms": 10}],
                    "sortKeys": [{"sortBy": "field", "sortDescending": False, "type": "String", 'missingValue': 'STRING_LAST'}],
                    "suggestionRequest": dict(suggests=['valeu'], count=2, field='field1'),
                    "dedupField": "__key__",
                    "dedupSortField": None,
                    "clustering": True,
                }, loads(self.post[0]['data']))
        self.assertEqual(887, response.total)
        self.assertEqual(6, response.queryTime)
        self.assertEqual({'searchTime': 3}, response.times)
        self.assertEqual(1, len(response.hits))
        self.assertEqual("record:1", response.hits[0].id)
        self.assertEqual(0.1234, response.hits[0].score)
        self.assertEqual(dict(__key__=2), response.hits[0].duplicateCount)
        self.assertEqual([
                {"fieldname": "facet", "path": [], "terms": [{"term": "term", "count": 1}]}
            ], response.drilldownData)
        self.assertEqual({'valeu': ['value']}, response.suggestions)

    def testPrefixSearch(self):
        self.response = JsonList([["value0", 1], ["value1", 2]]).dumps()
        response = retval(self._lucene.prefixSearch(fieldname='field1', prefix='valu'))
        self.assertEquals(['value1', 'value0'], response.hits)

        response = retval(self._lucene.prefixSearch(fieldname='field1', prefix='valu', showCount=True))
        self.assertEquals([('value1', 2), ('value0', 1)], response.hits)

    def testNumDocs(self):
        self.response = "150"
        result = retval(self._lucene.numDocs())
        self.assertEqual(150, result)
        self.assertEqual([{'data': None, 'path': '/lucene/numDocs/'}], self.post)

    def testFieldnames(self):
        self.response = '["field1", "field2"]'
        result = retval(self._lucene.fieldnames())
        self.assertEqual(["field1", "field2"], result.hits)
        self.assertEqual([{"data": None, "path": "/lucene/fieldnames/"}], self.post)

    def testDrilldownFieldnames(self):
        self.response = '["field1", "field2"]'
        result = retval(self._lucene.drilldownFieldnames())
        self.assertEqual(["field1", "field2"], result.hits)
        self.assertEqual([{"data": None, "path": "/lucene/drilldownFieldnames/?limit=50"}], self.post)

        result = retval(self._lucene.drilldownFieldnames(limit=1, path=['field']))
        self.assertEqual(["field1", "field2"], result.hits)
        self.assertEqual({"data": None, "path": "/lucene/drilldownFieldnames/?dim=field&limit=1"}, self.post[-1])

        result = retval(self._lucene.drilldownFieldnames(limit=1, path=['xyz', 'abc', 'field']))
        self.assertEqual(["field1", "field2"], result.hits)
        self.assertEqual({"data": None, "path": "/lucene/drilldownFieldnames/?dim=xyz&limit=1&path=abc&path=field"}, self.post[-1])

    def testUpdateSettings(self):
        self.response = JsonDict(numberOfConcurrentTasks=6, similarity="BM25(k1=1.2,b=0.75)", clustering=JsonDict(clusterMoreRecords=100, clusteringEps=0.4, clusteringMinPoints=1))
        settings = retval(self._lucene.getSettings())
        self.assertEqual(['/settings/'], self.read)
        self.assertEquals({'numberOfConcurrentTasks': 6, 'similarity': u'BM25(k1=1.2,b=0.75)', 'clustering': {'clusterMoreRecords': 100, 'clusteringEps': 0.4, 'clusteringMinPoints': 1}}, settings)

        clusterFields = [
            {"filterValue": None, "fieldname": "untokenized.dcterms:isFormatOf.uri", "weight": 0}
        ]
        self.response = ""
        consume(self._lucene.setSettings(similarity=dict(name="bm25", k1=1.0, b=2.0), numberOfConcurrentTasks=10, clustering=dict(clusterMoreRecords=200, clusteringEps=1.0, clusteringMinPoints=2, fields=clusterFields)))
        self.assertEqual(1, len(self.post))
        self.assertEqual('/lucene/settings/', self.post[0]['path'])
        self.assertEqual({
                "numberOfConcurrentTasks": 10,
                "similarity": dict(type="BM25Similarity", k1=1.0, b=2.0),
                "clustering": {
                    "clusterMoreRecords": 200,
                    "clusteringEps": 1.0,
                    "clusteringMinPoints": 2,
                    "fields": [
                        {"filterValue": None, "fieldname": "untokenized.dcterms:isFormatOf.uri", "weight": 0}
                    ]
                }
            }, loads(self.post[0]['data']))

        consume(self._lucene.setSettings(numberOfConcurrentTasks=5, similarity=None, clustering=None))
        self.assertEqual(2, len(self.post))
        self.assertEqual('/lucene/settings/', self.post[1]['path'])
        self.assertEqual({
                "numberOfConcurrentTasks": 5,
            }, loads(self.post[1]['data']))

    def testSimilarDocs(self):
        self.response = JsonDict({
                "total": 887,
                "queryTime": 6,
                "times": {"searchTime": 3},
                "hits": [
                        {"id": "record:1", "score": 0.1234},
                        {"id": "record:2", "score": 0.1234},
                    ],
            }).dumps()
        response = retval(self._lucene.similarDocuments(identifier='record:3'))
        self.assertEqual(887, response.total)
        self.assertEqual(2, len(response.hits))

    def testLuceneReadonly(self):
        self.setUpLucene(readonly=True)
        self._lucene.observer_init()
        self.assertEqual([], self.post)
        self.assertRaises(RuntimeError, lambda: consume(self._lucene.setSettings()))
        self.assertRaises(RuntimeError, lambda: consume(self._lucene.addDocument(fields=[])))
        self.assertRaises(RuntimeError, lambda: consume(self._lucene.delete('identifier')))

    def testLuceneServerHostPortDynamic(self):
        lucene = Lucene(name='lucene', settings=LuceneSettings(), readonly=True)
        def httprequest1_1Mock(**kwargs):
            raise StopIteration(parseResponse(HTTP_RESPONSE))
            yield
        observer = CallTrace(
            'observer',
            returnValues=dict(luceneServer=('example.org', 1234)),
            methods=dict(httprequest1_1=httprequest1_1Mock))
        lucene.addObserver(observer)
        query = QueryExpressionToLuceneQueryDict([], LuceneSettings()).convert(cqlToExpression("field=value"))
        response = retval(lucene.executeQuery(
            luceneQuery=query, start=1, stop=5,
        ))
        self.assertEquals(887, response.total)
        self.assertEquals(['luceneServer', 'httprequest1_1'], observer.calledMethodNames())
Example No. 4
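A close variant of the previous example: the mock _post and read generators are attached directly to self._lucene._connect in setUp, and the same listing also includes the DeDupFilterCollectorTest from Example No. 2.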
class LuceneTest(SeecrTestCase):
    def setUp(self):
        SeecrTestCase.setUp(self)
        self._lucene = Lucene(host="localhost", port=1234, name='lucene', settings=LuceneSettings())
        self.post = []
        self.response = ""
        def mockPost(data, path, **kwargs):
            self.post.append(dict(data=data, path=path))
            raise StopIteration(self.response)
            yield
        self._lucene._connect._post = mockPost

        self.read = []
        self.response = ""
        def mockRead(path, **kwargs):
            self.read.append(path)
            raise StopIteration(self.response)
            yield
        self._lucene._connect.read = mockRead

    def testPostSettingsAddObserverInit(self):
        self.assertEqual([], self.post)
        self._lucene.observer_init()
        self.assertEqual(1, len(self.post))
        self.assertEqual('/lucene/settings/', self.post[0]['path'])
        self.assertEquals(DEFAULTS, loads(self.post[0]['data']))

    def testInitialize(self):
        self.assertEqual([], self.post)
        consume(self._lucene.initialize())
        self.assertEqual(1, len(self.post))
        self.assertEqual('/lucene/settings/', self.post[0]['path'])
        self.assertEquals(DEFAULTS, loads(self.post[0]['data']))

    def testAdd(self):
        registry = FieldRegistry()
        fields = [registry.createField("id", "id1")]
        consume(self._lucene.addDocument(identifier='id1', fields=fields))
        self.assertEqual(1, len(self.post))
        self.assertEqual('/lucene/update/?identifier=id1', self.post[0]['path'])
        self.assertEqual('[{"type": "TextField", "name": "id", "value": "id1"}]', self.post[0]['data'])

    def testAddWithoutIdentifier(self):
        registry = FieldRegistry()
        fields = [registry.createField("id", "id1")]
        consume(self._lucene.addDocument(fields=fields))
        self.assertEqual(1, len(self.post))
        self.assertEqual('/lucene/update/?', self.post[0]['path'])
        self.assertEqual('[{"type": "TextField", "name": "id", "value": "id1"}]', self.post[0]['data'])

    def testDelete(self):
        consume(self._lucene.delete(identifier='id1'))
        self.assertEqual(1, len(self.post))
        self.assertEqual('/lucene/delete/?identifier=id1', self.post[0]['path'])
        self.assertEqual(None, self.post[0]['data'])

    def testExecuteQuery(self):
        self.response = JsonDict({
                "total": 887,
                "queryTime": 6,
                "times": {"searchTime": 3},
                "hits": [{
                        "id": "record:1", "score": 0.1234,
                        "duplicateCount": {"__key__": 2},
                        "duplicates": {"__grouping_key__": [{"id": 'record:1'}, {"id": 'record:2'}]}
                    }],
                "drilldownData": [
                    {"fieldname": "facet", "path": [], "terms": [{"term": "term", "count": 1}]}
                ],
                "suggestions": {
                    "valeu": ["value"]
                }
            }).dumps()
        query = QueryExpressionToLuceneQueryDict([], LuceneSettings()).convert(cqlToExpression("field=value"))
        response = retval(self._lucene.executeQuery(
                    luceneQuery=query, start=1, stop=5,
                    facets=[dict(maxTerms=10, fieldname='facet')],
                    sortKeys=[dict(sortBy='field', sortDescending=False)],
                    suggestionRequest=dict(suggests=['valeu'], count=2, field='field1'),
                    dedupField="__key__",
                    clustering=True,
                    storedFields=["field"]
                ))
        self.assertEqual(1, len(self.post))
        self.assertEqual('/lucene/query/', self.post[0]['path'])
        self.assertEqual({
                    "start": 1, "stop": 5,
                    "storedFields": ["field"],
                    "query": {"term": {"field": "field", "value": "value"}, "type": "TermQuery"},
                    "facets": [{"fieldname": "facet", "maxTerms": 10}],
                    "sortKeys": [{"sortBy": "field", "sortDescending": False, "type": "String", 'missingValue': 'STRING_LAST'}],
                    "suggestionRequest": dict(suggests=['valeu'], count=2, field='field1'),
                    "dedupField": "__key__",
                    "dedupSortField": None,
                    "clustering": True,
                }, loads(self.post[0]['data']))
        self.assertEqual(887, response.total)
        self.assertEqual(6, response.queryTime)
        self.assertEqual({'searchTime': 3}, response.times)
        self.assertEqual(1, len(response.hits))
        self.assertEqual("record:1", response.hits[0].id)
        self.assertEqual(0.1234, response.hits[0].score)
        self.assertEqual(dict(__key__=2), response.hits[0].duplicateCount)
        self.assertEqual([
                {"fieldname": "facet", "path": [], "terms": [{"term": "term", "count": 1}]}
            ], response.drilldownData)
        self.assertEqual({'valeu': ['value']}, response.suggestions)

    def testPrefixSearch(self):
        self.response = JsonList([["value0", 1], ["value1", 2]]).dumps()
        response = retval(self._lucene.prefixSearch(fieldname='field1', prefix='valu'))
        self.assertEquals(['value1', 'value0'], response.hits)

        response = retval(self._lucene.prefixSearch(fieldname='field1', prefix='valu', showCount=True))
        self.assertEquals([('value1', 2), ('value0', 1)], response.hits)

    def testNumDocs(self):
        self.response = "150"
        result = retval(self._lucene.numDocs())
        self.assertEqual(150, result)
        self.assertEqual([{'data': None, 'path': '/lucene/numDocs/'}], self.post)

    def testFieldnames(self):
        self.response = '["field1", "field2"]'
        result = retval(self._lucene.fieldnames())
        self.assertEqual(["field1", "field2"], result.hits)
        self.assertEqual([{"data": None, "path": "/lucene/fieldnames/"}], self.post)

    def testDrilldownFieldnames(self):
        self.response = '["field1", "field2"]'
        result = retval(self._lucene.drilldownFieldnames())
        self.assertEqual(["field1", "field2"], result.hits)
        self.assertEqual([{"data": None, "path": "/lucene/drilldownFieldnames/?limit=50"}], self.post)

        result = retval(self._lucene.drilldownFieldnames(limit=1, path=['field']))
        self.assertEqual(["field1", "field2"], result.hits)
        self.assertEqual({"data": None, "path": "/lucene/drilldownFieldnames/?dim=field&limit=1"}, self.post[-1])

        result = retval(self._lucene.drilldownFieldnames(limit=1, path=['xyz', 'abc', 'field']))
        self.assertEqual(["field1", "field2"], result.hits)
        self.assertEqual({"data": None, "path": "/lucene/drilldownFieldnames/?dim=xyz&limit=1&path=abc&path=field"}, self.post[-1])

    def testUpdateSettings(self):
        self.response = JsonDict(numberOfConcurrentTasks=6, similarity="BM25(k1=1.2,b=0.75)", clustering=JsonDict(clusterMoreRecords=100, clusteringEps=0.4, clusteringMinPoints=1))
        settings = retval(self._lucene.getSettings())
        self.assertEqual(['/settings/'], self.read)
        self.assertEquals({'numberOfConcurrentTasks': 6, 'similarity': u'BM25(k1=1.2,b=0.75)', 'clustering': {'clusterMoreRecords': 100, 'clusteringEps': 0.4, 'clusteringMinPoints': 1}}, settings)

        clusterFields = [
            {"filterValue": None, "fieldname": "untokenized.dcterms:isFormatOf.uri", "weight": 0}
        ]
        self.response = ""
        consume(self._lucene.setSettings(similarity=dict(name="bm25", k1=1.0, b=2.0), numberOfConcurrentTasks=10, clustering=dict(clusterMoreRecords=200, clusteringEps=1.0, clusteringMinPoints=2, fields=clusterFields)))
        self.assertEqual(1, len(self.post))
        self.assertEqual('/lucene/settings/', self.post[0]['path'])
        self.assertEqual({
                "numberOfConcurrentTasks": 10,
                "similarity": dict(type="BM25Similarity", k1=1.0, b=2.0),
                "clustering": {
                    "clusterMoreRecords": 200,
                    "clusteringEps": 1.0,
                    "clusteringMinPoints": 2,
                    "fields": [
                        {"filterValue": None, "fieldname": "untokenized.dcterms:isFormatOf.uri", "weight": 0}
                    ]
                }
            }, loads(self.post[0]['data']))

        consume(self._lucene.setSettings(numberOfConcurrentTasks=5, similarity=None, clustering=None))
        self.assertEqual(2, len(self.post))
        self.assertEqual('/lucene/settings/', self.post[1]['path'])
        self.assertEqual({
                "numberOfConcurrentTasks": 5,
            }, loads(self.post[1]['data']))

    def testSimilarDocs(self):
        self.response = JsonDict({
                "total": 887,
                "queryTime": 6,
                "times": {"searchTime": 3},
                "hits": [
                        {"id": "record:1", "score": 0.1234},
                        {"id": "record:2", "score": 0.1234},
                    ],
            }).dumps()
        response = retval(self._lucene.similarDocuments(identifier='record:3'))
        self.assertEqual(887, response.total)
        self.assertEqual(2, len(response.hits))
class DeDupFilterCollectorTest(SeecrTestCase):
    def setUp(self):
        super(DeDupFilterCollectorTest, self).setUp()
        self._reactor = CallTrace('reactor')
        settings = LuceneSettings(commitCount=1, verbose=False)
        self.lucene = Lucene(self.tempdir, reactor=self._reactor, settings=settings)

    def tearDown(self):
        self.lucene.close()
        super(DeDupFilterCollectorTest, self).tearDown()

    def testCollectorTransparentlyDelegatesToNextCollector(self):
        self._addDocument("urn:1", 2)
        tc = TopScoreDocSuperCollector(100, True)
        c = DeDupFilterSuperCollector("__isformatof__", "__sort__", tc)
        self.lucene.search(query=MatchAllDocsQuery(), collector=c)
        self.assertEquals(1, tc.topDocs(0).totalHits)

    def _addDocument(self, identifier, isformatof, sort=None):
        doc = Document()
        if isformatof:
            doc.add(NumericDocValuesField("__isformatof__", long(isformatof)))
        if sort:
            doc.add(NumericDocValuesField("__sort__", long(sort)))
        consume(self.lucene.addDocument(identifier, doc))
        self.lucene.commit()  # Explicit commit; not strictly needed since commitCount=1.

    def testCollectorFiltersTwoSimilar(self):
        self._addDocument("urn:1", 2, 1)
        self._addDocument("urn:2", 2, 2)
        tc = TopScoreDocSuperCollector(100, True)
        c = DeDupFilterSuperCollector("__isformatof__", "__sort__", tc)
        self.lucene.search(query=MatchAllDocsQuery(), collector=c)
        topDocsResult = tc.topDocs(0)
        self.assertEquals(1, topDocsResult.totalHits)
        self.assertEquals(1, len(topDocsResult.scoreDocs))

        docId = topDocsResult.scoreDocs[0].doc
        key = c.keyForDocId(docId)
        identifier = self.lucene._index.getDocument(key.getDocId()).get(IDFIELD)
        self.assertEquals('urn:2', identifier)
        self.assertEquals(2, key.count)

    def testCollectorFiltersTwoTimesTwoSimilarOneNot(self):
        self._addDocument("urn:1",  1, 2001)
        self._addDocument("urn:2",  3, 2009) # result 2x
        self._addDocument("urn:3", 50, 2010) # result 1x
        self._addDocument("urn:4",  3, 2001)
        self._addDocument("urn:5",  1, 2009) # result 2x
        # expected: "urn:2", "urn:3" and "urn:5" in no particular order
        tc = TopScoreDocSuperCollector(100, True)
        c = DeDupFilterSuperCollector("__isformatof__", "__sort__", tc)
        self.lucene.search(query=MatchAllDocsQuery(), collector=c)
        topDocsResult = tc.topDocs(0)
        self.assertEquals(3, topDocsResult.totalHits)
        self.assertEquals(3, len(topDocsResult.scoreDocs))
        rawDocIds = [scoreDoc.doc for scoreDoc in topDocsResult.scoreDocs]
        netDocIds = [c.keyForDocId(rawDocId).docId for rawDocId in rawDocIds]
        identifiers = set(self.lucene._index.getDocument(doc).get(IDFIELD) for doc in netDocIds)
        self.assertEquals(set(["urn:2", "urn:3", "urn:5"]), identifiers)
        self.assertEquals([1,2,2], list(sorted(c.keyForDocId(d).count for d in netDocIds)))

    def testSilentyYieldsWrongResultWhenFieldNameDoesNotMatch(self):
        self._addDocument("urn:1", 2)
        tc = TopScoreDocSuperCollector(100, True)
        c = DeDupFilterSuperCollector("__wrong_field__", "__sort__", tc)
        self.lucene.search(query=MatchAllDocsQuery(), collector=c)
        self.assertEquals(1, tc.topDocs(0).totalHits)

    def testShouldAddResultsWithoutIsFormatOf(self):
        self._addDocument("urn:1", 2)
        self._addDocument("urn:2", None)
        self._addDocument("urn:3", 2)
        self._addDocument("urn:4", None)
        self._addDocument("urn:5", None)
        self._addDocument("urn:6", None)
        self._addDocument("urn:7", None)
        self._addDocument("urn:8", None)
        self._addDocument("urn:9", None)
        self._addDocument("urn:A", None)
        self._addDocument("urn:B", None) # trigger a merge
        tc = TopScoreDocSuperCollector(100, True)
        c = DeDupFilterSuperCollector("__isformatof__", "__sort__", tc)
        self.lucene.search(query=MatchAllDocsQuery(), collector=c)
        self.assertEquals(10, tc.topDocs(0).totalHits)