Example #1
0
    def testFacetAndTopsMultiCollector(self):
        """Combine a facet and a top-docs super collector via MultiSuperCollector.

        Indexes 99 documents spread over 10 facet values, then checks that one
        search fills both collectors: 99 total hits / 10 top docs, and facet
        counts of 10 per value (9 for 'value9', since 99 = 9*10 + 9).
        """
        index = Index(path=self.tempdir, settings=LuceneSettings())
        for n in xrange(99):
            doc = createDocument(
                fields=[("field1", str(n)), ("field2", str(n) * 1000)],
                facets=[("facet1", "value%s" % (n % 10))])
            doc = index._facetsConfig.build(index._taxoWriter, doc)
            index._indexWriter.addDocument(doc)
        index.commit()
        index.close()
        index = Index(path=self.tempdir, settings=LuceneSettings())

        facetCollector = FacetSuperCollector(
            index._indexAndTaxonomy.taxoReader,
            index._facetsConfig,
            index._ordinalsReader)
        topCollector = TopScoreDocSuperCollector(10, True)
        subCollectors = ArrayList().of_(SuperCollector)
        subCollectors.add(topCollector)
        subCollectors.add(facetCollector)
        multiCollector = MultiSuperCollector(subCollectors)
        index.search(MatchAllDocsQuery(), None, multiCollector)

        self.assertEquals(99, topCollector.topDocs(0).totalHits)
        self.assertEquals(10, len(topCollector.topDocs(0).scoreDocs))
        topChildren = facetCollector.getTopChildren(10, "facet1", [])

        expected = [('value0', 10), ('value1', 10), ('value2', 10),
                    ('value3', 10), ('value4', 10), ('value5', 10),
                    ('value6', 10), ('value7', 10), ('value8', 10),
                    ('value9', 9)]
        self.assertEquals(expected, [(labelValue.label, labelValue.value.intValue())
                                     for labelValue in topChildren.labelValues])
Example #2
0
 def testSearchTopField(self):
     """A sorted TopFieldSuperCollector returns the top window in name order.

     Three documents are committed one by one; a descending sort on 'name'
     with a window of 2 must report all 3 hits but only return docs '2', '3'.
     """
     index = Index(path=self.tempdir, settings=LuceneSettings())
     for identifier, name, price in [('1', "one", "aap noot mies"),
                                     ('2', "two", "aap vuur boom"),
                                     ('3', "three", "noot boom mies")]:
         index._indexWriter.addDocument(
             document(__id__=identifier, name=name, price=price))
         index.commit()
     index.close()
     index = Index(path=self.tempdir, settings=LuceneSettings())
     sort = Sort(SortField("name", SortField.Type.STRING, True))
     collector = TopFieldSuperCollector(sort, 2, True, False, True)
     index.search(MatchAllDocsQuery(), None, collector)
     topDocs = collector.topDocs(0)
     self.assertEquals(3, collector.getTotalHits())
     self.assertEquals(3, topDocs.totalHits)
     self.assertEquals(2, len(topDocs.scoreDocs))
     self.assertEquals(
         ['2', '3'],
         [index.getDocument(s.doc).get("__id__") for s in topDocs.scoreDocs])
Example #3
0
class Lucene(object):
    # The only facet 'sortBy' value this wrapper supports (see _facetResult).
    COUNT = 'count'
    # Whitelist checked against each facet spec's 'sortBy' key in _facetResult.
    SUPPORTED_SORTBY_VALUES = [COUNT]

    def __init__(self, path, reactor, settings, name=None, **kwargs):
        """Create a Lucene core over the index at 'path'.

        path: filesystem path of the index directory.
        reactor: weightless-style reactor used for commit timers.
        settings: LuceneSettings (multithreaded, fieldRegistry, readonly,
            commitTimeout, commitCount, verbose).
        name: optional core name; defaults to basename(path). When given,
            also exposed as observable_name for the observable framework.
        kwargs: passed through to Index().
        """
        self._reactor = reactor
        self.settings = settings
        self._multithreaded = settings.multithreaded
        self._fieldRegistry = settings.fieldRegistry
        self._commitCount = 0
        self._commitTimerToken = None
        self._index = Index(path, settings=settings, **kwargs)
        self.readerSettingsWrapper = self._index._readerSettingsWrapper
        if name is not None:
            self.observable_name = lambda: name
        self.coreName = name or basename(path)
        self._filterCache = LruCache(
            keyEqualsFunction=lambda q1, q2: q1.equals(q2),
            createFunction=lambda q: CachingWrapperFilter(QueryWrapperFilter(q))
        )
        # Cache keys below are tuples; explicit indexing replaces the
        # Python-2-only tuple-parameter unpacking (removed by PEP 3113),
        # keeping behavior identical while staying forward-compatible.
        # Key shape: (keyName, query).
        self._scoreCollectorCache = LruCache(
            keyEqualsFunction=lambda key, other: key[0] == other[0] and key[1].equals(other[1]),
            createFunction=lambda args: self._scoreCollector(*args)
        )
        # Key shape: (filter, keyName).
        self._collectedKeysCache = LruCache(
            keyEqualsFunction=lambda key, other: key[1] == other[1] and key[0].equals(other[0]),
            createFunction=lambda key: self._collectKeys(filter=key[0], keyName=key[1], query=None)
        )
        if settings.readonly:
            self._startCommitTimer()
        # Verbose mode routes log lines to _log; otherwise logging is a no-op.
        self.log = self._log if settings.verbose else lambda v: None

    def addDocument(self, identifier, document):
        """Add (or replace, keyed by identifier) 'document' and register a commit.

        The document gets an id field from the field registry; the Term on
        IDFIELD makes the add an upsert. Generator (weightless observable
        style): the unreachable trailing 'yield' turns this function into a
        generator that yields nothing.
        """
        document.add(self._fieldRegistry.createIdField(identifier))
        self._index.addDocument(term=Term(IDFIELD, identifier), document=document)
        self.commit()
        return
        yield

    def delete(self, identifier):
        """Delete the document with 'identifier' and register a commit.

        Generator (weightless observable style): the unreachable trailing
        'yield' turns this function into a generator that yields nothing.
        """
        self._index.deleteDocument(Term(IDFIELD, identifier))
        self.commit()
        return
        yield

    def _startCommitTimer(self):
        """Schedule a one-shot timer that commits without removing itself."""
        commitLater = lambda: self._realCommit(removeTimer=False)
        self._commitTimerToken = self._reactor.addTimer(
            seconds=self.settings.commitTimeout, callback=commitLater)

    def commit(self):
        """Register one pending change; flush once settings.commitCount is hit.

        Ensures a commit timer is running so changes are flushed eventually
        even when the count threshold is never reached.
        """
        self._commitCount += 1
        if self._commitTimerToken is None:
            self._startCommitTimer()
        if self._commitCount < self.settings.commitCount:
            return
        self._realCommit()
        self._commitCount = 0

    def _realCommit(self, removeTimer=True):
        """Flush the index, invalidate collector caches, and log the duration.

        removeTimer is False when invoked *by* the timer callback itself (the
        reactor has already consumed the timer). Readonly cores immediately
        re-arm the timer so the reader keeps refreshing.
        """
        started = time()
        token = self._commitTimerToken
        self._commitTimerToken = None
        if removeTimer:
            self._reactor.removeTimer(token=token)
        self._index.commit()
        self._scoreCollectorCache.clear()
        self._collectedKeysCache.clear()
        if self.settings.readonly:
            self._startCommitTimer()
        self.log("Lucene {0}: commit took: {1:.2f} seconds".format(self.coreName, time() - started))

    def search(self, query=None, filterQuery=None, collector=None):
        """Run 'query' through the index, optionally filtered by 'filterQuery'.

        Results are delivered through 'collector'; nothing is returned.
        """
        filter_ = QueryWrapperFilter(filterQuery) if filterQuery else None
        self._index.search(query, filter_, collector)

    def facets(self, facets, filterQueries, drilldownQueries=None, filter=None):
        """Collect facet counts for a (possibly drilled-down) match-all query.

        Generator (weightless observable style); the facet result is passed
        back via generatorReturn.
        """
        collector = self._facetCollector() if facets else None
        query = MatchAllDocsQuery()
        if drilldownQueries:
            query = self.createDrilldownQuery(query, drilldownQueries)
        self._index.search(query, self._filterFor(filterQueries, filter=filter), collector)
        generatorReturn(self._facetResult(collector, facets))
        yield

    def executeQuery(self, luceneQuery, start=None, stop=None, sortKeys=None, facets=None,
            filterQueries=None, suggestionRequest=None, filter=None, dedupField=None, dedupSortField=None, scoreCollector=None, drilldownQueries=None, keyCollector=None, **kwargs):
        """Execute 'luceneQuery' and deliver a LuceneResponse via generatorReturn.

        Builds the collector chain: a top-docs collector for the [start, stop)
        window, optionally wrapped by a dedup collector, combined with facet
        and key collectors, and finally delegated to 'scoreCollector' if given.
        The response carries total, hits, drilldownData, and (when applicable)
        totalWithDuplicates, suggestions, and queryTime in milliseconds.

        Generator (weightless observable style).
        """
        t0 = time()
        stop = 10 if stop is None else stop
        start = 0 if start is None else start

        collectors = []
        resultsCollector = topCollector = self._topCollector(start=start, stop=stop, sortKeys=sortKeys)
        dedupCollector = None
        if dedupField:
            constructor = DeDupFilterSuperCollector if self._multithreaded else DeDupFilterCollector
            resultsCollector = dedupCollector = constructor(dedupField, dedupSortField, topCollector)
        collectors.append(resultsCollector)

        if facets:
            facetCollector = self._facetCollector()
            collectors.append(facetCollector)
        if keyCollector:
            collectors.append(keyCollector)

        if self._multithreaded:
            multiSubCollectors = ArrayList().of_(SuperCollector)
            for c in collectors:
                multiSubCollectors.add(c)
        collector = MultiSuperCollector(multiSubCollectors) if self._multithreaded else MultiCollector.wrap(collectors)

        if scoreCollector:
            scoreCollector.setDelegate(collector)
            collector = scoreCollector

        filter_ = self._filterFor(filterQueries, filter)

        if drilldownQueries:
            luceneQuery = self.createDrilldownQuery(luceneQuery, drilldownQueries)
        self._index.search(luceneQuery, filter_, collector)

        total, hits = self._topDocsResponse(topCollector, start=start, dedupCollector=dedupCollector if dedupField else None)

        response = LuceneResponse(total=total, hits=hits, drilldownData=[])

        if dedupCollector:
            response.totalWithDuplicates = dedupCollector.totalHits

        if facets:
            response.drilldownData.extend(self._facetResult(facetCollector, facets))

        if suggestionRequest:
            response.suggestions = self._index.suggest(**suggestionRequest)

        response.queryTime = millis(time() - t0)

        # Consistent with facets(): use generatorReturn instead of raising
        # StopIteration directly; the raw raise breaks under PEP 479
        # (Python 3.7+) where StopIteration inside a generator becomes a
        # RuntimeError.
        generatorReturn(response)
        yield

    def createDrilldownQuery(self, luceneQuery, drilldownQueries):
        """Wrap 'luceneQuery' with a MUST clause per (field, path) drilldown."""
        query = BooleanQuery(True)
        if luceneQuery:
            query.add(luceneQuery, BooleanClause.Occur.MUST)
        for field, path in drilldownQueries:
            drilldownTerm = self._fieldRegistry.makeDrilldownTerm(field, path)
            query.add(TermQuery(drilldownTerm), BooleanClause.Occur.MUST)
        return query

    def prefixSearch(self, fieldname, prefix, showCount=False, **kwargs):
        """Return terms in 'fieldname' starting with 'prefix', most frequent first.

        Hits are bare terms, or (term, count) tuples when showCount is True.
        Generator (weightless observable style); the LuceneResponse is passed
        back via generatorReturn.
        """
        t0 = time()
        terms = self._index.termsForField(fieldname, prefix=prefix, **kwargs)
        hits = [((term, count) if showCount else term) for count, term in sorted(terms, reverse=True)]
        response = LuceneResponse(total=len(terms), hits=hits, queryTime=millis(time() - t0))
        # Consistent with facets(): generatorReturn instead of a raw
        # StopIteration raise (which PEP 479 turns into a RuntimeError).
        generatorReturn(response)
        yield

    def fieldnames(self, **kwargs):
        """Return all indexed field names wrapped in a LuceneResponse.

        Generator (weightless observable style); the response is passed back
        via generatorReturn.
        """
        fieldnames = self._index.fieldnames()
        response = LuceneResponse(total=len(fieldnames), hits=fieldnames)
        # Consistent with facets(): generatorReturn instead of a raw
        # StopIteration raise (which PEP 479 turns into a RuntimeError).
        generatorReturn(response)
        yield

    def drilldownFieldnames(self, *args, **kwargs):
        """Return drilldown (facet) field names wrapped in a LuceneResponse.

        All arguments are passed through to the underlying index. Generator
        (weightless observable style); the response is passed back via
        generatorReturn.
        """
        drilldownFieldnames = self._index.drilldownFieldnames(*args, **kwargs)
        response = LuceneResponse(total=len(drilldownFieldnames), hits=drilldownFieldnames)
        # Consistent with facets(): generatorReturn instead of a raw
        # StopIteration raise (which PEP 479 turns into a RuntimeError).
        generatorReturn(response)
        yield

    def scoreCollector(self, keyName, query):
        # Cached front-end for _scoreCollector; the LRU cache is keyed on the
        # (keyName, query) tuple and cleared on every real commit.
        return self._scoreCollectorCache.get((keyName, query))

    def _scoreCollector(self, keyName, query):
        """Run 'query' through a fresh score collector and return the collector."""
        if self._multithreaded:
            collector = ScoreSuperCollector(keyName)
        else:
            collector = ScoreCollector(keyName)
        self.search(query=query, collector=collector)
        return collector

    def collectKeys(self, filter, keyName, query=None, cacheCollectedKeys=True):
        """Collect the keys stored under 'keyName' for documents matching 'filter'.

        A 'query' may only be combined with cacheCollectedKeys=False, since
        the cache is keyed on (filter, keyName) alone.
        """
        assert not (query is not None and cacheCollectedKeys), "Caching of collecting keys with queries is not allowed"
        if not cacheCollectedKeys:
            return self._collectKeys(filter, keyName, query=query)
        return self._collectedKeysCache.get((filter, keyName))

    def _collectKeys(self, filter, keyName, query):
        """Run 'query' (match-all when None) under 'filter' and return the key set."""
        if self._multithreaded:
            collector = KeySuperCollector(keyName)
        else:
            collector = KeyCollector(keyName)
        effectiveQuery = query or MatchAllDocsQuery()
        self.search(query=effectiveQuery, filterQuery=filter, collector=collector)
        return collector.getCollectedKeys()

    def close(self):
        """Cancel any pending commit timer and close the underlying index.

        Resets _commitTimerToken after removal so a repeated close() (or a
        subsequent commit()) does not act on a stale, already-removed timer
        token.
        """
        if self._commitTimerToken is not None:
            self._reactor.removeTimer(self._commitTimerToken)
            self._commitTimerToken = None
        self._index.close()

    def handleShutdown(self):
        """Persist this core on process shutdown, announcing it on stdout."""
        # Parenthesized single-argument print behaves identically under
        # Python 2 (parenthesized expression) and Python 3 (function call).
        print("handle shutdown: saving Lucene core '%s'" % self.coreName)
        from sys import stdout; stdout.flush()
        self.close()

    def _topDocsResponse(self, collector, start, dedupCollector=None):
        """Turn the collector's topDocs into (totalHits, [Hit]) for the response.

        When a dedupCollector is given, each hit is replaced by the
        representative document of its dedup group and annotated with the
        group's duplicate count.
        TODO: Probably use FieldCache iso document.get()
        """
        hits = []
        dedupKeyName = dedupCollector.getKeyName() if dedupCollector else None
        if hasattr(collector, "topDocs"):
            for scoreDoc in collector.topDocs(start).scoreDocs:
                docId = scoreDoc.doc
                if dedupCollector:
                    dedupKey = dedupCollector.keyForDocId(docId)
                    if dedupKey:
                        docId = dedupKey.getDocId()
                    hit = Hit(self._index.getDocument(docId).get(IDFIELD))
                    hit.duplicateCount = {dedupKeyName: dedupKey.getCount() if dedupKey else 0}
                else:
                    hit = Hit(self._index.getDocument(docId).get(IDFIELD))
                hit.score = scoreDoc.score
                hits.append(hit)
        return collector.getTotalHits(), hits

    def _filterFor(self, filterQueries, filter=None):
        """AND-chain cached filters for 'filterQueries', plus an optional 'filter'.

        Returns 'filter' unchanged (possibly None) when there are no
        filterQueries.
        """
        if not filterQueries:
            return filter
        filters = [self._filterCache.get(filterQuery) for filterQuery in filterQueries]
        if filter is not None:
            filters.append(filter)
        return chainFilters(filters, ChainedFilter.AND)

    def _facetResult(self, facetCollector, facets):
        """Convert a facet collector into drilldownData dicts, one per facet spec.

        Each result dict carries fieldname, path, and the terms extracted via
        _termsFromFacetResult. Raises ValueError for an unsupported 'sortBy'.
        """
        facetResult = facetCollector if self._multithreaded else self._index.facetResult(facetCollector)
        results = []
        for facet in facets:
            sortBy = facet.get('sortBy')
            if sortBy is not None and sortBy not in self.SUPPORTED_SORTBY_VALUES:
                raise ValueError('Value of "sortBy" should be in %s' % self.SUPPORTED_SORTBY_VALUES)
            path = facet.get('path', [])
            terms = _termsFromFacetResult(
                facetResult=facetResult, facet=facet, path=path)
            results.append(dict(fieldname=facet['fieldname'], path=path, terms=terms))
        return results

    def _facetCollector(self):
        # Delegates to the index, which knows whether a super (multithreaded)
        # or plain facet collector is appropriate.
        return self._index.createFacetCollector()

    def coreInfo(self):
        # Generator yielding a single LuceneInfo snapshot for this core
        # (observable-style: callers iterate the result).
        yield self.LuceneInfo(self)

    class LuceneInfo(object):
        # Lightweight view of a Lucene core for coreInfo().
        # NOTE: unconventional parameter names — 'inner' is this LuceneInfo
        # instance (the usual 'self'), while 'self' is the enclosing Lucene
        # core passed in by coreInfo().
        def __init__(inner, self):
            inner._lucene = self
            inner.name = self.coreName
            # Bound method reference; call inner.numDocs() for the count.
            inner.numDocs = self._index.numDocs

    def _topCollector(self, start, stop, sortKeys):
        """Choose a hit collector for the [start, stop) window and sort spec.

        Empty window: a counting-only collector. No sortKeys: score-ordered
        top docs. Otherwise: a field-sorted collector built from sortKeys
        ([{'sortBy': ..., 'sortDescending': ...}, ...]).
        """
        if stop <= start:
            # Nothing to return; only the total count is needed.
            if self._multithreaded:
                return TotalHitCountSuperCollector()
            return TotalHitCountCollector()
        docsScoredInOrder = True
        if not sortKeys:
            if self._multithreaded:
                return TopScoreDocSuperCollector(stop, docsScoredInOrder)
            return TopScoreDocCollector.create(stop, docsScoredInOrder)
        sort = Sort([
            self._sortField(fieldname=sortKey['sortBy'], sortDescending=sortKey['sortDescending'])
            for sortKey in sortKeys
        ])
        trackDocScores = True
        trackMaxScore = False
        if self._multithreaded:
            # fillFields is implicitly true for multi-threading/sharding.
            return TopFieldSuperCollector(sort, stop, trackDocScores, trackMaxScore, docsScoredInOrder)
        fillFields = False
        return TopFieldCollector.create(sort, stop, fillFields, trackDocScores, trackMaxScore, docsScoredInOrder)

    def _sortField(self, fieldname, sortDescending):
        """Build a STRING SortField that always sorts missing values last.

        STRING_FIRST under a descending sort ends up at the tail of the
        results, matching STRING_LAST under an ascending sort.
        """
        sortField = SortField(fieldname, SortField.Type.STRING, sortDescending)
        missingValue = SortField.STRING_FIRST if sortDescending else SortField.STRING_LAST
        sortField.setMissingValue(missingValue)
        return sortField

    def _log(self, value):
        """Print 'value' and flush stdout immediately (verbose-mode logging)."""
        # Parenthesized single-argument print behaves identically under
        # Python 2 (parenthesized expression) and Python 3 (function call).
        print(value)
        from sys import stdout; stdout.flush()