def testFacetAndTopsMultiCollector(self):
    I = Index(path=self.tempdir, settings=LuceneSettings())
    for i in xrange(99):
        document1 = createDocument(
            fields=[("field1", str(i)), ("field2", str(i) * 1000)],
            facets=[("facet1", "value%s" % (i % 10))])
        document1 = I._facetsConfig.build(I._taxoWriter, document1)
        I._indexWriter.addDocument(document1)
    I.commit()
    I.close()
    I = Index(path=self.tempdir, settings=LuceneSettings())

    f = FacetSuperCollector(I._indexAndTaxonomy.taxoReader, I._facetsConfig, I._ordinalsReader)
    t = TopScoreDocSuperCollector(10, True)
    collectors = ArrayList().of_(SuperCollector)
    collectors.add(t)
    collectors.add(f)
    C = MultiSuperCollector(collectors)
    Q = MatchAllDocsQuery()
    I.search(Q, None, C)

    self.assertEquals(99, t.topDocs(0).totalHits)
    self.assertEquals(10, len(t.topDocs(0).scoreDocs))
    tc = f.getTopChildren(10, "facet1", [])
    self.assertEquals([
            ('value0', 10),
            ('value1', 10),
            ('value2', 10),
            ('value3', 10),
            ('value4', 10),
            ('value5', 10),
            ('value6', 10),
            ('value7', 10),
            ('value8', 10),
            ('value9', 9),
        ], [(l.label, l.value.intValue()) for l in tc.labelValues])
def testSearchTopField(self):
    I = Index(path=self.tempdir, settings=LuceneSettings())
    I._indexWriter.addDocument(document(__id__='1', name="one", price="aap noot mies"))
    I.commit()
    I._indexWriter.addDocument(document(__id__='2', name="two", price="aap vuur boom"))
    I.commit()
    I._indexWriter.addDocument(document(__id__='3', name="three", price="noot boom mies"))
    I.commit()
    I.close()
    I = Index(path=self.tempdir, settings=LuceneSettings())

    sort = Sort(SortField("name", SortField.Type.STRING, True))
    C = TopFieldSuperCollector(sort, 2, True, False, True)
    Q = MatchAllDocsQuery()
    I.search(Q, None, C)
    td = C.topDocs(0)

    self.assertEquals(3, C.getTotalHits())
    self.assertEquals(3, td.totalHits)
    self.assertEquals(2, len(td.scoreDocs))
    self.assertEquals(['2', '3'], [I.getDocument(s.doc).get("__id__") for s in td.scoreDocs])
class Lucene(object):
    COUNT = 'count'
    SUPPORTED_SORTBY_VALUES = [COUNT]

    def __init__(self, path, reactor, settings, name=None, **kwargs):
        self._reactor = reactor
        self.settings = settings
        self._multithreaded = settings.multithreaded
        self._fieldRegistry = settings.fieldRegistry
        self._commitCount = 0
        self._commitTimerToken = None
        self._index = Index(path, settings=settings, **kwargs)
        self.readerSettingsWrapper = self._index._readerSettingsWrapper
        if name is not None:
            self.observable_name = lambda: name
        self.coreName = name or basename(path)
        self._filterCache = LruCache(
            keyEqualsFunction=lambda q1, q2: q1.equals(q2),
            createFunction=lambda q: CachingWrapperFilter(QueryWrapperFilter(q)))
        self._scoreCollectorCache = LruCache(
            keyEqualsFunction=lambda (k, q), (k2, q2): k == k2 and q.equals(q2),
            createFunction=lambda args: self._scoreCollector(*args))
        self._collectedKeysCache = LruCache(
            keyEqualsFunction=lambda (f, k), (f1, k1): k == k1 and f.equals(f1),
            createFunction=lambda (filter, keyName): self._collectKeys(filter=filter, keyName=keyName, query=None))
        if settings.readonly:
            # Readonly cores keep the commit timer running continuously (see _realCommit).
            self._startCommitTimer()
        self.log = self._log if settings.verbose else lambda v: None

    def addDocument(self, identifier, document):
        document.add(self._fieldRegistry.createIdField(identifier))
        self._index.addDocument(term=Term(IDFIELD, identifier), document=document)
        self.commit()
        # 'return' before 'yield' makes this a generator that yields nothing,
        # the coroutine convention used throughout this codebase.
        return
        yield

    def delete(self, identifier):
        self._index.deleteDocument(Term(IDFIELD, identifier))
        self.commit()
        return
        yield

    def _startCommitTimer(self):
        self._commitTimerToken = self._reactor.addTimer(
            seconds=self.settings.commitTimeout,
            callback=lambda: self._realCommit(removeTimer=False))

    def commit(self):
        self._commitCount += 1
        if self._commitTimerToken is None:
            self._startCommitTimer()
        if self._commitCount >= self.settings.commitCount:
            self._realCommit()
            self._commitCount = 0

    def _realCommit(self, removeTimer=True):
        t0 = time()
        self._commitTimerToken, token = None, self._commitTimerToken
        if removeTimer:
            self._reactor.removeTimer(token=token)
        self._index.commit()
        self._scoreCollectorCache.clear()
        self._collectedKeysCache.clear()
        if self.settings.readonly:
            self._startCommitTimer()
        self.log("Lucene {0}: commit took: {1:.2f} seconds".format(self.coreName, time() - t0))

    def search(self, query=None, filterQuery=None, collector=None):
        filter_ = None
        if filterQuery:
            filter_ = QueryWrapperFilter(filterQuery)
        self._index.search(query, filter_, collector)

    def facets(self, facets, filterQueries, drilldownQueries=None, filter=None):
        facetCollector = self._facetCollector() if facets else None
        filter_ = self._filterFor(filterQueries, filter=filter)
        query = MatchAllDocsQuery()
        if drilldownQueries:
            query = self.createDrilldownQuery(query, drilldownQueries)
        self._index.search(query, filter_, facetCollector)
        generatorReturn(self._facetResult(facetCollector, facets))
        yield

    def executeQuery(self, luceneQuery, start=None, stop=None, sortKeys=None, facets=None,
            filterQueries=None, suggestionRequest=None, filter=None, dedupField=None,
            dedupSortField=None, scoreCollector=None, drilldownQueries=None, keyCollector=None,
            **kwargs):
        t0 = time()
        stop = 10 if stop is None else stop
        start = 0 if start is None else start

        collectors = []
        resultsCollector = topCollector = self._topCollector(start=start, stop=stop, sortKeys=sortKeys)
        dedupCollector = None
        if dedupField:
            constructor = DeDupFilterSuperCollector if self._multithreaded else DeDupFilterCollector
            resultsCollector = dedupCollector = constructor(dedupField, dedupSortField, topCollector)
        collectors.append(resultsCollector)
        if facets:
            facetCollector = self._facetCollector()
            collectors.append(facetCollector)
        if keyCollector:
            collectors.append(keyCollector)

        if self._multithreaded:
            multiSubCollectors = ArrayList().of_(SuperCollector)
            for c in collectors:
                multiSubCollectors.add(c)
        collector = MultiSuperCollector(multiSubCollectors) if self._multithreaded else MultiCollector.wrap(collectors)

        if scoreCollector:
            scoreCollector.setDelegate(collector)
            collector = scoreCollector

        filter_ = self._filterFor(filterQueries, filter)
        if drilldownQueries:
            luceneQuery = self.createDrilldownQuery(luceneQuery, drilldownQueries)
        self._index.search(luceneQuery, filter_, collector)

        total, hits = self._topDocsResponse(topCollector, start=start,
                dedupCollector=dedupCollector if dedupField else None)

        response = LuceneResponse(total=total, hits=hits, drilldownData=[])
        if dedupCollector:
            response.totalWithDuplicates = dedupCollector.totalHits
        if facets:
            response.drilldownData.extend(self._facetResult(facetCollector, facets))
        if suggestionRequest:
            response.suggestions = self._index.suggest(**suggestionRequest)
        response.queryTime = millis(time() - t0)
        # Generator-style return: the response travels in the StopIteration.
        raise StopIteration(response)
        yield

    def createDrilldownQuery(self, luceneQuery, drilldownQueries):
        q = BooleanQuery(True)
        if luceneQuery:
            q.add(luceneQuery, BooleanClause.Occur.MUST)
        for field, path in drilldownQueries:
            q.add(TermQuery(self._fieldRegistry.makeDrilldownTerm(field, path)), BooleanClause.Occur.MUST)
        return q

    def prefixSearch(self, fieldname, prefix, showCount=False, **kwargs):
        t0 = time()
        terms = self._index.termsForField(fieldname, prefix=prefix, **kwargs)
        hits = [((term, count) if showCount else term) for count, term in sorted(terms, reverse=True)]
        response = LuceneResponse(total=len(terms), hits=hits, queryTime=millis(time() - t0))
        raise StopIteration(response)
        yield

    def fieldnames(self, **kwargs):
        fieldnames = self._index.fieldnames()
        response = LuceneResponse(total=len(fieldnames), hits=fieldnames)
        raise StopIteration(response)
        yield

    def drilldownFieldnames(self, *args, **kwargs):
        drilldownFieldnames = self._index.drilldownFieldnames(*args, **kwargs)
        response = LuceneResponse(total=len(drilldownFieldnames), hits=drilldownFieldnames)
        raise StopIteration(response)
        yield

    def scoreCollector(self, keyName, query):
        return self._scoreCollectorCache.get((keyName, query))

    def _scoreCollector(self, keyName, query):
        scoreCollector = ScoreSuperCollector(keyName) if self._multithreaded else ScoreCollector(keyName)
        self.search(query=query, collector=scoreCollector)
        return scoreCollector

    def collectKeys(self, filter, keyName, query=None, cacheCollectedKeys=True):
        assert not (query is not None and cacheCollectedKeys), "Caching of collecting keys with queries is not allowed"
        if cacheCollectedKeys:
            return self._collectedKeysCache.get((filter, keyName))
        else:
            return self._collectKeys(filter, keyName, query=query)

    def _collectKeys(self, filter, keyName, query):
        keyCollector = KeySuperCollector(keyName) if self._multithreaded else KeyCollector(keyName)
        self.search(query=query or MatchAllDocsQuery(), filterQuery=filter, collector=keyCollector)
        return keyCollector.getCollectedKeys()

    def close(self):
        if self._commitTimerToken is not None:
            self._reactor.removeTimer(self._commitTimerToken)
        self._index.close()

    def handleShutdown(self):
        print "handle shutdown: saving Lucene core '%s'" % self.coreName
        from sys import stdout; stdout.flush()
        self.close()

    def _topDocsResponse(self, collector, start, dedupCollector=None):
        # TODO: Probably use FieldCache instead of document.get()
        hits = []
        dedupCollectorFieldName = dedupCollector.getKeyName() if dedupCollector else None
        if hasattr(collector, "topDocs"):
            for scoreDoc in collector.topDocs(start).scoreDocs:
                if dedupCollector:
                    keyForDocId = dedupCollector.keyForDocId(scoreDoc.doc)
                    newDocId = keyForDocId.getDocId() if keyForDocId else scoreDoc.doc
                    hit = Hit(self._index.getDocument(newDocId).get(IDFIELD))
                    hit.duplicateCount = {dedupCollectorFieldName: keyForDocId.getCount() if keyForDocId else 0}
                else:
                    hit = Hit(self._index.getDocument(scoreDoc.doc).get(IDFIELD))
                hit.score = scoreDoc.score
                hits.append(hit)
        return collector.getTotalHits(), hits

    def _filterFor(self, filterQueries, filter=None):
        if not filterQueries:
            return filter
        filters = [self._filterCache.get(f) for f in filterQueries]
        if filter is not None:
            filters.append(filter)
        return chainFilters(filters, ChainedFilter.AND)

    def _facetResult(self, facetCollector, facets):
        facetResult = facetCollector
        if not self._multithreaded:
            facetResult = self._index.facetResult(facetCollector)
        result = []
        for f in facets:
            sortBy = f.get('sortBy')
            if not (sortBy is None or sortBy in self.SUPPORTED_SORTBY_VALUES):
                raise ValueError('Value of "sortBy" should be in %s' % self.SUPPORTED_SORTBY_VALUES)
            path = f.get('path', [])
            result.append(dict(
                fieldname=f['fieldname'],
                path=path,
                terms=_termsFromFacetResult(
                    facetResult=facetResult,
                    facet=f,
                    path=path)))
        return result

    def _facetCollector(self):
        return self._index.createFacetCollector()

    def coreInfo(self):
        yield self.LuceneInfo(self)

    class LuceneInfo(object):
        # 'inner' is the LuceneInfo instance; 'self' is the enclosing Lucene object.
        def __init__(inner, self):
            inner._lucene = self
            inner.name = self.coreName
            inner.numDocs = self._index.numDocs

    def _topCollector(self, start, stop, sortKeys):
        if stop <= start:
            return TotalHitCountSuperCollector() if self._multithreaded else TotalHitCountCollector()
        # fillFields = False  # always true for multi-threading/sharding
        trackDocScores = True
        trackMaxScore = False
        docsScoredInOrder = True
        if sortKeys:
            sortFields = [
                self._sortField(fieldname=sortKey['sortBy'], sortDescending=sortKey['sortDescending'])
                for sortKey in sortKeys
            ]
            sort = Sort(sortFields)
        else:
            return TopScoreDocSuperCollector(stop, docsScoredInOrder) if self._multithreaded else TopScoreDocCollector.create(stop, docsScoredInOrder)
        if self._multithreaded:
            return TopFieldSuperCollector(sort, stop, trackDocScores, trackMaxScore, docsScoredInOrder)
        else:
            fillFields = False
            return TopFieldCollector.create(sort, stop, fillFields, trackDocScores, trackMaxScore, docsScoredInOrder)

    def _sortField(self, fieldname, sortDescending):
        result = SortField(fieldname, SortField.Type.STRING, sortDescending)
        result.setMissingValue(SortField.STRING_FIRST if sortDescending else SortField.STRING_LAST)
        return result

    def _log(self, value):
        print value
        from sys import stdout; stdout.flush()
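

# Illustrative sketch, not part of the original module: methods such as
# executeQuery, prefixSearch, fieldnames and drilldownFieldnames follow this
# codebase's coroutine convention of delivering their result by raising
# StopIteration(response) from a generator; normally a coroutine framework
# (e.g. weightless' compose) consumes them. The hypothetical helper below
# shows one way to drain such a generator directly, for experimentation only.

def _returnValueOf(generator):
    # Exhaust the generator; the raised StopIteration carries the response
    # object as its first argument.
    try:
        while True:
            generator.next()
    except StopIteration, e:
        return e.args[0] if e.args else None

# Example usage, assuming a configured 'lucene' instance:
#   response = _returnValueOf(lucene.executeQuery(luceneQuery=MatchAllDocsQuery()))
#   print response.total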