class TermNumerator(Observable):
    def __init__(self, path, lruTaxonomyWriterCacheSize=100):
        Observable.__init__(self)
        taxoDirectory = MMapDirectory(File(path))
        taxoDirectory.setUseUnmap(False)
        self._taxoWriter = DirectoryTaxonomyWriter(
            taxoDirectory,
            IndexWriterConfig.OpenMode.CREATE_OR_APPEND,
            LruTaxonomyWriterCache(lruTaxonomyWriterCacheSize))

    def numerateTerm(self, term):
        if not term:
            return
        return self._taxoWriter.addCategory(FacetLabel([term]))

    def getTerm(self, nr):
        if not hasattr(self, "_taxoReader"):
            self._taxoReader = DirectoryTaxonomyReader(self._taxoWriter)
        tr = DirectoryTaxonomyReader.openIfChanged(self._taxoReader)
        if tr:
            self._taxoReader.close()
            self._taxoReader = tr
        return self._taxoReader.getPath(nr).components[0]

    def handleShutdown(self):
        print 'handle shutdown: saving TermNumerator'
        from sys import stdout; stdout.flush()
        self._taxoWriter.commit()

    def close(self):
        self._taxoWriter.close()
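A minimal usage sketch for TermNumerator (not part of the original source; the path and term are made up). numerateTerm maps a term to a taxonomy ordinal via addCategory, and getTerm resolves the ordinal back through a near-real-time taxonomy reader opened over the writer:

numerator = TermNumerator('/tmp/termnumerator')        # hypothetical path
ordinal = numerator.numerateTerm('example-term')       # returns the taxonomy ordinal
assert numerator.getTerm(ordinal) == 'example-term'    # NRT reader sees the writer's changes
numerator.handleShutdown()                             # commits the taxonomy writer
numerator.close()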
def __init__(self, path, settings):
    self._settings = settings
    self._multithreaded = settings.multithreaded
    self._checker = DirectSpellChecker()
    indexDirectory = MMapDirectory(File(join(path, 'index')))
    indexDirectory.setUseUnmap(False)
    taxoDirectory = MMapDirectory(File(join(path, 'taxo')))
    taxoDirectory.setUseUnmap(False)
    conf = IndexWriterConfig(Version.LUCENE_4_10_0, settings.analyzer)
    conf.setSimilarity(settings.similarity)
    mergePolicy = TieredMergePolicy()
    mergePolicy.setMaxMergeAtOnce(settings.maxMergeAtOnce)
    mergePolicy.setSegmentsPerTier(settings.segmentsPerTier)
    conf.setMergePolicy(mergePolicy)

    if not settings.readonly:
        self._indexWriter = IndexWriter(indexDirectory, conf)
        self._indexWriter.commit()
        self._taxoWriter = DirectoryTaxonomyWriter(
            taxoDirectory,
            IndexWriterConfig.OpenMode.CREATE_OR_APPEND,
            LruTaxonomyWriterCache(settings.lruTaxonomyWriterCacheSize))
        self._taxoWriter.commit()

    self._indexAndTaxonomy = IndexAndTaxonomy(settings, indexDirectory, taxoDirectory)
    self._readerSettingsWrapper = self._indexAndTaxonomy._readerSettingsWrapper
    self._facetsConfig = settings.fieldRegistry.facetsConfig
    self._ordinalsReader = CachedOrdinalsReader(DocValuesOrdinalsReader())
def index(cls, indexDir, taxoDir):
    """Create an index and add sample documents and facets to it.

    indexDir  Directory in which the index should be created.
    taxoDir   Directory in which the taxonomy index should be created.
    """
    # create and open an index writer
    from org.apache.lucene.util import Version
    config = IndexWriterConfig(Version.LUCENE_42,
                               WhitespaceAnalyzer(Version.LUCENE_42))
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    iw = IndexWriter(indexDir, config)
    # create and open a taxonomy writer
    taxo = DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE)
    # FacetFields is a utility class for adding facet fields to a document:
    facet_fields = FacetFields(taxo)

    # loop over sample documents
    nDocsAdded = 0
    nFacetsAdded = 0
    for docNum in range(len(docTexts)):
        # obtain the sample facets for the current document
        facets = categories[docNum]
        facetList = [CategoryPath(f) for f in facets]
        # NOTE: addFields() requires a java.util.List of CategoryPath objects,
        # so the Python list has to be converted into a proper Java argument.
        # We use java.util.Arrays (via JCC) to create a Java List:
        facetList = Arrays.asList(facetList)
        # NOTE: we could use lucene.collections here as well to convert our
        # Python list into a Java-based list using the JavaList class
        # (JavaList implements java.util.List around the Python list it wraps):
        # from lucene.collections import JavaList
        # facetList = JavaList(facetList)

        # create a plain Lucene document and add some regular Lucene fields to it
        doc = Document()
        doc.add(TextField(TITLE, docTitles[docNum], Field.Store.YES))
        doc.add(TextField(TEXT, docTexts[docNum], Field.Store.NO))
        # use the FacetFields utility class for adding facet fields (i.e. the
        # categories) to the document (and, as required, to the taxonomy index)
        facet_fields.addFields(doc, facetList)
        # finally add the document to the index
        iw.addDocument(doc)
        nDocsAdded += 1
        nFacetsAdded += facetList.size()

    # commit changes.
    # we commit changes to the taxonomy index prior to committing them to the
    # search index. this is important, so that all facets referred to by
    # documents in the search index will indeed exist in the taxonomy index.
    taxo.commit()
    iw.commit()

    # close the taxonomy index and the index - all modifications are
    # now safely in the provided directories: indexDir and taxoDir.
    taxo.close()
    iw.close()

    print "Indexed %d documents with overall %d facets." % (nDocsAdded, nFacetsAdded)
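After indexing, the two directories can be opened read-only to check what was written. This is a rough sketch and not part of the original example; the import paths are those of Lucene 4.x PyLucene builds:

from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.facet.taxonomy.directory import DirectoryTaxonomyReader

# open the plain index and the taxonomy index for reading
reader = DirectoryReader.open(indexDir)
taxoReader = DirectoryTaxonomyReader(taxoDir)
print "%d documents, %d taxonomy categories" % (reader.numDocs(), taxoReader.getSize())
taxoReader.close()
reader.close()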
def index(cls, indexDir, taxoDir, facets_config):
    """Create an index and add sample documents and facets to it.

    indexDir  Directory in which the index should be created.
    taxoDir   Directory in which the taxonomy index should be created.
    """
    # create and open an index writer
    config = IndexWriterConfig(Version.LUCENE_48,
                               WhitespaceAnalyzer(Version.LUCENE_48))
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    iw = IndexWriter(indexDir, config)
    # create and open a taxonomy writer
    taxo = DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE)

    # loop over sample documents
    nDocsAdded = 0
    nFacetsAdded = 0
    for docNum in range(len(docTexts)):
        # create a plain Lucene document and add some regular Lucene fields to it
        doc = Document()
        doc.add(TextField(TITLE, docTitles[docNum], Field.Store.YES))
        doc.add(TextField(TEXT, docTexts[docNum], Field.Store.NO))
        # obtain the sample facets for the current document
        facets = categories[docNum]
        author = authors[docNum]
        # ... and use the FacetField class for adding facet fields to
        # the Lucene document (and via FacetsConfig to the taxonomy index)
        doc.add(FacetField("Author", author))
        for f in facets:
            doc.add(FacetField("Categories", f))
        # finally add the document to the index
        iw.addDocument(facets_config.build(taxo, doc))
        nDocsAdded += 1

    # close the taxonomy index and the index - all modifications are
    # now safely in the provided directories: indexDir and taxoDir.
    iw.close()
    taxo.close()

    print "Indexed %d documents with facets." % nDocsAdded
def index(cls, indexDir, taxoDir, facets_config):
    """Create an index and add sample documents and facets to it.

    indexDir  Directory in which the index should be created.
    taxoDir   Directory in which the taxonomy index should be created.
    """
    # create and open an index writer
    config = IndexWriterConfig(Version.LUCENE_48,
                               WhitespaceAnalyzer(Version.LUCENE_48))
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    iw = IndexWriter(indexDir, config)
    # create and open a taxonomy writer
    taxo = DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE)

    # loop over sample documents
    nDocsAdded = 0
    nFacetsAdded = 0
    for docNum in range(len(docTexts)):
        # create a plain Lucene document and add some regular Lucene fields to it
        doc = Document()
        doc.add(TextField(TITLE, docTitles[docNum], Field.Store.YES))
        doc.add(TextField(TEXT, docTexts[docNum], Field.Store.NO))
        # obtain the sample facets for the current document
        facets = categories[docNum]
        author = authors[docNum]
        # ... and use the FacetField class for adding facet fields to
        # the Lucene document (and via FacetsConfig to the taxonomy index)
        doc.add(FacetField("Author", [author]))
        for f in facets:
            doc.add(FacetField("Categories", f))
        # finally add the document to the index
        iw.addDocument(facets_config.build(taxo, doc))
        nDocsAdded += 1

    # close the taxonomy index and the index - all modifications are
    # now safely in the provided directories: indexDir and taxoDir.
    iw.close()
    taxo.close()

    print "Indexed %d documents with facets." % nDocsAdded
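The two variants above only build the index. A sketch of the matching search side using the same Lucene 4.8 facet API follows; indexDir, taxoDir and facets_config are assumed to be the objects used above, and the code is illustrative rather than part of the original example:

from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.search import IndexSearcher, MatchAllDocsQuery
from org.apache.lucene.facet import FacetsCollector
from org.apache.lucene.facet.taxonomy import FastTaxonomyFacetCounts
from org.apache.lucene.facet.taxonomy.directory import DirectoryTaxonomyReader

searcher = IndexSearcher(DirectoryReader.open(indexDir))
taxoReader = DirectoryTaxonomyReader(taxoDir)
fc = FacetsCollector()
# run the query and collect facet counts in one pass
FacetsCollector.search(searcher, MatchAllDocsQuery(), 10, fc)
facets = FastTaxonomyFacetCounts(taxoReader, facets_config, fc)
print facets.getTopChildren(10, "Author")
print facets.getTopChildren(10, "Categories")
taxoReader.close()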
def __init__(self, path, lruTaxonomyWriterCacheSize=100):
    Observable.__init__(self)
    taxoDirectory = MMapDirectory(File(path))
    taxoDirectory.setUseUnmap(False)
    self._taxoWriter = DirectoryTaxonomyWriter(
        taxoDirectory,
        IndexWriterConfig.OpenMode.CREATE_OR_APPEND,
        LruTaxonomyWriterCache(lruTaxonomyWriterCacheSize))
class Index(object):
    def __init__(self, path, settings):
        self._settings = settings
        self._multithreaded = settings.multithreaded
        self._checker = DirectSpellChecker()
        indexDirectory = MMapDirectory(File(join(path, 'index')))
        indexDirectory.setUseUnmap(False)
        taxoDirectory = MMapDirectory(File(join(path, 'taxo')))
        taxoDirectory.setUseUnmap(False)
        conf = IndexWriterConfig(Version.LUCENE_4_10_0, settings.analyzer)
        conf.setSimilarity(settings.similarity)
        mergePolicy = TieredMergePolicy()
        mergePolicy.setMaxMergeAtOnce(settings.maxMergeAtOnce)
        mergePolicy.setSegmentsPerTier(settings.segmentsPerTier)
        conf.setMergePolicy(mergePolicy)

        if not settings.readonly:
            self._indexWriter = IndexWriter(indexDirectory, conf)
            self._indexWriter.commit()
            self._taxoWriter = DirectoryTaxonomyWriter(
                taxoDirectory,
                IndexWriterConfig.OpenMode.CREATE_OR_APPEND,
                LruTaxonomyWriterCache(settings.lruTaxonomyWriterCacheSize))
            self._taxoWriter.commit()

        self._indexAndTaxonomy = IndexAndTaxonomy(settings, indexDirectory, taxoDirectory)
        self._readerSettingsWrapper = self._indexAndTaxonomy._readerSettingsWrapper
        self._facetsConfig = settings.fieldRegistry.facetsConfig
        self._ordinalsReader = CachedOrdinalsReader(DocValuesOrdinalsReader())

    def addDocument(self, term, document):
        document = self._facetsConfig.build(self._taxoWriter, document)
        self._indexWriter.updateDocument(term, document)

    def deleteDocument(self, term):
        self._indexWriter.deleteDocuments(term)

    def search(self, query, filter, collector):
        self._indexAndTaxonomy.searcher.search(query, filter, collector)

    def suggest(self, query, count, field):
        suggestions = {}
        for token, startOffset, endOffset in self._analyzeToken(query):
            suggestWords = self._checker.suggestSimilar(
                Term(field, token), count,
                self._indexAndTaxonomy.searcher.getIndexReader())
            if suggestWords:
                suggestions[token] = (startOffset, endOffset,
                                      [suggestWord.string for suggestWord in suggestWords])
        return suggestions

    def termsForField(self, field, prefix=None, limit=10, **kwargs):
        convert = lambda term: term.utf8ToString()
        terms = []
        termsEnum = MultiFields.getTerms(self._indexAndTaxonomy.searcher.getIndexReader(), field)
        if termsEnum is None:
            return terms
        iterator = termsEnum.iterator(None)
        if prefix:
            iterator.seekCeil(BytesRef(prefix))
            terms.append((iterator.docFreq(), convert(iterator.term())))
        bytesIterator = BytesRefIterator.cast_(iterator)
        try:
            while len(terms) < limit:
                term = convert(bytesIterator.next())
                if prefix and not term.startswith(prefix):
                    break
                terms.append((iterator.docFreq(), term))
        except StopIteration:
            pass
        return terms

    def fieldnames(self):
        indexAndTaxonomy = self._indexAndTaxonomy
        fieldnames = []
        fields = MultiFields.getFields(indexAndTaxonomy.searcher.getIndexReader())
        if fields is None:
            return fieldnames
        iterator = fields.iterator()
        while iterator.hasNext():
            fieldnames.append(iterator.next())
        return fieldnames

    def drilldownFieldnames(self, path=None, limit=50):
        taxoReader = self._indexAndTaxonomy.taxoReader
        parentOrdinal = TaxonomyReader.ROOT_ORDINAL if path is None else \
            taxoReader.getOrdinal(path[0], path[1:])
        childrenIter = taxoReader.getChildren(parentOrdinal)
        names = []
        while True:
            ordinal = childrenIter.next()
            if ordinal == TaxonomyReader.INVALID_ORDINAL:
                break
            names.append(taxoReader.getPath(ordinal).components[-1])
            if len(names) >= limit:
                break
        return names

    def numDocs(self):
        return self._indexAndTaxonomy.searcher.getIndexReader().numDocs()

    def commit(self):
        if not self._settings.readonly:
            self._taxoWriter.commit()
            self._indexWriter.commit()
        self._indexAndTaxonomy.reopen()

    def getDocument(self, docId):
        return self._indexAndTaxonomy.searcher.doc(docId)

    def createFacetCollector(self):
        if not self._multithreaded:
            return FacetsCollector()
        return FacetSuperCollector(self._indexAndTaxonomy.taxoReader,
                                   self._facetsConfig, self._ordinalsReader)

    def facetResult(self, facetCollector):
        facetResult = TaxonomyFacetCounts(self._ordinalsReader,
                                          self._indexAndTaxonomy.taxoReader,
                                          self._facetsConfig, facetCollector)
        return Facets.cast_(facetResult)

    def close(self):
        self._indexAndTaxonomy.close()
        if not self._settings.readonly:
            self._taxoWriter.close()
            self._indexWriter.close()

    def _analyzeToken(self, token):
        result = []
        reader = StringReader(unicode(token))
        stda = self._settings.analyzer
        ts = stda.tokenStream("dummy field name", reader)
        termAtt = ts.addAttribute(CharTermAttribute.class_)
        offsetAtt = ts.addAttribute(OffsetAttribute.class_)
        try:
            ts.reset()
            while ts.incrementToken():
                result.append((termAtt.toString(), offsetAtt.startOffset(), offsetAtt.endOffset()))
            ts.end()
        finally:
            ts.close()
        return result
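A hypothetical usage sketch for the Index class above, not taken from the original source. The settings argument stands in for the configuration object this code expects (analyzer, similarity, merge-policy values, readonly flag and facet configuration); the path, field and term names are made up:

index = Index('/tmp/lucene-index', settings)           # settings is assumed to be writable

doc = Document()
doc.add(TextField('title', 'An example document', Field.Store.YES))
index.addDocument(Term('identifier', 'doc:1'), doc)    # add or update by identifier term
index.commit()                                         # commit writers and reopen the searcher

print index.numDocs()
print index.fieldnames()
index.close()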