def addDocuments(self, dir, isCompound):
    writer = IndexWriter(dir, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.LIMITED)
    writer.setUseCompoundFile(isCompound)

    # change to adjust performance of indexing with FSDirectory
    # writer.mergeFactor = writer.mergeFactor
    # writer.maxMergeDocs = writer.maxMergeDocs
    # writer.minMergeDocs = writer.minMergeDocs

    for word in self.docs:
        doc = Document()
        doc.add(Field("keyword", word,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("unindexed", word,
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("unstored", word,
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("text", word,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
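# A hedged usage sketch for addDocuments() above: the same words go into an
# in-memory directory and an on-disk one, toggling the compound-file format
# per call. The fixture instance and the path are illustrative assumptions,
# and a PyLucene-era lucene.initVM() bootstrap is assumed to have run.
fixture = DocumentsFixture()   # hypothetical owner of self.docs
fixture.addDocuments(RAMDirectory(), True)             # compound files, in memory
fixture.addDocuments(SimpleFSDirectory(File("/tmp/test-index")), False)  # multi-file, on disk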
def commitIndexWriter(self, writer):
    directory = writer.getDirectory()
    writer.close()

    dbDirectory = self.getDirectory()
    dbWriter = IndexWriter(dbDirectory, StandardAnalyzer(), False)
    dbWriter.setUseCompoundFile(False)
    dbWriter.addIndexes([directory])
    directory.close()

    dbWriter.close()
    dbDirectory.close()
def index(cls, indexDir, dataDir):
    if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
        raise IOError, "%s does not exist or is not a directory" % (dataDir)

    writer = IndexWriter(indexDir, StandardAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    writer.setUseCompoundFile(False)

    numIndexed = cls.indexDirectory(writer, dataDir)
    writer.optimize()
    writer.close()

    return numIndexed
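# A hedged command-line driver for the index() classmethod above; the
# Indexer class name and the usage message are assumptions, not part of
# the original sample.
import sys

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print "Usage: Indexer <index dir> <data dir>"
        sys.exit(1)
    numIndexed = Indexer.index(sys.argv[1], sys.argv[2])
    print "Indexed %s documents" % numIndexed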
def createIndex(cls, dataDir, indexDir, useCompound):
    indexDir = SimpleFSDirectory(File(indexDir))
    writer = IndexWriter(indexDir, StandardAnalyzer(Version.LUCENE_CURRENT),
                         True, IndexWriter.MaxFieldLength.UNLIMITED)
    writer.setUseCompoundFile(useCompound)

    for dir, dirnames, filenames in os.walk(dataDir):
        for filename in filenames:
            if filename.endswith('.properties'):
                cls.indexFile(writer, os.path.join(dir, filename), dataDir)

    writer.optimize()
    writer.close()
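# A hypothetical one-off invocation of createIndex() above; the class name
# and paths are illustrative assumptions. useCompound=False mirrors the
# other writers in this section.
PropertiesIndexer.createIndex('/data/messages', '/tmp/properties-index', False)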
def index(cls, indexDir, dataDir):
    if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
        raise IOError, "%s does not exist or is not a directory" % (dataDir)

    dir = SimpleFSDirectory(File(indexDir))
    writer = IndexWriter(dir, StandardAnalyzer(Version.LUCENE_CURRENT),
                         True, IndexWriter.MaxFieldLength.LIMITED)
    writer.setUseCompoundFile(False)

    cls.indexDirectory(writer, dataDir)
    numIndexed = writer.numDocs()
    # this variant commits the added documents without merging segments
    writer.commit()
    writer.close()
    dir.close()

    return numIndexed
def index(cls, indexDir, dataDir):
    if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
        raise IOError, "%s does not exist or is not a directory" % (dataDir)

    dir = SimpleFSDirectory(File(indexDir))
    writer = IndexWriter(dir, StandardAnalyzer(Version.LUCENE_CURRENT),
                         True, IndexWriter.MaxFieldLength.LIMITED)
    writer.setUseCompoundFile(False)

    cls.indexDirectory(writer, dataDir)
    numIndexed = writer.numDocs()
    # this variant merges the index down to a single segment before closing
    writer.optimize()
    writer.close()
    dir.close()

    return numIndexed
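# The index() variants above delegate to cls.indexDirectory(), which is not
# shown in these snippets. This is a hedged reconstruction of what such
# helpers typically look like in these samples, indexing plain *.txt files;
# the file filter and field names are assumptions, not the original code.
def indexDirectory(cls, writer, dir):
    count = 0
    for name in os.listdir(dir):
        path = os.path.join(dir, name)
        if os.path.isfile(path) and name.endswith('.txt'):
            cls.indexFile(writer, path)
            count += 1
    return count   # the first index() variant returns this as numIndexed

def indexFile(cls, writer, path):
    f = open(path)
    try:
        doc = Document()
        doc.add(Field("contents", f.read(),
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("filename", os.path.abspath(path),
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)
    finally:
        f.close()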
def addDocuments(self, dir):
    writer = IndexWriter(dir, self.getAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    writer.setUseCompoundFile(self.isCompound())

    for i in xrange(len(self.keywords)):
        doc = Document()
        doc.add(Field("id", self.keywords[i],
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("country", self.unindexed[i],
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("contents", self.unstored[i],
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("city", self.text[i],
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
def main(cls, argv):
    if len(argv) < 2:
        print "Usage: BerkeleyDbIndexer <index dir> -create"
        return

    dbHome = argv[1]
    create = len(argv) > 2 and argv[2] == "-create"

    if not os.path.exists(dbHome):
        os.makedirs(dbHome)
    elif create:
        for name in os.listdir(dbHome):
            if name.startswith('__'):
                os.remove(os.path.join(dbHome, name))

    env = DBEnv()
    env.set_flags(DB_LOG_INMEMORY, 1)
    if os.name == 'nt':
        env.set_cachesize(0, 0x4000000, 1)
    elif os.name == 'posix':
        from commands import getstatusoutput
        if getstatusoutput('uname') == (0, 'Linux'):
            env.set_cachesize(0, 0x4000000, 1)

    env.open(dbHome, (DB_CREATE | DB_THREAD |
                      DB_INIT_MPOOL | DB_INIT_LOCK | DB_INIT_TXN), 0)

    index = DB(env)
    blocks = DB(env)
    txn = None

    try:
        txn = env.txn_begin(None)
        index.open(filename='__index__', dbtype=DB_BTREE,
                   flags=DB_CREATE | DB_THREAD, txn=txn)
        blocks.open(filename='__blocks__', dbtype=DB_BTREE,
                    flags=DB_CREATE | DB_THREAD, txn=txn)
    except:
        if txn is not None:
            txn.abort()
            txn = None
        raise
    else:
        txn.commit()
        txn = None

    try:
        txn = env.txn_begin(None)
        directory = DbDirectory(txn, index, blocks, 0)
        writer = IndexWriter(directory, StandardAnalyzer(), create,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        writer.setUseCompoundFile(False)

        doc = Document()
        doc.add(Field("contents", "The quick brown fox...",
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        # this variant commits the new documents without merging segments
        writer.commit()
        writer.close()
    except:
        if txn is not None:
            txn.abort()
            txn = None
        raise
    else:
        txn.commit()

    index.close()
    blocks.close()
    env.close()

    print "Indexing Complete"
def main(cls, argv):
    if len(argv) < 2:
        print "Usage: BerkeleyDbIndexer <index dir> -create"
        return

    dbHome = argv[1]
    create = len(argv) > 2 and argv[2] == "-create"

    if not os.path.exists(dbHome):
        os.makedirs(dbHome)
    elif create:
        for name in os.listdir(dbHome):
            if name.startswith('__'):
                os.remove(os.path.join(dbHome, name))

    env = DBEnv()
    env.set_flags(DB_LOG_INMEMORY, 1)
    if os.name == 'nt':
        env.set_cachesize(0, 0x4000000, 1)
    elif os.name == 'posix':
        from commands import getstatusoutput
        if getstatusoutput('uname') == (0, 'Linux'):
            env.set_cachesize(0, 0x4000000, 1)

    env.open(dbHome, (DB_CREATE | DB_THREAD |
                      DB_INIT_MPOOL | DB_INIT_LOCK | DB_INIT_TXN), 0)

    index = DB(env)
    blocks = DB(env)
    txn = None

    try:
        txn = env.txn_begin(None)
        index.open(filename='__index__', dbtype=DB_BTREE,
                   flags=DB_CREATE | DB_THREAD, txn=txn)
        blocks.open(filename='__blocks__', dbtype=DB_BTREE,
                    flags=DB_CREATE | DB_THREAD, txn=txn)
    except:
        if txn is not None:
            txn.abort()
            txn = None
        raise
    else:
        txn.commit()
        txn = None

    try:
        txn = env.txn_begin(None)
        directory = DbDirectory(txn, index, blocks, 0)
        writer = IndexWriter(directory, StandardAnalyzer(), create,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        writer.setUseCompoundFile(False)

        doc = Document()
        doc.add(Field("contents", "The quick brown fox...",
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        # this variant merges the index down to one segment before closing
        writer.optimize()
        writer.close()
    except:
        if txn is not None:
            txn.abort()
            txn = None
        raise
    else:
        txn.commit()

    index.close()
    blocks.close()
    env.close()

    print "Indexing Complete"
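# A hedged bootstrap for the BerkeleyDbIndexer variants above; the
# lucene.initVM() call and module guard are assumptions about how the
# sample is launched, not part of the original code.
import sys
import lucene

if __name__ == "__main__":
    lucene.initVM()
    BerkeleyDbIndexer.main(sys.argv)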
def getIndexWriter(self):
    writer = IndexWriter(RAMDirectory(), StandardAnalyzer(), True)
    writer.setUseCompoundFile(False)
    return writer
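# getIndexWriter() above and commitIndexWriter() earlier in this section are
# two halves of one pattern: stage documents in a RAMDirectory-backed writer,
# then fold them into the database-backed index via addIndexes(). A hedged
# sketch of the round trip; the method name and field layout are assumptions.
def addOneDocument(self, text):
    writer = self.getIndexWriter()
    doc = Document()
    doc.add(Field("contents", text,
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    self.commitIndexWriter(writer)   # merges the RAM segment into the db index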