def reindex(self):
    writer = IndexWriter(SimpleFSDirectory(File(self.corpus.path)),
                         self.corpus.analyzer, False,
                         IndexWriter.MaxFieldLength.LIMITED)
    indexutils.reindex_all(self.reader, writer, self.corpus.analyzer)
    writer.optimize()
    writer.close()
    self.parent.write({'message':
        "Reindex successful. Corpus analyzer is now set to %s."
        % (self.corpus.analyzer_str,)})
    self.parent.write({'status': "Ready!"})
def removeindex(self, data):
    # Delete the record's document from the index by its exact _id term.
    writer = IndexWriter(self.d, self.conf)
    writer.deleteDocuments(lucene.Term("_id", data['record']['_id']))
    writer.optimize()
    writer.close()
def updateindex(self, data):
    # Rebuild the document and atomically replace any existing document
    # that shares the same _id term.
    writer = IndexWriter(self.d, self.conf)
    doc = self.buildDocument(data['fields'], data['record'])
    writer.updateDocument(lucene.Term("_id", data['record']['_id']), doc)
    writer.optimize()
    writer.close()
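# buildDocument is used above but not shown. A minimal sketch of one
# plausible implementation, assuming `fields` is a list of record keys
# to index (an assumption, not the original code):
def buildDocument(self, fields, record):
    doc = lucene.Document()
    # Store _id untokenized so the exact-match Term queries used by
    # removeindex/updateindex can find it.
    doc.add(lucene.Field("_id", record['_id'],
                         lucene.Field.Store.YES,
                         lucene.Field.Index.NOT_ANALYZED))
    for name in fields:
        doc.add(lucene.Field(name, record.get(name, ''),
                             lucene.Field.Store.YES,
                             lucene.Field.Index.ANALYZED))
    return doc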
def index(self):
    if not (os.path.exists(self._dataDir) and os.path.isdir(self._dataDir)):
        raise IOError("%s does not exist or is not a directory" % self._dataDir)
    # Create (or overwrite) the index, then add one document per file.
    directory = SimpleFSDirectory(File(self._indexDir))
    writer = IndexWriter(directory, StandardAnalyzer(), True,
                         IndexWriter.MaxFieldLength.LIMITED)
    writer.setUseCompoundFile(False)
    self.indexDirectory(writer, self._dataDir)
    numIndexed = writer.numDocs()
    writer.optimize()
    writer.close()
    directory.close()
    return numIndexed
def xmlrpc_indexDocument(self, instance, id, text):
    """Index a new document."""
    # Drop any previously indexed copy so the id stays unique.
    self.xmlrpc_unindexDocument(instance, id)
    # Create a document: id and instance are stored untokenized for
    # exact lookups; the text is tokenized for full-text search.
    doc = Document()
    doc.add(Field('id', id, Field.Store.YES, Field.Index.UN_TOKENIZED))
    doc.add(Field('text', text, Field.Store.YES, Field.Index.TOKENIZED))
    doc.add(Field('instance', instance, Field.Store.YES,
                  Field.Index.UN_TOKENIZED))
    # Write the document into the index.
    writer = IndexWriter(self.indexPath, self.analyzer, 0)
    writer.addDocument(doc)
    writer.optimize()
    writer.close()
    log('Insert: Instance: %s Document: %s' % (instance, id))
    return 1
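# xmlrpc_unindexDocument is called above but not shown. A minimal
# sketch of a plausible implementation, assuming ids are unique across
# instances and the same Lucene generation as the writer call above
# (older releases spell the call IndexReader.delete(Term)):
def xmlrpc_unindexDocument(self, instance, id):
    """Remove a document from the index, if present."""
    reader = IndexReader.open(self.indexPath)
    reader.deleteDocuments(Term('id', id))
    reader.close()
    log('Delete: Instance: %s Document: %s' % (instance, id))
    return 1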
def _init_index(self):
    if not os.path.exists(self.corpus.path):
        os.mkdir(self.corpus.path)
    try:
        # Probe for an existing index.
        searcher = IndexSearcher(SimpleFSDirectory(File(self.corpus.path)), True)
    except lucene.JavaError:
        # No usable index yet: create an empty one.
        analyzer = self.corpus.analyzer
        writer = IndexWriter(SimpleFSDirectory(File(self.corpus.path)),
                             analyzer, True,
                             IndexWriter.MaxFieldLength.LIMITED)
        writer.setMaxFieldLength(1048576)
        writer.optimize()
        writer.close()
    self.lucene_index = SimpleFSDirectory(File(self.corpus.path))
    self.searcher = IndexSearcher(self.lucene_index, True)
    self.reader = IndexReader.open(self.lucene_index, True)
    self.analyzer = self.corpus.analyzer
class FileIndexer:

    def __init__(self, dataDir, fileExtension, indexDir):
        # The file list must be an instance attribute, not a class
        # attribute, or it would be shared across all FileIndexers.
        self.__fileList = []
        os.path.walk(dataDir, self.__fileSearcher, fileExtension)
        self.__writer = IndexWriter(indexDir, StandardAnalyzer(), True)
        self.__writer.setUseCompoundFile(False)

    def __fileSearcher(self, fileExtension, dirname, filenames):
        # os.path.walk callback: collect files whose extension matches.
        for filename in filenames:
            if filename.split('.')[-1] == fileExtension:
                self.__fileList.append(os.path.join(dirname, filename))

    def getNames(self):
        return self.__fileList

    def indexFiles(self):
        for filename in self.__fileList:
            print filename
            print File(filename).getCanonicalPath()
            doc = Document()
            doc.add(Field("contents", open(filename, 'r').read(),
                          Field.Store.YES, Field.Index.TOKENIZED))
            doc.add(Field("path", File(filename).getCanonicalPath(),
                          Field.Store.YES, Field.Index.UN_TOKENIZED))
            self.__writer.addDocument(doc)

    def optimizeAndClose(self):
        docCount = self.__writer.docCount()
        self.__writer.optimize()
        self.__writer.close()
        return docCount
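# A minimal usage sketch for FileIndexer; the directory paths and the
# 'txt' extension are placeholders, not from the original:
indexer = FileIndexer('/path/to/docs', 'txt', '/path/to/index')
indexer.indexFiles()
print '%d documents indexed.' % indexer.optimizeAndClose()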
# update an existing index (adding a new document).
from java.net import URL
from java.util import Date
# from de.nava.informa.impl.basic import ChannelBuilder
# from de.nava.informa.utils import ChannelRegistry
from de.nava.informa.impl.basic import Item
from de.nava.informa.search import ItemDocument
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexWriter

# Open the index in directory "index" for updating; the third
# argument, 0, means "append to the existing index, don't create".
start_time = Date()
writer = IndexWriter("index", StandardAnalyzer(), 0)

# create a new (dummy) item
item = Item("Informa released", "blubb",
            URL("http://nava.de/news/2002/06/25"))
item.setFound(Date())

# add the new item to the index
writer.addDocument(ItemDocument.makeDocument(item))
writer.optimize()
writer.close()

end_time = Date()
print "updating the index took %d milliseconds in total." \
      % (end_time.getTime() - start_time.getTime())
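# A quick way to sanity-check the update is to search the same index.
# A minimal sketch; the field name "title" is an assumption about how
# ItemDocument stores items -- verify it against your Informa version:
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.queryParser import QueryParser

searcher = IndexSearcher("index")
query = QueryParser.parse("Informa", "title", StandardAnalyzer())
hits = searcher.search(query)
print "found %d matching item(s)." % hits.length()
searcher.close()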