def addDocuments(self, dir):
    writer = IndexWriter(dir, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)

    #
    # change to adjust performance of indexing with FSDirectory
    # writer.mergeFactor = writer.mergeFactor
    # writer.maxMergeDocs = writer.maxMergeDocs
    # writer.minMergeDocs = writer.minMergeDocs
    #

    for word in self.docs:
        doc = Document()
        doc.add(Field("keyword", word,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("unindexed", word,
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("unstored", word,
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("text", word,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    writer.commit()
    writer.close()
def indexSingleFieldDocs(self, fields):
    writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)

    for field in fields:
        doc = Document()
        doc.add(field)
        writer.addDocument(doc)

    writer.commit()
    writer.close()
def index(cls, indexDir, dataDir):
    if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
        raise IOError, "%s does not exist or is not a directory" % (dataDir)

    writer = IndexWriter(indexDir, StandardAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    writer.setUseCompoundFile(False)

    numIndexed = cls.indexDirectory(writer, dataDir)

    writer.commit()
    writer.close()

    return numIndexed
def index(self):
    dirPath = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
                           "verbose-index")
    dir = FSDirectory.getDirectory(dirPath, True)
    writer = IndexWriter(dir, SimpleAnalyzer(), True)

    writer.setInfoStream(System.out)

    for i in xrange(100):
        doc = Document()
        doc.add(Field("keyword", "goober",
                      Field.Store.YES, Field.Index.UN_TOKENIZED))
        writer.addDocument(doc)

    writer.commit()
    writer.close()
def index(cls, indexDir, dataDir):
    if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
        raise IOError, "%s does not exist or is not a directory" % (dataDir)

    dir = SimpleFSDirectory(File(indexDir))
    writer = IndexWriter(dir, StandardAnalyzer(Version.LUCENE_CURRENT), True,
                         IndexWriter.MaxFieldLength.LIMITED)
    writer.setUseCompoundFile(False)

    cls.indexDirectory(writer, dataDir)

    numIndexed = writer.numDocs()
    writer.commit()
    writer.close()
    dir.close()

    return numIndexed
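# A hedged driver sketch for the classmethod above: the class name "Indexer",
# the command-line wiring, and the timing output are assumptions made for
# illustration only, not part of the original code.
if __name__ == '__main__':
    import sys, time
    import lucene
    lucene.initVM()
    start = time.time()
    numIndexed = Indexer.index(sys.argv[1], sys.argv[2])
    duration = time.time() - start
    print 'Indexing %d files took %f seconds' % (numIndexed, duration)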
def main(cls, argv):
    if len(argv) != 3:
        print "Usage: T9er <WordNet index dir> <t9 index>"
        return

    for key in cls.keys:
        c = key[0]
        k = key[1:]
        for kc in k:
            cls.keyMap[kc] = c
            print kc, "=", c

    indexDir = argv[1]
    t9dir = argv[2]

    reader = IndexReader.open(indexDir)
    numDocs = reader.maxDoc()
    print "Processing", numDocs, "words"

    writer = IndexWriter(t9dir, WhitespaceAnalyzer(), True)

    for id in xrange(reader.maxDoc()):
        origDoc = reader.document(id)
        word = origDoc.get("word")
        if word is None or len(word) == 0:
            continue

        newDoc = Document()
        newDoc.add(Field("word", word,
                         Field.Store.YES, Field.Index.UN_TOKENIZED))
        newDoc.add(Field("t9", cls.t9(word),
                         Field.Store.YES, Field.Index.UN_TOKENIZED))
        newDoc.add(Field("length", str(len(word)),
                         Field.Store.NO, Field.Index.UN_TOKENIZED))
        writer.addDocument(newDoc)

        if id % 100 == 0:
            print "Document", id

    writer.commit()
    writer.close()
    reader.close()
def __init__(self, storeDir, analyzer):
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)

    store = SimpleFSDirectory(File(storeDir))
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    self.indexDocs(writer)
    ticker = Ticker()
    print('optimizing index'),
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    ticker.tick = False
    print('done')
def __init__(self, root, storeDir, analyzer):
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)

    store = SimpleFSDirectory(File(storeDir))
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    self.indexDocs(root, writer)
    ticker = Ticker()
    print('optimizing index'),
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    ticker.tick = False
    print('done')
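# A minimal usage sketch for the constructor above. The class name
# "IndexFiles" is an assumption (it is not given in this snippet), and
# lucene.initVM() is assumed not to have been called elsewhere.
if __name__ == '__main__':
    import sys
    import lucene
    lucene.initVM()
    IndexFiles(sys.argv[1], "index", StandardAnalyzer(Version.LUCENE_CURRENT))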
def addDocuments(self, dir):
    writer = IndexWriter(dir, self.getAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    writer.setUseCompoundFile(self.isCompound())

    for i in xrange(len(self.keywords)):
        doc = Document()
        doc.add(Field("id", self.keywords[i],
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("country", self.unindexed[i],
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("contents", self.unstored[i],
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("city", self.text[i],
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    writer.commit()
    writer.close()
def createIndex(cls, dataDir, indexDir, useCompound):
    indexDir = SimpleFSDirectory(File(indexDir))
    config = IndexWriterConfig(Version.LUCENE_CURRENT,
                               StandardAnalyzer(Version.LUCENE_CURRENT))
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(indexDir, config)

    config = writer.getConfig()
    mp = config.getMergePolicy()
    if LogMergePolicy.instance_(mp):
        mp.setUseCompoundFile(useCompound)

    for dir, dirnames, filenames in os.walk(dataDir):
        for filename in filenames:
            if filename.endswith('.properties'):
                cls.indexFile(writer, os.path.join(dir, filename), dataDir)

    writer.commit()
    writer.close()
def testDeleteAfterIndexMerge(self):
    reader = IndexReader.open(self.dir, False)
    self.assertEqual(2, reader.maxDoc())
    self.assertEqual(2, reader.numDocs())
    reader.deleteDocument(1)
    reader.close()

    writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    writer.commit()
    writer.close()

    reader = IndexReader.open(self.dir, True)
    deletedDocs = MultiFields.getDeletedDocs(reader)
    self.assert_(deletedDocs is None or not deletedDocs.get(1))
    self.assert_(not reader.hasDeletions())
    self.assertEqual(1, reader.maxDoc())
    self.assertEqual(1, reader.numDocs())

    reader.close()
def indexTaxonomy(taxonomy, index_path):
    lucene.initVM()
    index_location = index_path
    dir = SimpleFSDirectory(lucene.File(index_location))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(1024))

    for i in taxonomy:
        v = taxonomy[i]
        doc = lucene.Document()
        doc.add(lucene.Field('name', v['name'],
                             lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))
        doc.add(lucene.Field('id', v['id'],
                             lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))
        doc.add(lucene.Field('alias', json.dumps(v['alias']),
                             lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))
        doc.add(lucene.Field('type', v['type'],
                             lucene.Field.Store.YES, lucene.Field.Index.NO))
        doc.add(lucene.Field('contained_by', json.dumps(v['contained_by']),
                             lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))
        doc.add(lucene.Field('images', json.dumps(v['images']),
                             lucene.Field.Store.YES, lucene.Field.Index.NO))
        writer.addDocument(doc)

    writer.commit()
    writer.close()
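# A small, hypothetical invocation of indexTaxonomy(). The dict layout is
# inferred from the fields read above; the ids, names, and index path are
# made up for illustration only.
sample_taxonomy = {
    'n1': {'name': 'Food', 'id': 'n1', 'alias': ['meals'], 'type': 'category',
           'contained_by': [], 'images': []},
    'n2': {'name': 'Fruit', 'id': 'n2', 'alias': ['fruits'], 'type': 'category',
           'contained_by': ['n1'], 'images': []},
}
indexTaxonomy(sample_taxonomy, '/tmp/taxonomy-index')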
def testUpdate(self):
    self.assertEqual(1, self.getHitCount("city", "Amsterdam"))

    reader = IndexReader.open(self.dir, False)
    reader.deleteDocuments(Term("city", "Amsterdam"))
    reader.close()

    writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    doc = Document()
    doc.add(Field("id", "1", Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("country", "Russia", Field.Store.YES, Field.Index.NO))
    doc.add(Field("contents", "St. Petersburg has lots of bridges",
                  Field.Store.NO, Field.Index.ANALYZED))
    doc.add(Field("city", "St. Petersburg",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.commit()
    writer.close()

    self.assertEqual(0, self.getHitCount("city", "Amsterdam"))
    self.assertEqual(1, self.getHitCount("city", "Petersburg"))
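# testUpdate() relies on a getHitCount() helper that is not shown in this
# snippet. One plausible shape for it (an assumption, not the original
# implementation) is a simple TermQuery count against self.dir:
def getHitCount(self, fieldName, searchString):
    searcher = IndexSearcher(self.dir, True)
    query = TermQuery(Term(fieldName, searchString))
    hitCount = len(searcher.search(query, 50).scoreDocs)
    searcher.close()
    return hitCount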
def index_plain_text_emails(data_folder, path_index_file, store_dir,
                            lemmatize=False, stem=False, nonascii=True):
    '''
    Indexes all the plain text emails in the input directory
    and stores the index in the store_dir

    Arguments:
        data_folder - input directory absolute path
        path_index_file - file paths index file
        store_dir - index store directory absolute path

    Returns:
        None
    '''
    if not os.path.exists(store_dir):
        os.mkdir(store_dir)

    if os.path.exists(path_index_file):
        logging.info('Loading file paths index...')
        file_tuples = load_file_paths_index(path_index_file)
        logging.info('%d files found in the file paths index.' % len(file_tuples))
    else:
        logging.info('Loading files in the data folder %s...' % data_folder)
        file_tuples = get_file_paths_index(data_folder)
        logging.info('%d email documents found.' % len(file_tuples))
        store_file_paths_index(path_index_file, file_tuples)
        logging.info('File paths index is stored into %s' % path_index_file)

    logging.info('Lucene: Stem = %s, Lemmatize = %s, Number of documents = %d'
                 % (stem, lemmatize, len(file_tuples)))

    store = SimpleFSDirectory(File(store_dir))
    writer = IndexWriter(store, STD_ANALYZER, True,
                         IndexWriter.MaxFieldLength.LIMITED)

    print 'Lucene:', len(file_tuples), 'files found in %s.' % data_folder
    print 'Lucene: Stem =', stem, 'Lemmatize =', lemmatize, 'Allow non-ASCII =', nonascii

    for ft in file_tuples:
        idx, root, file_name = ft
        file_path = os.path.join(root, file_name)
        logging.info("[%d] file: %s - adding to Lucene index.", idx, file_name)

        # parses the emails in plain text format
        receiver, sender, cc, subject, message_text, bcc, date, email_text = \
            parse_plain_text_email(file_path, tokenize=True,
                                   lemmatize=lemmatize, stem=stem,
                                   nonascii=nonascii)

        doc = Document()
        doc.add(Field(MetadataType.FILE_ID, str(idx),
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field(MetadataType.FILE_NAME, file_name,
                      Field.Store.YES, Field.Index.NOT_ANALYZED,
                      Field.TermVector.YES))
        doc.add(Field(MetadataType.FILE_PATH, file_path,
                      Field.Store.YES, Field.Index.ANALYZED,
                      Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field(MetadataType.EMAIL_RECEIVER, receiver,
                      Field.Store.YES, Field.Index.ANALYZED,
                      Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field(MetadataType.EMAIL_SENDER, sender,
                      Field.Store.YES, Field.Index.ANALYZED,
                      Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field(MetadataType.EMAIL_CC, cc,
                      Field.Store.YES, Field.Index.ANALYZED,
                      Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field(MetadataType.EMAIL_SUBJECT, subject,
                      Field.Store.YES, Field.Index.ANALYZED,
                      Field.TermVector.WITH_POSITIONS_OFFSETS))
        # Subodh-Rahul - Added BCC field in indexing.
        doc.add(Field(MetadataType.EMAIL_BCC, bcc,
                      Field.Store.YES, Field.Index.ANALYZED,
                      Field.TermVector.WITH_POSITIONS_OFFSETS))
        # Subodh-Rahul - Added Email-Date field in indexing
        doc.add(Field(MetadataType.EMAIL_DATE, date,
                      Field.Store.YES, Field.Index.ANALYZED,
                      Field.TermVector.WITH_POSITIONS_OFFSETS))

        if len(message_text) > 0:
            doc.add(Field(MetadataType.EMAIL_BODY, message_text,
                          Field.Store.YES, Field.Index.ANALYZED,
                          Field.TermVector.YES))
        else:
            logging.error("[%d] file: %s - body text is empty.", idx, file_name)

        # Adds all document fields as a separate index so that we can search through them
        doc.add(Field(MetadataType.ALL, email_text,
                      Field.Store.YES, Field.Index.ANALYZED,
                      Field.TermVector.YES))

        writer.addDocument(doc)
        logging.info("[%d] file: %s - added to Lucene index.", idx, file_name)

    writer.commit()
    writer.close()
    logging.info('Lucene: All files are indexed.')
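# A hedged usage sketch for index_plain_text_emails(): all three paths are
# placeholders, and lucene.initVM() is assumed to have been called already.
index_plain_text_emails('/data/enron/plain_text',
                        '/data/enron/file_paths.idx',
                        '/data/enron/lucene-index',
                        lemmatize=False, stem=False, nonascii=True)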
def main(cls, argv):
    if len(argv) < 2:
        print "Usage: BerkeleyDbIndexer <index dir> -create"
        return

    dbHome = argv[1]
    create = len(argv) > 2 and argv[2] == "-create"

    if not os.path.exists(dbHome):
        os.makedirs(dbHome)
    elif create:
        for name in os.listdir(dbHome):
            if name.startswith('__'):
                os.remove(os.path.join(dbHome, name))

    env = DBEnv()
    env.set_flags(DB_LOG_INMEMORY, 1)
    if os.name == 'nt':
        env.set_cachesize(0, 0x4000000, 1)
    elif os.name == 'posix':
        from commands import getstatusoutput
        if getstatusoutput('uname') == (0, 'Linux'):
            env.set_cachesize(0, 0x4000000, 1)

    env.open(dbHome, (DB_CREATE | DB_THREAD |
                      DB_INIT_MPOOL | DB_INIT_LOCK | DB_INIT_TXN), 0)

    index = DB(env)
    blocks = DB(env)
    txn = None

    try:
        txn = env.txn_begin(None)
        index.open(filename='__index__', dbtype=DB_BTREE,
                   flags=DB_CREATE | DB_THREAD, txn=txn)
        blocks.open(filename='__blocks__', dbtype=DB_BTREE,
                    flags=DB_CREATE | DB_THREAD, txn=txn)
    except:
        if txn is not None:
            txn.abort()
            txn = None
        raise
    else:
        txn.commit()
        txn = None

    try:
        txn = env.txn_begin(None)
        directory = DbDirectory(txn, index, blocks, 0)
        writer = IndexWriter(directory, StandardAnalyzer(), create,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        writer.setUseCompoundFile(False)

        doc = Document()
        doc.add(Field("contents", "The quick brown fox...",
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        writer.commit()
        writer.close()
    except:
        if txn is not None:
            txn.abort()
            txn = None
        raise
    else:
        txn.commit()
        index.close()
        blocks.close()
        env.close()

    print "Indexing Complete"
def index(cls, indexDir, taxoDir):
    """Create an index and add to it sample documents and facets.

    indexDir Directory in which the index should be created.
    taxoDir  Directory in which the taxonomy index should be created.
    """
    # create and open an index writer
    ver = lucene.Version.LUCENE_35
    config = IndexWriterConfig(ver, WhitespaceAnalyzer(ver))
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    iw = IndexWriter(indexDir, config)

    # create and open a taxonomy writer
    taxo = DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE)

    # loop over sample documents
    nDocsAdded = 0
    nFacetsAdded = 0
    for docNum in range(len(docTexts)):
        # obtain the sample facets for the current document
        facets = categories[docNum]
        facetList = [createCategoryPath(f) for f in facets]

        # NOTE: setCategoryPaths() requires an Iterable, so we need to convert
        # the Python list in order to pass a proper argument to setCategoryPaths.
        # We use java.util.Arrays (via JCC) to create a Java List:
        # http://docs.oracle.com/javase/1.5.0/docs/api/java/util/Arrays.html#asList(T...)
        facetList = lucene.Arrays.asList(facetList)

        # NOTE: we could use lucene.collections here as well in order to convert our
        # Python list to a Java based list using the JavaList class (JavaList implements
        # java.util.List around a Python list instance it wraps):
        #   from lucene.collections import JavaList
        #   facetList = JavaList(facetList)

        # we do not alter indexing parameters

        # a category document builder will add the categories to a document
        # once build() is called
        categoryDocBuilder = CategoryDocumentBuilder(taxo).setCategoryPaths(facetList)

        # create a plain Lucene document and add some regular Lucene fields to it
        doc = Document()
        doc.add(Field(TITLE, docTitles[docNum], Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field(TEXT, docTexts[docNum], Field.Store.NO, Field.Index.ANALYZED))

        # invoke the category document builder for adding categories to the
        # document and, as required, to the taxonomy index
        categoryDocBuilder.build(doc)

        # finally add the document to the index
        iw.addDocument(doc)

        nDocsAdded += 1
        nFacetsAdded += facetList.size()
    # end for

    # commit changes.
    # we commit changes to the taxonomy index prior to committing them to the
    # search index. this is important, so that all facets referred to by
    # documents in the search index will indeed exist in the taxonomy index.
    taxo.commit()
    iw.commit()

    # close the taxonomy index and the index - all modifications are
    # now safely in the provided directories: indexDir and taxoDir.
    taxo.close()
    iw.close()

    print "Indexed %d documents with overall %d facets." % (nDocsAdded, nFacetsAdded)
if __name__ == '__main__':
    initVM()
    directory = RAMDirectory()
    iwriter = IndexWriter(directory, StandardAnalyzer(Version.LUCENE_CURRENT),
                          True, IndexWriter.MaxFieldLength.LIMITED)

    ts = ["this bernhard is the text to be index text",
          "this claudia is the text to be index"]
    for t in ts:
        doc = Document()
        doc.add(Field("fieldname", t,
                      Field.Store.YES, Field.Index.ANALYZED,
                      Field.TermVector.WITH_POSITIONS_OFFSETS))
        iwriter.addDocument(doc)
    iwriter.commit()
    iwriter.close()

    ireader = IndexReader.open(directory, True)
    tpv = TermPositionVector.cast_(ireader.getTermFreqVector(0, 'fieldname'))

    for (t, f, i) in zip(tpv.getTerms(), tpv.getTermFrequencies(), xrange(100000)):
        print 'term %s' % t
        print '  freq: %i' % f
        try:
            print '  pos: ' + str([p for p in tpv.getTermPositions(i)])
        except:
            print '  no pos'
        try:
            print '  off: ' + \
                  str(["%i-%i" % (o.getStartOffset(), o.getEndOffset())
                       for o in tpv.getOffsets(i)])
        except:
            print '  no off'
    doc.add(Field("synopsis", synopsis.strip(),
                  Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("keywords", ' '.join((command, name, synopsis, description)),
                  Field.Store.NO, Field.Index.ANALYZED))
    doc.add(Field("filename", os.path.abspath(path),
                  Field.Store.YES, Field.Index.NOT_ANALYZED))
    writer.addDocument(doc)

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print "Usage: python manindex.py <index dir>"
    else:
        initVM()
        indexDir = sys.argv[1]
        writer = IndexWriter(SimpleFSDirectory(File(indexDir)),
                             StandardAnalyzer(Version.LUCENE_CURRENT), True,
                             IndexWriter.MaxFieldLength.LIMITED)
        manpath = os.environ.get('MANPATH', '/usr/share/man').split(os.pathsep)
        for dir in manpath:
            print "Crawling", dir
            for name in os.listdir(dir):
                path = os.path.join(dir, name)
                if os.path.isdir(path):
                    indexDirectory(path)
        writer.commit()
        writer.close()