def createIndex(index_dir="/Tmp/REMOVEME.index-dir", src_dir='html_files'):
    """Build a fresh Lucene index over the parsed text of every file in *src_dir*.

    Args:
        index_dir: filesystem path where the index is written
                   (default keeps the original hard-coded location).
        src_dir: directory containing the HTML files to index
                 (default keeps the original hard-coded location).

    Improvements over the original: the hard-coded paths are now keyword
    parameters with backward-compatible defaults, the builtin `dir` is no
    longer shadowed, and the dead counter `i` is removed.
    """
    # initialize lucene and jvm
    print("started indexer")
    lucene.initVM()
    # get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    # get index storage (`store`, not `dir`, to avoid shadowing the builtin)
    store = lucene.SimpleFSDirectory(lucene.File(index_dir))
    # True -> create a new index, overwriting any existing one
    writer = IndexWriter(store, analyzer, True,
                         IndexWriter.MaxFieldLength(512))
    for name in os.listdir(src_dir):
        path = os.path.join(src_dir, name)
        with open(path, 'r') as myfile:
            data = myfile.read()
        # parsehtml is a project helper; it returns (text, errors) and the
        # errors are deliberately ignored here, as in the original.
        document, errors = parsehtml(data)
        doc = Document()
        doc.add(Field("text", document, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def luceneIndexer(contents): lucene.initVM() INDEXIDR= settings.INDEX_DIR indexdir= SimpleFSDirectory(File(INDEXIDR)) analyzer= StandardAnalyzer(Version.LUCENE_30) index_writer= IndexWriter(indexdir,analyzer,True,\ IndexWriter.MaxFieldLength(512)) for tfile in contents: print"Indexing: ", tfile document= Document() content= tfile.getvalue() document.add(Field("text",content,Field.Store.YES,\ Field.Index.ANALYZED)) index_writer.addDocument(document) print"Done: ", tfile index_writer.optimize() print index_writer.numDocs() index_writer.close()
def configure_lucene(): f = open('clique.txt', 'r') lucene.initVM() print 'Inside Function' #indexDir = "/tmp/luceneindex" dir = SimpleFSDirectory(File(indexDir)) analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT) writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512)) print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs( ) print >> sys.stderr, "Reading lines from sys.stdin..." for line in f: line = line.replace('\t', '') line = line.replace('\r', '') line = line.replace('\n', '') line = line.replace('^', '') line = line.strip() doc = Document() doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED)) writer.addDocument(doc) print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % ( writer.numDocs()) print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs( ) writer.optimize() print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs( ) print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs() writer.close()
def index_files(files, index_directory):
    """Create a new Lucene index at *index_directory* and feed every entry
    of *files* through the project's parse_file() helper."""
    lucene.initVM()
    store = SimpleFSDirectory(File(index_directory))
    writer = IndexWriter(store,
                         StandardAnalyzer(Version.LUCENE_30),
                         True,  # overwrite any existing index
                         IndexWriter.MaxFieldLength(512))
    for entry in files:
        parse_file(entry, writer)
    writer.optimize()
    writer.close()
def luceneIndexer(docdir, indir): """ IndexDocuments from a directory """ lucene.initVM() DIRTOINDEX = docdir INDEXIDR = indir indexdir = SimpleFSDirectory(File(INDEXIDR)) analyzer = StandardAnalyzer(Version.LUCENE_30) index_writer= IndexWriter(indexdir,analyzer,True,\ IndexWriter.MaxFieldLength(512)) for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')): print "Indexing: ", tfile document = Document() content = open(tfile, 'r').read() document.add(Field("text",content,Field.Store.YES,\ Field.Index.ANALYZED)) index_writer.addDocument(document) print "Done: ", tfile index_writer.optimize() print index_writer.numDocs() index_writer.close()
def addDocuments(self, dir, maxFieldLength):
    """Write one document per parallel (keywords, unindexed, unstored, text)
    entry into *dir*.

    Field layout per document:
      id       - stored, not analyzed
      country  - stored only (not indexed)
      contents - indexed only (not stored)
      city     - stored and analyzed
    """
    writer = IndexWriter(dir, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength(maxFieldLength))
    rows = izip(self.keywords, self.unindexed, self.unstored, self.text)
    for keyword, unindexed, unstored, text in rows:
        doc = Document()
        # (name, value, store, index) spec for each field of this document
        field_specs = (
            ("id", keyword, Field.Store.YES, Field.Index.NOT_ANALYZED),
            ("country", unindexed, Field.Store.YES, Field.Index.NO),
            ("contents", unstored, Field.Store.NO, Field.Index.ANALYZED),
            ("city", text, Field.Store.YES, Field.Index.ANALYZED),
        )
        for fname, value, store, index in field_specs:
            doc.add(Field(fname, value, store, index))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def index(self, doc, title, department, url):
    """Append one document (title, url, department, text fields — all
    stored and analyzed) to the index at self.indir.

    The first call creates the index (self.init is True); subsequent
    calls open it in append mode because self.init is cleared here.
    """
    store = SimpleFSDirectory(File(self.indir))
    writer = IndexWriter(store,
                         StandardAnalyzer(Version.LUCENE_30),
                         self.init,
                         IndexWriter.MaxFieldLength(512))
    self.init = False

    # Initialize document and index it
    record = Document()
    for field_name, value in (("title", title),
                              ("url", url),
                              ("department", department),
                              ("text", doc)):
        record.add(Field(field_name, value, Field.Store.YES,
                         Field.Index.ANALYZED))
    writer.addDocument(record)
    writer.optimize()
    writer.close()
def luceneIndexer(docdir, indir):
    """frpFile IndexDocuments from a directory.

    para:{
        docdir: the path of the txt file
        indir: the path of the index file which is generated by the following code
    }

    Each regular file under *docdir* is indexed with its content in a
    "text" field and its path (extension removed) in a "title" field.

    Bug fixed: the original computed the title with tfile.strip('.txt'),
    which strips any of the characters '.', 't', 'x' from BOTH ends of the
    path (e.g. 'text.txt' -> 'ex'); os.path.splitext drops only the
    extension. Also fixed: the builtin `list` is no longer shadowed, the
    range(len(...)) loop is replaced by direct iteration, the file handle
    is closed, and a leftover debug print ('okokokook') is removed.
    """
    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXIDR = indir
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for name in os.listdir(DIRTOINDEX):
        tfile = os.path.join(DIRTOINDEX, name)
        if not os.path.isfile(tfile):
            continue  # skip subdirectories and other non-files
        print("Indexing: ", tfile)
        document = Document()
        with open(tfile, 'r') as fh:  # close the handle (original leaked it)
            content = fh.read()
        document.add(Field("text", content, Field.Store.YES,
                           Field.Index.ANALYZED))
        # splitext removes only the trailing extension, unlike strip('.txt')
        title = os.path.splitext(tfile)[0]
        document.add(Field("title", title, Field.Store.YES,
                           Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print("Done: ", tfile)
    index_writer.optimize()
    print(index_writer.numDocs())
    index_writer.close()
def __init__(self, indir):
    """Start the JVM and open a fresh IndexWriter on the index at *indir*,
    using the analyzer supplied by self.getAnalyzer()."""
    lucene.initVM()
    store = SimpleFSDirectory(File(indir))
    self.index_writer = IndexWriter(store,
                                    self.getAnalyzer(),
                                    True,  # create/overwrite the index
                                    IndexWriter.MaxFieldLength(512))
from common.stats import stats
import lucene
from lucene import \
    SimpleFSDirectory, System, File, \
    Document, Field, StandardAnalyzer, IndexWriter, Version

# Script entry point: opens (and overwrites) a Lucene index on disk and
# reports its current document count to stderr.
if __name__ == "__main__":
    lucene.initVM()
    # create an index called 'index-dir' in a temp directory
    # indexDir = os.path.join(System.getProperty('java.io.tmpdir', 'tmp'),
    #                         'index-dir')
    indexDir = "/Tmp/REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    # True -> recreate the index from scratch on every run
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    # # set variables that affect speed of indexing
    # writer.setMergeFactor(int(argv[2]))
    # writer.setMaxMergeDocs(int(argv[3]))
    # writer.setMaxBufferedDocs(int(argv[4]))
    # # writer.infoStream = System.out
    # # print "Merge factor:  ", writer.getMergeFactor()
    # print "Max merge docs:", writer.getMaxMergeDocs()
    # print "Max buffered docs:", writer.getMaxBufferedDocs()

    # NOTE(review): `sys` is used here but not imported in the visible
    # snippet — presumably imported elsewhere; confirm.
    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs(
    )

    # NOTE(review): the script appears truncated here — `i` is initialized
    # but the indexing loop that would use it is not visible in this chunk.
    i = 0
def indexFile(self):
    """Initialize the JVM and set up an in-memory (RAMDirectory) index
    with a fresh IndexWriter, storing all handles on self."""
    self._th = lucene.initVM()
    self._dir = RAMDirectory()
    self._analyzer = StandardAnalyzer(Version.LUCENE_36)
    self._writer = IndexWriter(self._dir,
                               self._analyzer,
                               True,  # create a new in-memory index
                               IndexWriter.MaxFieldLength(25000))
import lucene
import json
from lucene import (SimpleFSDirectory, System, File, Document, Field,
                    StandardAnalyzer, IndexWriter, IndexSearcher, QueryParser)

# Script entry point: recreates a full-text Lucene index on disk and
# builds one document per hard-coded sample line.
if __name__ == "__main__":
    lucene.initVM()
    fullIndexDir = r"c:\NLP\PhD\bob\fileDB\LuceneFullIndex"
    print("lucene version is:", lucene.VERSION)

    fullIndex = SimpleFSDirectory(File(fullIndexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    # True -> overwrite any existing index; very large MaxFieldLength so
    # long documents are not silently truncated
    writer = IndexWriter(fullIndex, analyzer, True,
                         IndexWriter.MaxFieldLength(20000000))
    ## writer = IndexWriter(store, analyzer, True, IndexWriter.MaxFieldLength(512))

    print("Currently there are %d documents in the index..." % writer.numDocs())
    ## print "Reading lines from sys.stdin..."

    # Sample corpus to index (one document per line).
    lines = [
        "bla bla bla bla bla",
        "Erase una vez que se era",
        "En un lugar de La Mancha de cuyo nombre no quiero acordarme, no ha mucho que vivia un hidalgo de los de lanza en ristre",
        "Manchame mancha mancha que te mancha la mancha"
    ]

    for l in lines:
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        # NOTE(review): the snippet appears truncated here — the built doc
        # is never passed to writer.addDocument, and the writer is never
        # optimized/closed in the visible chunk; confirm against the full file.