def configure_lucene():
    f = open('clique.txt', 'r')
    lucene.initVM()
    print 'Inside Function'
    indexDir = "/tmp/luceneindex"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()
    print >> sys.stderr, "Reading lines from sys.stdin..."
    for line in f:
        line = line.replace('\t', '')
        line = line.replace('\r', '')
        line = line.replace('\n', '')
        line = line.replace('^', '')
        line = line.strip()
        doc = Document()
        doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % writer.numDocs()
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
def index(self):
    lucene.initVM()
    indexdir = SimpleFSDirectory(File(self.INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    # read input files (.xml)
    for in_file in glob.glob(os.path.join(self.DOC_DIR, '*.xml')):
        corpus = codecs.open(in_file, encoding='utf-8').read()
        d = pq(corpus, parser='html')
        for text in d('Article').items():
            document = Document()
            # find ID
            art_id = str(text.attr('articleid').encode('utf-8')).replace('+', '-')
            # find Title
            art_title = self.stem(str(text.attr('title').encode('utf-8')))
            # find Abstract
            art_abstract = self.stem(str(text.find('Abstract').html().encode('utf-8')))
            # find Keyword
            art_keyword = text.find('Keyword').html().encode('utf-8')
            # find Content
            art_content = self.stem(str(text.find('Content').html().encode('utf-8')))
            # find Authors
            art_authors = text.find('Authors').html().encode('utf-8')
            document.add(Field('id', art_id, Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('title', art_title, Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('abstract', art_abstract, Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('keyword', art_keyword, Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('content', art_content, Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('authors', art_authors, Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('article', art_title + art_abstract + art_keyword + art_content,
                               Field.Store.YES, Field.Index.ANALYZED))
            index_writer.addDocument(document)
    index_writer.optimize()
    index_writer.close()
def index(string):
    lucene.initVM()
    indexDir = "REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    try:
        # append to an existing index
        writer = IndexWriter(dir, analyzer, False, IndexWriter.MaxFieldLength(512))
    except lucene.JavaError:
        # no index exists yet, so create one
        writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
    doc = Document()
    doc.add(Field("text", string, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.optimize()
    writer.close()
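# A sketch of an alternative to the try/except above: probe the directory first
# instead of catching the JavaError. Assumes IndexReader.indexExists (present in
# Lucene 3.x) is imported; not part of the original code.
def open_writer(dir, analyzer):
    # create a new index only when none exists yet, otherwise append
    create = not IndexReader.indexExists(dir)
    return IndexWriter(dir, analyzer, create, IndexWriter.MaxFieldLength(512))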
def main(cls, argv):
    if len(argv) < 5:
        print "Usage: python IndexTuningDemo.py <numDocs> <mergeFactor> <maxMergeDocs> <maxBufferedDocs>"
        return
    docsInIndex = int(argv[1])
    # create an index called 'index-dir' in a temp directory
    indexDir = os.path.join(tempfile.gettempdir(), 'index-dir')
    dir = FSDirectory.open(File(indexDir))
    analyzer = SimpleAnalyzer()
    writer = IndexWriter(dir, analyzer, True)
    # set variables that affect speed of indexing
    writer.setMergeFactor(int(argv[2]))
    writer.setMaxMergeDocs(int(argv[3]))
    writer.setMaxBufferedDocs(int(argv[4]))
    # writer.infoStream = tempfile.out
    print "Merge factor:  ", writer.getMergeFactor()
    print "Max merge docs:", writer.getMaxMergeDocs()
    print "Max buffered docs:", writer.getMaxBufferedDocs()
    start = time()
    for i in xrange(docsInIndex):
        doc = Document()
        doc.add(Field("fieldname", "Bibamus", Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.close()
    print "Time: ", timedelta(seconds=time() - start)
def indexDocuments():
    # empty the index directory
    indexDir = Wikipedia.directory + 'index/'
    for filename in os.listdir(indexDir):
        os.remove(indexDir + filename)
    # index documents
    lucene.initVM()
    version = Version.LUCENE_CURRENT
    analyzer = EnglishAnalyzer(version)
    writer = IndexWriter(SimpleFSDirectory(File(indexDir)), analyzer, True,
                         IndexWriter.MaxFieldLength.LIMITED)
    for article in Wikipedia():
        doc = Document()
        doc.add(Field('id', str(article['id'][0]), Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('title', article['url'], Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('content', article['text'], Field.Store.NO, Field.Index.ANALYZED))
        writer.addDocument(doc)
    print 'Optimization'
    writer.optimize()
    writer.close()
def main(cls, argv):
    if len(argv) < 5:
        print "Usage: python IndexTuningDemo.py <numDocs> <mergeFactor> <maxMergeDocs> <maxBufferedDocs>"
        return
    docsInIndex = int(argv[1])
    # create an index called 'index-dir' in a temp directory
    indexDir = os.path.join(System.getProperty('java.io.tmpdir', 'tmp'), 'index-dir')
    dir = FSDirectory.getDirectory(indexDir, True)
    analyzer = SimpleAnalyzer()
    writer = IndexWriter(dir, analyzer, True)
    # set variables that affect speed of indexing
    writer.setMergeFactor(int(argv[2]))
    writer.setMaxMergeDocs(int(argv[3]))
    writer.setMaxBufferedDocs(int(argv[4]))
    # writer.infoStream = System.out
    print "Merge factor:  ", writer.getMergeFactor()
    print "Max merge docs:", writer.getMaxMergeDocs()
    print "Max buffered docs:", writer.getMaxBufferedDocs()
    start = time()
    for i in xrange(docsInIndex):
        doc = Document()
        doc.add(Field("fieldname", "Bibamus", Field.Store.YES, Field.Index.TOKENIZED))
        writer.addDocument(doc)
    writer.close()
    print "Time: ", timedelta(seconds=time() - start)
def setUp(self):
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    # Elwood
    document = Document()
    document.add(Field("owner", "elwood", Field.Store.YES, Field.Index.NOT_ANALYZED))
    document.add(Field("keywords", "elwoods sensitive info", Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(document)
    # Jake
    document = Document()
    document.add(Field("owner", "jake", Field.Store.YES, Field.Index.NOT_ANALYZED))
    document.add(Field("keywords", "jakes sensitive info", Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(document)
    writer.close()
def index(source, indexName):
    if not os.path.exists(indexName):
        os.mkdir(indexName)
    writer = IndexWriter(SimpleFSDirectory(File(indexName)),
                         StandardAnalyzer(Version.LUCENE_CURRENT),
                         True, IndexWriter.MaxFieldLength.LIMITED)
    p = re.compile(r"(GH\d+\-\d+)\n(.*?)\n+", re.DOTALL)
    res = p.findall(source)
    i = 0
    for pair in res:
        i += 1
        doc = Document()
        doc.add(Field("id", pair[0], Field.Store.YES, Field.Index.NO))
        for t in pair[1].split():
            doc.add(Field("content", t.replace("-", "_"), Field.Store.NO, Field.Index.NOT_ANALYZED))
        #doc.add(Field("content", pair[1], Field.Store.NO, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.close()
    print str(i) + " docs indexed"
def luceneIndexer(contents):
    lucene.initVM()
    INDEXIDR = settings.INDEX_DIR
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in contents:
        print "Indexing: ", tfile
        document = Document()
        content = tfile.getvalue()
        document.add(Field("text", content, Field.Store.YES, Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
def testUpdate(self):
    self.assertEqual(1, self.getHitCount("city", "Amsterdam"))
    reader = IndexReader.open(self.dir, False)
    reader.deleteDocuments(Term("city", "Amsterdam"))
    reader.close()
    writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    doc = Document()
    doc.add(Field("id", "1", Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("country", "Russia", Field.Store.YES, Field.Index.NO))
    doc.add(Field("contents", "St. Petersburg has lots of bridges",
                  Field.Store.NO, Field.Index.ANALYZED))
    doc.add(Field("city", "St. Petersburg", Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.optimize()
    writer.close()
    self.assertEqual(0, self.getHitCount("city", "Amsterdam"))
    self.assertEqual(1, self.getHitCount("city", "Petersburg"))
def index(self, path_to_index, path_files):
    'indexes anchor texts from a given folder'
    #lucene.initVM()
    indexDir = path_to_index
    directory_index = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_35)
    writer = IndexWriter(directory_index, analyzer, True,
                         IndexWriter.MaxFieldLength(512))
    listOfPathes = []
    listOfPathes.extend(glob.glob(path_files + "*.txt"))
    counter = 0
    for path_to_file in listOfPathes:
        print path_to_file
        f = open(path_to_file, "r")
        for line in f:
            entry = line.split("\t")
            counter += 1
            # optimize the index after a certain amount of added documents
            if counter % 500000 == 0:
                print counter
                writer.optimize()
            doc = Document()
            doc.add(Field("anchor", entry[0], Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("anchor_uri", entry[1], Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("dbpedia_uri", entry[2], Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("number", entry[3].replace("\n", ""), Field.Store.YES, Field.Index.ANALYZED))
            writer.addDocument(doc)
        writer.optimize()
        f.close()
    writer.close()
    print counter
    print "done"
def addDocuments(self, dir):
    writer = IndexWriter(dir, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    #
    # change to adjust performance of indexing with FSDirectory
    # writer.mergeFactor = writer.mergeFactor
    # writer.maxMergeDocs = writer.maxMergeDocs
    # writer.minMergeDocs = writer.minMergeDocs
    #
    for word in self.docs:
        doc = Document()
        doc.add(Field("keyword", word, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("unindexed", word, Field.Store.YES, Field.Index.NO))
        doc.add(Field("unstored", word, Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("text", word, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
class BuildIndex:

    def __init__(self, indir):
        lucene.initVM()
        indexdir = SimpleFSDirectory(File(indir))
        self.index_writer = IndexWriter(indexdir, self.getAnalyzer(), True,
                                        IndexWriter.MaxFieldLength(512))

    def getAnalyzer(self):
        return ChineseAnalyzer(lucene.Version.LUCENE_CURRENT)

    def addDocuments(self, _id, title, content):
        doc = Document()
        doc.add(Field("id", _id, Field.Store.YES, Field.Index.NOT_ANALYZED))
        if title is not None and len(title) > 0:
            doc.add(Field("titleKeyword", title, Field.Store.NO, Field.Index.ANALYZED))
        if content is not None and len(content) > 0:
            doc.add(Field("contentKeyword", content, Field.Store.NO, Field.Index.ANALYZED))
        self.index_writer.addDocument(doc)

    def close(self):
        self.index_writer.optimize()
        self.index_writer.close()
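# A minimal usage sketch for the BuildIndex class above; the path and field
# values are placeholders, not from the original source.
def build_example_index():
    builder = BuildIndex('/tmp/chinese-index')
    builder.addDocuments('doc-1', u'example title', u'example content')
    builder.close()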
def setUp(self):
    animals = ["aardvark", "beaver", "coati", "dog", "elephant", "frog",
               "gila monster", "horse", "iguana", "javelina", "kangaroo",
               "lemur", "moose", "nematode", "orca", "python", "quokka",
               "rat", "scorpion", "tarantula", "uromastyx", "vicuna",
               "walrus", "xiphias", "yak", "zebra"]
    analyzer = WhitespaceAnalyzer()
    aTOmDirectory = RAMDirectory()
    nTOzDirectory = RAMDirectory()
    aTOmWriter = IndexWriter(aTOmDirectory, analyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
    nTOzWriter = IndexWriter(nTOzDirectory, analyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
    for animal in animals:
        doc = Document()
        doc.add(Field("animal", animal, Field.Store.YES, Field.Index.NOT_ANALYZED))
        if animal[0].lower() < "n":
            aTOmWriter.addDocument(doc)
        else:
            nTOzWriter.addDocument(doc)
    aTOmWriter.close()
    nTOzWriter.close()
    self.searchers = [IndexSearcher(aTOmDirectory),
                      IndexSearcher(nTOzDirectory)]
def addDocuments(self, dir, isCompound):
    writer = IndexWriter(dir, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.LIMITED)
    writer.setUseCompoundFile(isCompound)
    # change to adjust performance of indexing with FSDirectory
    # writer.mergeFactor = writer.mergeFactor
    # writer.maxMergeDocs = writer.maxMergeDocs
    # writer.minMergeDocs = writer.minMergeDocs
    for word in self.docs:
        doc = Document()
        doc.add(Field("keyword", word, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("unindexed", word, Field.Store.YES, Field.Index.NO))
        doc.add(Field("unstored", word, Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("text", word, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def update_index_withLineArray(self, array):
    """
    Parsed sentences (given in an array) are added to the index, with the
    corresponding two entities (x, y) and the DBpedia URI
    """
    print "start adding sentences"
    writer = IndexWriter(index_directory, analyzer, False,
                         IndexWriter.MaxFieldLength(512))
    for item in array:
        line = item[0]
        x = item[1]
        y = item[2]
        uri = item[3]
        line = line.replace("\t", " ")
        line = line.replace("\n", " ")
        line = line.replace("  ", " ")
        try:
            doc = Document()
            doc.add(Field("Sentence", line, Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("X", x, Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("Y", y, Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("URI", uri, Field.Store.YES, Field.Index.ANALYZED))
            writer.addDocument(doc)
        except Exception:
            print "Unexpected error:", sys.exc_info()[0]
            raw_input("Error in updating the Sentences")
    try:
        writer.optimize()
    except:
        print "Unexpected error:", sys.exc_info()[0]
        print "could not optimize index"
    writer.close()
    print "all sentences added"
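# update_index_withLineArray (and update_index_withLine further below) assume
# module-level index_directory and analyzer objects. A sketch of that assumed
# setup, reusing the Version.LUCENE_35 analyzer seen elsewhere in this
# collection; the path is a placeholder.
index_directory = SimpleFSDirectory(File('/path/to/sentence-index'))
analyzer = StandardAnalyzer(Version.LUCENE_35)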
def setUp(self):
    self.directory = RAMDirectory()
    self.analyzer = WhitespaceAnalyzer()
    writer = IndexWriter(self.directory, self.analyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    doc = Document()
    doc.add(Field("f", "the quick brown fox jumps over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    doc = Document()
    doc.add(Field("f", "the quick red fox jumps over the sleepy cat",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()
    self.searcher = IndexSearcher(self.directory, True)
    self.reader = IndexReader.open(self.directory, True)
    self.quick = SpanTermQuery(Term("f", "quick"))
    self.brown = SpanTermQuery(Term("f", "brown"))
    self.red = SpanTermQuery(Term("f", "red"))
    self.fox = SpanTermQuery(Term("f", "fox"))
    self.lazy = SpanTermQuery(Term("f", "lazy"))
    self.sleepy = SpanTermQuery(Term("f", "sleepy"))
    self.dog = SpanTermQuery(Term("f", "dog"))
    self.cat = SpanTermQuery(Term("f", "cat"))
def do_index():
    initVM()
    indexDir = "/home/william/woyaoo/luceneindex"
    version = Version.LUCENE_CURRENT
    standardAnalyzer = StandardAnalyzer(version)
    # chineseAnalyzer = CJKAnalyzer(version)
    engine = data.engine_from_config("indexdb.config")
    # engine = data.engine_from_config()
    db = data.init_datafactory(engine)
    docs = dbfactory.Session().query(doc_model.Doc).filter(
        doc_model.Doc.dateCreated > "20121220").all()
    print len(docs)
    idxDir = SimpleFSDirectory(File(indexDir))
    perIndexCount = 5000
    writer = IndexWriter(idxDir, standardAnalyzer, True,
                         IndexWriter.MaxFieldLength(512))
    # add fields
    for doc in docs:
        # print repr(doc.description)
        lucenedoc = Document()
        descriptionValue = doc.description.strip("\r\n").encode("UTF-8")
        # descriptionValue = '中国 abc'
        print repr(descriptionValue)
        lucenedoc.add(Field("url", doc.url, Field.Store.YES, Field.Index.NOT_ANALYZED))
        lucenedoc.add(Field("intent", doc.intent, Field.Store.YES, Field.Index.NOT_ANALYZED))
        # lucenedoc.add(Field('description', doc.description, Field.Store.YES, Field.Index.ANALYZED))
        lucenedoc.add(Field("description", descriptionValue, Field.Store.YES, Field.Index.ANALYZED))
        lucenedoc.add(Field("title", doc.title, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(lucenedoc)
    writer.optimize()
    writer.close()
    print "index finished"
def createIndex():
    # initialize lucene and the jvm
    print("started indexer")
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"
    # get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    # get index storage
    dir = lucene.SimpleFSDirectory(lucene.File(indexDir))
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
    src_dir = 'html_files'
    i = 0
    for l in os.listdir(src_dir):
        l = os.path.join(src_dir, l)
        with open(l, 'r') as myfile:
            data = myfile.read()
        i += 1
        document, errors = parsehtml(data)
        doc = Document()
        doc.add(Field("text", document, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def addContents(self, contents):
    try:
        #iwconfig = IndexWriterConfig(SimpleAnalyzer(), IndexWriter.MaxFieldLength.LIMITED)
        writer = IndexWriter(self.ramIndex, SimpleAnalyzer(Version.LUCENE_CURRENT),
                             True, IndexWriter.MaxFieldLength.LIMITED)
        for content in contents:
            doc = Document()
            doc.add(Field("contents", content[1], Field.Store.NO,
                          Field.Index.ANALYZED, Field.TermVector.YES))
            writer.addDocument(doc)
        writer.close()
    except Exception, e:
        print 'Unable to add content to RAM index:', e
def setUp(self):
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, self.porterAnalyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    doc = Document()
    doc.add(Field("contents", "The quick brown fox jumps over the lazy dogs",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()
def indexSingleFieldDocs(self, fields):
    writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    for field in fields:
        doc = Document()
        doc.add(field)
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def indexSingleFieldDocs(self, fields):
    writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    for field in fields:
        doc = Document()
        doc.add(field)
        writer.addDocument(doc)
    writer.commit()
    writer.close()
def setUp(self):
    # set up a sample document
    directory = RAMDirectory()
    writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    doc = Document()
    doc.add(Field("field", "the quick brown fox jumped over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()
    self.searcher = IndexSearcher(directory)
def luceneIndexer(docdir, indir):
    """
    Index documents from a directory.
    Args:
        docdir: directory containing the documents
        indir: directory in which the index is stored
    Returns:
        nothing
    Notes:
        FieldType().setStored = the as-is value is stored in the Lucene index
        FieldType().setTokenized = the field is analyzed using the specified
            Analyzer; the tokens emitted are indexed
        FieldType().Indexed = the text (either as-is with keyword fields, or
            the tokens from tokenized fields) is made searchable (aka inverted)
        FieldType().Vectored = term frequency per document is stored in the
            index in an easily retrievable fashion
    """
    """Type 1 attributes: for fields that must be searchable and also returned
    for display, use setStored(True):
    type1 = FieldType()
    type1.setIndexed(True)
    type1.setStored(True)
    type1.setTokenized(False)
    type1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
    Type 2 attributes: for fields that need to be searched but not returned for
    display. The text content (content) is of this kind, as is typical META
    information about a file:
    type2 = FieldType()
    type2.setIndexed(True)
    type2.setStored(False)
    type2.setTokenized(True)
    type2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)"""
    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXIDR = indir
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    # construct a new index writer with the specified analyzer
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')):
        print "Indexing:", tfile
        document = Document()
        content = open(tfile, 'r').read()
        # how the types above would be used:
        # document.add(Field("path", tfile, type1))
        # add fields to the document: {name: "text", store: YES, index: ANALYZED}
        document.add(Field("text", content, Field.Store.YES, Field.Index.ANALYZED))
        document.add(Field("path", tfile, Field.Store.YES, Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
def setUp(self):
    self.analyzer = WhitespaceAnalyzer()
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, self.analyzer, True,
                         IndexWriter.MaxFieldLength.LIMITED)
    for i in xrange(1, 501):
        doc = Document()
        doc.add(Field("id", NumberUtils.pad(i), Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)
    writer.close()
def setUp(self):
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    doc = Document()
    doc.add(Field("partnum", "Q36", Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("description", "Illidium Space Modulator",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()
    self.searcher = self.getSearcher()
def initDummyStore(self, directory):
    """Open a dummy RAMDirectory for testing."""
    writer = IndexWriter(directory, SimpleAnalyzer(), True)
    doc = Document()
    doc.add(Field("name", 'dummy.txt', Field.Store.YES, Field.Index.UN_TOKENIZED))
    doc.add(Field("path", '/path/to/dummy.txt', Field.Store.YES, Field.Index.UN_TOKENIZED))
    doc.add(Field("path", '/path/to/another/dummy.txt', Field.Store.YES, Field.Index.UN_TOKENIZED))
    doc.add(Field("contents", "foo dummy bar", Field.Store.YES, Field.Index.TOKENIZED))
    writer.addDocument(doc)
    writer.optimize()
    writer.close()
def create_index(self, path_to_index):
    """Creates a new index."""
    print "Create new Index"
    path = SimpleFSDirectory(File(path_to_index))
    analyzer = StandardAnalyzer(Version.LUCENE_35)
    writer = IndexWriter(path, analyzer, True, IndexWriter.MaxFieldLength(512))
    doc = Document()
    doc.add(Field("Sentence", "Hello World", Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("X", "x", Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("Y", "y", Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("URI", "uri", Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()
def setUp(self):
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    doc = Document()
    doc.add(Field("partnum", "Q36", Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("description", "Illidium Space Modulator",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()
    self.searcher = IndexSearcher(self.directory, True)
def Indexer(docdir, indir):
    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXDIR = indir
    indexdir = SimpleFSDirectory(File(INDEXDIR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')):
        print "Indexing ", tfile
        document = Document()
        content = open(tfile, 'r').read()
        document.add(Field("text", content, Field.Store.YES, Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done"
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
def main(cls, argv):
    if len(argv) != 3:
        print "Usage: T9er <WordNet index dir> <t9 index>"
        return
    for key in cls.keys:
        c = key[0]
        k = key[1:]
        for kc in k:
            cls.keyMap[kc] = c
            print kc, "=", c
    indexDir = argv[1]
    t9dir = argv[2]
    reader = IndexReader.open(indexDir)
    numDocs = reader.maxDoc()
    print "Processing", numDocs, "words"
    writer = IndexWriter(t9dir, WhitespaceAnalyzer(), True)
    for id in xrange(reader.maxDoc()):
        origDoc = reader.document(id)
        word = origDoc.get("word")
        if word is None or len(word) == 0:
            continue
        newDoc = Document()
        newDoc.add(Field("word", word, Field.Store.YES, Field.Index.UN_TOKENIZED))
        newDoc.add(Field("t9", cls.t9(word), Field.Store.YES, Field.Index.UN_TOKENIZED))
        newDoc.add(Field("length", str(len(word)), Field.Store.NO, Field.Index.UN_TOKENIZED))
        writer.addDocument(newDoc)
        if id % 100 == 0:
            print "Document", id
    writer.optimize()
    writer.close()
    reader.close()
def index(self):
    dirPath = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
                           "verbose-index")
    dir = FSDirectory.getDirectory(dirPath, True)
    writer = IndexWriter(dir, SimpleAnalyzer(), True)
    writer.setInfoStream(System.out)
    for i in xrange(100):
        doc = Document()
        doc.add(Field("keyword", "goober", Field.Store.YES, Field.Index.UN_TOKENIZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def lucene_index(input_folder, output_folder):
    '''
    Indexes fresh text data using lucene 3.6.
    Doesn't support incremental generation of the index as of now.
    Currently crashes on neo by running out of heap space.
    Arguments: input folder for text files; output folder for index location.
    Returns: void. The index is stored if generated.
    '''
    # Set up the log file
    logging.basicConfig(filename=os.path.join(output_folder, "lucene_index.log"))
    logging.info("Input directory for logging: " + input_folder)
    logging.info("Output directory of index: " + output_folder)
    if not os.path.isdir(output_folder):
        logging.debug("Making output directory for index: " + output_folder)
        os.makedirs(output_folder)
    # Set up lucene's heap size for the index and the version of the indexer
    lucene.initVM(initialheap='1024m', maxheap='2048m')
    index_folder = SimpleFSDirectory(File(output_folder))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(index_folder, analyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    # Optimization to reduce heap space usage for generation of index.
    # Merges buffer with current index after 15 docs.
    writer.setMergeFactor(15)
    writer.setRAMBufferSizeMB(32.0)
    # Search to find the files to index
    files_to_index = find_files_in_folder(input_folder)
    for input_file in files_to_index:
        doc = Document()
        content = open(input_file, 'r').read()
        # Do not store text. Only index.
        doc.add(Field("text", content, Field.Store.NO, Field.Index.ANALYZED))
        # Store path to assist in retrieving the file
        doc.add(Field("path", input_file, Field.Store.YES, Field.Index.NO))
        writer.addDocument(doc)  # Index
    logging.info("Indexed lines from " + input_folder +
                 " (%d documents in index)" % writer.numDocs())
    logging.info("About to optimize index of %d documents..." % writer.numDocs())
    writer.optimize()  # Compress index
    logging.info("...done optimizing index of %d documents" % writer.numDocs())
    logging.info("Closing index of %d documents..." % writer.numDocs())
    writer.close()
    logging.info("Closed index")
def index(self):
    dirPath = os.path.join(tempfile.gettempdir(), "verbose-index")
    dir = FSDirectory.open(File(dirPath))
    writer = IndexWriter(dir, SimpleAnalyzer(), True)
    writer.setInfoStream(InfoStreamOut())
    for i in xrange(100):
        doc = Document()
        doc.add(Field("keyword", "goober", Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def setUp(self):
    directory = RAMDirectory()
    writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    doc1 = Document()
    doc1.add(Field("field", "the quick brown fox jumped over the lazy dog",
                   Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc1)
    doc2 = Document()
    doc2.add(Field("field", "the fast fox hopped over the hound",
                   Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc2)
    writer.close()
    self.searcher = IndexSearcher(directory, True)
def main(cls, argv):
    if len(argv) != 3:
        print "Usage: T9er <WordNet index dir> <t9 index>"
        return
    for key in cls.keys:
        c = key[0]
        k = key[1:]
        for kc in k:
            cls.keyMap[kc] = c
            print kc, "=", c
    indexDir = argv[1]
    t9dir = argv[2]
    reader = IndexReader.open(indexDir)
    numDocs = reader.maxDoc()
    print "Processing", numDocs, "words"
    writer = IndexWriter(t9dir, WhitespaceAnalyzer(), True)
    for id in xrange(reader.maxDoc()):
        origDoc = reader.document(id)
        word = origDoc.get("word")
        if word is None or len(word) == 0:
            continue
        newDoc = Document()
        newDoc.add(Field("word", word, Field.Store.YES, Field.Index.UN_TOKENIZED))
        newDoc.add(Field("t9", cls.t9(word), Field.Store.YES, Field.Index.UN_TOKENIZED))
        newDoc.add(Field("length", str(len(word)), Field.Store.NO, Field.Index.UN_TOKENIZED))
        writer.addDocument(newDoc)
        if id % 100 == 0:
            print "Document", id
    writer.commit()
    writer.close()
    reader.close()
def index(string):
    lucene.initVM()
    indexDir = "REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()
    doc = Document()
    doc.add(Field("text", string, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % writer.numDocs()
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
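# A companion search sketch for the index(string) function above, following the
# IndexSearcher/QueryParser pattern used elsewhere in this collection; the
# function itself is an assumption, not part of the original code.
def search(query_string):
    dir = SimpleFSDirectory(File("REMOVEME.index-dir"))
    searcher = IndexSearcher(dir)
    query = QueryParser(Version.LUCENE_30, "text",
                        StandardAnalyzer(Version.LUCENE_30)).parse(query_string)
    hits = searcher.search(query, 10)
    for hit in hits.scoreDocs:
        print searcher.doc(hit.doc).get("text")
    searcher.close()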
def luceneIndexer(docdir, indir):
    """
    Index documents from a directory.
    """
    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXIDR = indir
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')):
        print "Indexing: ", tfile
        document = Document()
        content = open(tfile, 'r').read()
        document.add(Field("text", content, Field.Store.YES, Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
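# A minimal invocation sketch for luceneIndexer above; both paths are
# placeholders, not from the original source.
if __name__ == '__main__':
    luceneIndexer('/path/to/docs', '/path/to/index')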
def update_index_withLine(self, line, x, y, uri):
    """
    Parsed sentence is added to the index, with the corresponding two
    entities (x, y) and the DBpedia URI
    """
    line = line.replace("\t", " ")
    line = line.replace("\n", " ")
    line = line.replace("  ", " ")
    try:
        writer = IndexWriter(index_directory, analyzer, False,
                             IndexWriter.MaxFieldLength(512))
        doc = Document()
        doc.add(Field("Sentence", line, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("X", x, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("Y", y, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("URI", uri, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        writer.optimize()
        writer.close()
    except Exception:
        print "Unexpected error:", sys.exc_info()[0]
        raw_input("Error in updating the Sentences")
def addDocuments(self, dir, maxFieldLength):
    writer = IndexWriter(dir, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength(maxFieldLength))
    for keyword, unindexed, unstored, text in \
            izip(self.keywords, self.unindexed, self.unstored, self.text):
        doc = Document()
        doc.add(Field("id", keyword, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("country", unindexed, Field.Store.YES, Field.Index.NO))
        doc.add(Field("contents", unstored, Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("city", text, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def index(self, doc, title, department, url):
    indexdir = SimpleFSDirectory(File(self.indir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, self.init,
                               IndexWriter.MaxFieldLength(512))
    self.init = False
    # Initialize document and index it
    document = Document()
    document.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
    document.add(Field("url", url, Field.Store.YES, Field.Index.ANALYZED))
    document.add(Field("department", department, Field.Store.YES, Field.Index.ANALYZED))
    document.add(Field("text", doc, Field.Store.YES, Field.Index.ANALYZED))
    index_writer.addDocument(document)
    index_writer.optimize()
    index_writer.close()
def someMethod(self):
    directory = RAMDirectory()
    analyzer = StandardAnalyzer()
    writer = IndexWriter(directory, analyzer, True)

    doc = Document()
    doc.add(Field.Text("title", "This is the title"))
    doc.add(Field.UnStored("contents", "...document contents..."))
    writer.addDocument(doc)
    # overload that analyzes the document with a per-document analyzer
    writer.addDocument(doc, analyzer)

    expression = "some query"
    # static parse with an explicit default field
    query = QueryParser.parse(expression, "contents", analyzer)
    # or construct a parser bound to a default field, then parse
    parser = QueryParser("contents", analyzer)
    query = parser.parse(expression)
def addDocuments(self, dir):
    writer = IndexWriter(dir, self.getAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    writer.setUseCompoundFile(self.isCompound())
    for i in xrange(len(self.keywords)):
        doc = Document()
        doc.add(Field("id", self.keywords[i], Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("country", self.unindexed[i], Field.Store.YES, Field.Index.NO))
        doc.add(Field("contents", self.unstored[i], Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("city", self.text[i], Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def luceneIndexer(docdir, indir):
    """
    Index documents from a directory.
    para:{
        docdir: the path of the txt files
        indir: the path of the index file which is generated by the following code
    }
    """
    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXIDR = indir
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    #for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')):
    filenames = os.listdir(DIRTOINDEX)
    for name in filenames:
        tfile = os.path.join(DIRTOINDEX, name)
        if os.path.isfile(tfile):
            print("Indexing: ", tfile)
            document = Document()
            content = open(tfile, 'r').read()
            document.add(Field("text", content, Field.Store.YES, Field.Index.ANALYZED))
            # use the file path without its extension as the title
            document.add(Field("title", os.path.splitext(tfile)[0],
                               Field.Store.YES, Field.Index.ANALYZED))
            index_writer.addDocument(document)
            print("Done: ", tfile)
    index_writer.optimize()
    print(index_writer.numDocs())
    index_writer.close()
class StatisticsFromCats:

    def __init__(self, pages_xml, cats_xml):
        self._pages = pages_xml
        self._cats = cats_xml
        self._out_xml = None
        self._unique = 0
        self._total_cats = 0
        self._total = 0
        self._total_art = 0
        self._titles = []
        self._analyzer = None
        self._dir = None
        self._writer = None
        self._th = None
        self._dick = None

    # opening our parsed data
    def do_job(self):
        with open('temp_output', 'w') as output:
            print('<?xml version="1.0" ?>\n<root>', file=output)
            with open(self._pages) as page:
                tree_articles = etree.iterparse(page, tag='data',
                                                remove_comments=True,
                                                remove_blank_text=True)
                with open(self._cats) as cats:
                    print('working on it')
                    tree_link = etree.iterparse(cats, tag='category_trunc',
                                                remove_blank_text=True)
                    dict_link = {}
                    self._dick = {}
                    full_name_dict_link = {}
                    fast_iter(tree_link, self.gethash, dict_link, full_name_dict_link)
                    self.indexFile()
                    fast_iter(tree_articles, self.findcats, output, dict_link,
                              full_name_dict_link, self._dick)
            print("<total>" + str(self._total_cats) + "</total>", file=output)
            print("<unique>" + str(self._unique) + "</unique>", file=output)
            print("<avg>" + str(self._unique / self._total_art) + "</avg>", file=output)
            print("<onetimeunique>" + str(len(self._dick)) + "</onetimeunique>", file=output)
            print("</root>", file=output)
        self._writer.close()
        # with open('temp_output', 'r') as output:
        #     output = etree.iterparse(output, tag='data', remove_comments=True)
        #     fast_iter(output, self.createIndexing)

    # getting total of cats
    def get_total(self):
        return str(self._total_cats)

    # getting number of unique
    def get_unique(self):
        return str(self._unique)

    # getting average number of unique
    def get_avg(self):
        return str(self._unique / self._total_art)

    # getting number of one-time unique
    def get_one_time_unique(self):
        return str(len(self._dick))

    # search function
    def find_spec_cat(self, title):
        title = title.replace('\n', '')
        return self.query(title)

    # finding categories in SQL based on the id from the xml
    def findcats(self, elem, output, dict, dict_all, dict_unique):
        id_atr = elem.get('article_id')
        self._total_art += 1
        unique = 0
        content = 'XML\n'
        try:
            found = dict[id_atr]
        except KeyError:
            found = []
            self._total += 1
        data = etree.Element('data')
        data.set('article_id', id_atr)
        data.set('title', elem.findtext('title'))
        self._titles.append(elem.findtext('title'))
        for catinarticle in elem.findall('category'):
            self._total_cats += 1
            if catinarticle is not None and catinarticle.text is not None:
                try:
                    content += catinarticle.text + "\n"
                    trimCat = catinarticle.text.replace(' ', '').replace('\'', '')
                    if trimCat not in found:
                        unique += 1
                        dict_unique[trimCat] = 1
                except etree.XPathEvalError:
                    print(catinarticle.text.replace(' ', '').replace('\'', ''))
                sub = etree.SubElement(data, 'category')
                sub.text = catinarticle.text
        content += '\nSQL\n'
        if found:
            for text in dict_all[id_atr]:
                subdat = etree.SubElement(data, 'categoryDat')
                subdat.text = text
                content += text + "\n"
            del dict[id_atr]
            del dict_all[id_atr]
        print(etree.tostring(data, encoding='utf-8'), file=output)
        self._unique += unique
        self.addElement(elem.findtext('title'), content)

    def gethash(self, elem, dict, full_name_dict_link):
        id_atr = elem.get('article_id')
        try:
            ret = dict[id_atr]
            retfull = dict[id_atr]
        except KeyError:
            ret = []
            retfull = []
        ret.append(elem.text)
        dict[id_atr] = ret
        full_name_dict_link[id_atr] = retfull
        self._total += 1

    def createIndexing(self, elem):
        title = elem.get('title')
        content = 'XML\n'
        for catinarticle in elem.findall('category'):
            if catinarticle.text is not None:
                content += catinarticle.text + '\n'
        content += '\nSQL\n'
        for catinsql in elem.findall('categoryDat'):
            if catinsql.text is not None:
                content += catinsql.text + '\n'
        self.addElement(title, content)

    def indexFile(self):
        self._th = lucene.initVM()
        self._analyzer = StandardAnalyzer(Version.LUCENE_36)
        self._dir = RAMDirectory()
        self._writer = IndexWriter(self._dir, self._analyzer, True,
                                   IndexWriter.MaxFieldLength(25000))

    def addElement(self, title, content):
        self._total_art += 1
        doc = Document()
        doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("content", content, Field.Store.YES, Field.Index.ANALYZED))
        self._writer.addDocument(doc)

    def query(self, title):
        self._th.attachCurrentThread()
        searcher = IndexSearcher(self._dir)
        query = QueryParser(Version.LUCENE_30, "title", self._analyzer).parse(title)
        total_hits = searcher.search(query, 10)
        for hit in total_hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            return doc.get("title") + "\n" + doc.get("content") + "--------------------------------"
        return "None"
def index(cls, indexDir, taxoDir):
    """Create an index, and add to it sample documents and facets.

    indexDir Directory in which the index should be created.
    taxoDir  Directory in which the taxonomy index should be created.
    """
    # create and open an index writer
    ver = lucene.Version.LUCENE_35
    config = IndexWriterConfig(ver, WhitespaceAnalyzer(ver))
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    iw = IndexWriter(indexDir, config)
    # create and open a taxonomy writer
    taxo = DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE)
    # loop over sample documents
    nDocsAdded = 0
    nFacetsAdded = 0
    for docNum in range(len(docTexts)):
        # obtain the sample facets for the current document
        facets = categories[docNum]
        facetList = [createCategoryPath(f) for f in facets]
        # NOTE: setCategoryPaths() requires an Iterable, so need to convert the
        # Python list in order to pass a proper argument to setCategoryPaths.
        # We use java.util.Arrays (via JCC) to create a Java List, see
        # http://docs.oracle.com/javase/1.5.0/docs/api/java/util/Arrays.html#asList(T...)
        facetList = lucene.Arrays.asList(facetList)
        # NOTE: we could use lucene.collections here as well in order to convert our
        # Python list to a Java based list using the JavaList class (JavaList implements
        # java.util.List around a Python list instance it wraps):
        # from lucene.collections import JavaList
        # facetList = JavaList(facetList)
        # we do not alter indexing parameters.
        # a category document builder will add the categories to a document
        # once build() is called
        categoryDocBuilder = CategoryDocumentBuilder(taxo).setCategoryPaths(facetList)
        # create a plain Lucene document and add some regular Lucene fields to it
        doc = Document()
        doc.add(Field(TITLE, docTitles[docNum], Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field(TEXT, docTexts[docNum], Field.Store.NO, Field.Index.ANALYZED))
        # invoke the category document builder for adding categories to the
        # document and, as required, to the taxonomy index
        categoryDocBuilder.build(doc)
        # finally add the document to the index
        iw.addDocument(doc)
        nDocsAdded += 1
        nFacetsAdded += facetList.size()
    # commit changes.
    # we commit changes to the taxonomy index prior to committing them to the
    # search index. this is important, so that all facets referred to by
    # documents in the search index will indeed exist in the taxonomy index.
    taxo.commit()
    iw.commit()
    # close the taxonomy index and the index - all modifications are
    # now safely in the provided directories: indexDir and taxoDir.
    taxo.close()
    iw.close()
    print "Indexed %d documents with overall %d facets." % (nDocsAdded, nFacetsAdded)
# print "Max merge docs:", writer.getMaxMergeDocs() # print "Max buffered docs:", writer.getMaxBufferedDocs() print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs( ) i = 0 print >> sys.stderr, "Reading lines from sys.stdin..." for l in sys.stdin: i += 1 if string.strip(l) == "": continue doc = Document() doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED)) writer.addDocument(doc) if i % 10000 == 0: print >> sys.stderr, "Read %d lines from stdin (%d documents in index)..." % ( i, writer.numDocs()) print >> sys.stderr, stats() # if i > 100000: break print >> sys.stderr, "Indexed a total of %d lines from stdin (%d documents in index)" % ( i, writer.numDocs()) print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs( ) print >> sys.stderr, stats() writer.optimize() print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs( )
def main(cls, argv):
    if len(argv) < 2:
        print "Usage: BerkeleyDbIndexer <index dir> -create"
        return
    dbHome = argv[1]
    create = len(argv) > 2 and argv[2] == "-create"
    if not os.path.exists(dbHome):
        os.makedirs(dbHome)
    elif create:
        for name in os.listdir(dbHome):
            if name.startswith('__'):
                os.remove(os.path.join(dbHome, name))
    env = DBEnv()
    env.set_flags(DB_LOG_INMEMORY, 1)
    if os.name == 'nt':
        env.set_cachesize(0, 0x4000000, 1)
    elif os.name == 'posix':
        from commands import getstatusoutput
        if getstatusoutput('uname') == (0, 'Linux'):
            env.set_cachesize(0, 0x4000000, 1)
    env.open(dbHome, (DB_CREATE | DB_THREAD |
                      DB_INIT_MPOOL | DB_INIT_LOCK | DB_INIT_TXN), 0)
    index = DB(env)
    blocks = DB(env)
    txn = None
    try:
        txn = env.txn_begin(None)
        index.open(filename='__index__', dbtype=DB_BTREE,
                   flags=DB_CREATE | DB_THREAD, txn=txn)
        blocks.open(filename='__blocks__', dbtype=DB_BTREE,
                    flags=DB_CREATE | DB_THREAD, txn=txn)
    except:
        if txn is not None:
            txn.abort()
            txn = None
        raise
    else:
        txn.commit()
        txn = None
    try:
        txn = env.txn_begin(None)
        directory = DbDirectory(txn, index, blocks, 0)
        writer = IndexWriter(directory, StandardAnalyzer(), create,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        writer.setUseCompoundFile(False)
        doc = Document()
        doc.add(Field("contents", "The quick brown fox...",
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        writer.optimize()
        writer.close()
    except:
        if txn is not None:
            txn.abort()
            txn = None
        raise
    else:
        txn.commit()
        index.close()
        blocks.close()
        env.close()
    print "Indexing Complete"