def addDocuments(self, dir):
    writer = IndexWriter(dir, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)

    #
    # change to adjust performance of indexing with FSDirectory
    # writer.mergeFactor = writer.mergeFactor
    # writer.maxMergeDocs = writer.maxMergeDocs
    # writer.minMergeDocs = writer.minMergeDocs
    #

    for word in self.docs:
        doc = Document()
        doc.add(Field("keyword", word,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("unindexed", word,
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("unstored", word,
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("text", word,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
def do_index():
    initVM()
    indexDir = "/home/william/woyaoo/luceneindex"
    version = Version.LUCENE_CURRENT
    standardAnalyzer = StandardAnalyzer(version)
    # chineseAnalyzer = CJKAnalyzer(version)
    engine = data.engine_from_config("indexdb.config")
    # engine = data.engine_from_config()
    db = data.init_datafactory(engine)
    docs = dbfactory.Session().query(doc_model.Doc).filter(
        doc_model.Doc.dateCreated > "20121220").all()
    print len(docs)
    idxDir = SimpleFSDirectory(File(indexDir))
    perIndexCount = 5000
    writer = IndexWriter(idxDir, standardAnalyzer, True,
                         IndexWriter.MaxFieldLength(512))
    # add fields
    for doc in docs:
        # print repr(doc.description)
        lucenedoc = Document()
        descriptionValue = doc.description.strip("\r\n").encode("UTF-8")
        # descriptionValue = '中国 abc'
        print repr(descriptionValue)
        lucenedoc.add(Field("url", doc.url,
                            Field.Store.YES, Field.Index.NOT_ANALYZED))
        lucenedoc.add(Field("intent", doc.intent,
                            Field.Store.YES, Field.Index.NOT_ANALYZED))
        # lucenedoc.add(Field('description', doc.description, Field.Store.YES, Field.Index.ANALYZED))
        lucenedoc.add(Field("description", descriptionValue,
                            Field.Store.YES, Field.Index.ANALYZED))
        lucenedoc.add(Field("title", doc.title,
                            Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(lucenedoc)
    writer.optimize()
    writer.close()
    print "index finished"
def index(string):
    lucene.initVM()
    indexDir = "REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    try:
        # append to the existing index if there is one ...
        writer = IndexWriter(dir, analyzer, False,
                             IndexWriter.MaxFieldLength(512))
    except lucene.JavaError:
        # ... otherwise create a new index
        writer = IndexWriter(dir, analyzer, True,
                             IndexWriter.MaxFieldLength(512))

    doc = Document()
    doc.add(Field("text", string, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)

    writer.optimize()
    writer.close()
def indexDocuments():
    # empty the index directory
    indexDir = Wikipedia.directory + 'index/'
    for filename in os.listdir(indexDir):
        os.remove(indexDir + filename)

    # index documents
    lucene.initVM()
    version = Version.LUCENE_CURRENT
    analyzer = EnglishAnalyzer(version)
    writer = IndexWriter(SimpleFSDirectory(File(indexDir)), analyzer,
                         True, IndexWriter.MaxFieldLength.LIMITED)
    for article in Wikipedia():
        doc = Document()
        doc.add(Field('id', str(article['id'][0]),
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('title', article['url'],
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('content', article['text'],
                      Field.Store.NO, Field.Index.ANALYZED))
        writer.addDocument(doc)
    print 'Optimization'
    writer.optimize()
    writer.close()
class BuildIndex:

    def __init__(self, indir):
        lucene.initVM()
        indexdir = SimpleFSDirectory(File(indir))
        self.index_writer = IndexWriter(indexdir, self.getAnalyzer(), True,
                                        IndexWriter.MaxFieldLength(512))

    def getAnalyzer(self):
        return ChineseAnalyzer(lucene.Version.LUCENE_CURRENT)

    def addDocuments(self, _id, title, content):
        doc = Document()
        doc.add(Field("id", _id, Field.Store.YES, Field.Index.NOT_ANALYZED))
        if title is not None and len(title) > 0:
            doc.add(Field("titleKeyword", title,
                          Field.Store.NO, Field.Index.ANALYZED))
        if content is not None and len(content) > 0:
            doc.add(Field("contentKeyword", content,
                          Field.Store.NO, Field.Index.ANALYZED))
        self.index_writer.addDocument(doc)

    def close(self):
        self.index_writer.optimize()
        self.index_writer.close()
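# A minimal usage sketch for the BuildIndex class above; the index path
# and the field values are illustrative assumptions, not part of the
# original snippet.
builder = BuildIndex("/tmp/zh-index")
builder.addDocuments("1", u"中文标题", u"中文正文")
builder.close()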
def index(self):
    lucene.initVM()
    indexdir = SimpleFSDirectory(File(self.INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))

    # read input files (.xml)
    for in_file in glob.glob(os.path.join(self.DOC_DIR, '*.xml')):
        corpus = codecs.open(in_file, encoding='utf-8').read()
        d = pq(corpus, parser='html')
        for text in d('Article').items():
            document = Document()
            # find ID
            art_id = str(text.attr('articleid').encode('utf-8')).replace('+', '-')
            # find Title
            art_title = self.stem(str(text.attr('title').encode('utf-8')))
            # find Abstract
            art_abstract = self.stem(str(text.find('Abstract').html().encode('utf-8')))
            # find Keyword
            art_keyword = text.find('Keyword').html().encode('utf-8')
            # find Content
            art_content = self.stem(str(text.find('Content').html().encode('utf-8')))
            # find Authors
            art_authors = text.find('Authors').html().encode('utf-8')

            document.add(Field('id', art_id, Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('title', art_title, Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('abstract', art_abstract, Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('keyword', art_keyword, Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('content', art_content, Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('authors', art_authors, Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('article',
                               art_title + art_abstract + art_keyword + art_content,
                               Field.Store.YES, Field.Index.ANALYZED))
            index_writer.addDocument(document)

    index_writer.optimize()
    index_writer.close()
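# `self.stem` is called above but not shown; a hypothetical sketch using
# NLTK's Porter stemmer (an assumption, any callable mapping a string to
# a stemmed string would fit the call sites):
from nltk.stem.porter import PorterStemmer

def stem(self, text):
    stemmer = PorterStemmer()
    # stem each whitespace-separated token and rejoin
    return ' '.join(stemmer.stem(w) for w in text.split())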
def update_index_withLineArray(self, array):
    """
    Parsed sentences (given in an array) are added to the index, with the
    corresponding two entities (x, y) and the DBpedia URI
    """
    print "start adding sentences"
    writer = IndexWriter(index_directory, analyzer, False,
                         IndexWriter.MaxFieldLength(512))
    for item in array:
        line = item[0]
        x = item[1]
        y = item[2]
        uri = item[3]
        line = line.replace("\t", " ")
        line = line.replace("\n", " ")
        line = line.replace("  ", " ")
        try:
            doc = Document()
            doc.add(Field("Sentence", line, Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("X", x, Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("Y", y, Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("URI", uri, Field.Store.YES, Field.Index.ANALYZED))
            writer.addDocument(doc)
        except Exception:
            print "Unexpected error:", sys.exc_info()[0]
            raw_input("Error in updating the Sentences")
    try:
        writer.optimize()
    except:
        print "Unexpected error:", sys.exc_info()[0]
        print "could not optimize index"
    writer.close()
    print "all sentences added"
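# The method above relies on module-level `index_directory` and `analyzer`
# objects that are not shown; a hypothetical setup consistent with the
# Lucene 3.x calls it makes (the path is an assumption):
index_directory = SimpleFSDirectory(File("/path/to/sentence-index"))
analyzer = StandardAnalyzer(Version.LUCENE_35)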
def index(self, path_to_index, path_files):
    'indexes anchor texts from a given folder'
    # lucene.initVM()
    indexDir = path_to_index
    directory_index = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_35)
    writer = IndexWriter(directory_index, analyzer, True,
                         IndexWriter.MaxFieldLength(512))
    listOfPathes = []
    listOfPathes.extend(glob.glob(path_files + "*.txt"))
    counter = 0
    for path_to_file in listOfPathes:
        print path_to_file
        f = open(path_to_file, "r")
        for line in f:
            entry = line.split("\t")
            counter += 1
            # optimize the index after a certain amount of added documents
            if counter % 500000 == 0:
                print counter
                writer.optimize()
            doc = Document()
            doc.add(Field("anchor", entry[0], Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("anchor_uri", entry[1], Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("dbpedia_uri", entry[2], Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("number", entry[3].replace("\n", ""), Field.Store.YES, Field.Index.ANALYZED))
            writer.addDocument(doc)
        writer.optimize()
        f.close()
    writer.close()
    print counter
    print "done"
def configure_lucene():
    f = open('clique.txt', 'r')
    lucene.initVM()
    print 'Inside Function'
    indexDir = "/tmp/luceneindex"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()
    print >> sys.stderr, "Reading lines from sys.stdin..."
    for line in f:
        line = line.replace('\t', '')
        line = line.replace('\r', '')
        line = line.replace('\n', '')
        line = line.replace('^', '')
        line = line.strip()
        doc = Document()
        doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % writer.numDocs()
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
def luceneIndexer(contents):
    lucene.initVM()
    INDEXIDR = settings.INDEX_DIR
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in contents:
        print "Indexing: ", tfile
        document = Document()
        content = tfile.getvalue()
        document.add(Field("text", content, Field.Store.YES,
                           Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
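# A usage sketch: luceneIndexer expects file-like objects that expose
# getvalue(), e.g. StringIO buffers (an assumption based on the
# tfile.getvalue() call above); the strings are illustrative only.
from StringIO import StringIO
luceneIndexer([StringIO("first document"), StringIO("second document")])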
def testUpdate(self):
    self.assertEqual(1, self.getHitCount("city", "Amsterdam"))

    reader = IndexReader.open(self.dir, False)
    reader.deleteDocuments(Term("city", "Amsterdam"))
    reader.close()

    writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    doc = Document()
    doc.add(Field("id", "1", Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("country", "Russia", Field.Store.YES, Field.Index.NO))
    doc.add(Field("contents", "St. Petersburg has lots of bridges",
                  Field.Store.NO, Field.Index.ANALYZED))
    doc.add(Field("city", "St. Petersburg",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.optimize()
    writer.close()

    self.assertEqual(0, self.getHitCount("city", "Amsterdam"))
    self.assertEqual(1, self.getHitCount("city", "Petersburg"))
def addDocuments(self, dir, isCompound):
    writer = IndexWriter(dir, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.LIMITED)
    writer.setUseCompoundFile(isCompound)

    # change to adjust performance of indexing with FSDirectory
    # writer.mergeFactor = writer.mergeFactor
    # writer.maxMergeDocs = writer.maxMergeDocs
    # writer.minMergeDocs = writer.minMergeDocs

    for word in self.docs:
        doc = Document()
        doc.add(Field("keyword", word,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("unindexed", word,
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("unstored", word,
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("text", word,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
def createIndex():
    # initialize lucene and the JVM
    print("started indexer")
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"

    # get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_30)

    # get the index storage
    dir = lucene.SimpleFSDirectory(lucene.File(indexDir))
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    src_dir = 'html_files'
    i = 0
    for l in os.listdir(src_dir):
        l = os.path.join(src_dir, l)
        with open(l, 'r') as myfile:
            data = myfile.read()
        i += 1
        document, errors = parsehtml(data)
        doc = Document()
        doc.add(Field("text", document, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def index_files(files, index_directory):
    lucene.initVM()
    d = SimpleFSDirectory(File(index_directory))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(d, analyzer, True, IndexWriter.MaxFieldLength(512))
    for f in files:
        parse_file(f, writer)
    writer.optimize()
    writer.close()
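# `parse_file` is not defined in the snippet above; a hypothetical sketch
# of what it might look like, assuming one document per text file (the
# field names are assumptions):
def parse_file(path, writer):
    content = open(path, 'r').read()
    doc = Document()
    doc.add(Field("text", content, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("path", path, Field.Store.YES, Field.Index.NOT_ANALYZED))
    writer.addDocument(doc)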
def indexSingleFieldDocs(self, fields):
    writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    for field in fields:
        doc = Document()
        doc.add(field)
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
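# A minimal usage sketch for indexSingleFieldDocs: one single-field
# document is created per Field. The field values are illustrative
# assumptions, and `helper` stands in for the enclosing object.
helper.indexSingleFieldDocs([
    Field("title", "PyLucene in a nutshell", Field.Store.YES, Field.Index.ANALYZED),
    Field("title", "Indexing basics", Field.Store.YES, Field.Index.ANALYZED),
])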
def __init__(self, root, storeDir, analyzer):
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    store = FSDirectory.getDirectory(storeDir, True)
    writer = IndexWriter(store, analyzer, True)
    writer.setMaxFieldLength(1048576)
    self.indexDocs(root, writer)
    print 'optimizing index',
    writer.optimize()
    writer.close()
    print 'done'
def index(cls, indexDir, dataDir):
    if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
        raise IOError, "%s does not exist or is not a directory" % dataDir

    writer = IndexWriter(indexDir, StandardAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    writer.setUseCompoundFile(False)

    numIndexed = cls.indexDirectory(writer, dataDir)
    writer.optimize()
    writer.close()

    return numIndexed
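# `cls.indexDirectory` is elided above; a hypothetical sketch consistent
# with the call site (recursively indexes *.txt files and returns the
# number of documents added; field names are assumptions):
def indexDirectory(cls, writer, dataDir):
    count = 0
    for name in os.listdir(dataDir):
        path = os.path.join(dataDir, name)
        if os.path.isdir(path):
            count += cls.indexDirectory(writer, path)
        elif name.endswith('.txt'):
            doc = Document()
            doc.add(Field("contents", open(path).read(),
                          Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("filename", path,
                          Field.Store.YES, Field.Index.NOT_ANALYZED))
            writer.addDocument(doc)
            count += 1
    return count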
def luceneIndexer(docdir, indir):
    """
    Index documents from a directory.
    Args:
        docdir: folder containing the documents
        indir: folder where the index is stored
    Returns:
        nothing
    Notes:
        FieldType().setStored = as-is value stored in the Lucene index
        FieldType().setTokenized = field is analyzed using the specified
            Analyzer; the tokens emitted are indexed
        FieldType().Indexed = the text (either as-is with keyword fields,
            or the tokens from tokenized fields) is made searchable
            (aka inverted)
        FieldType().Vectored = term frequency per document is stored in
            the index in an easily retrievable fashion
    """
    # Type 1 attributes: for fields that must be searchable and also
    # returned for display, use setStored(True):
    # type1 = FieldType()
    # type1.setIndexed(True)
    # type1.setStored(True)
    # type1.setTokenized(False)
    # type1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
    #
    # Type 2 attributes: for fields that do not need to be returned for
    # display but must still be searchable. I consider the text content
    # (content) to be of this kind; file META information is a typical
    # example:
    # type2 = FieldType()
    # type2.setIndexed(True)
    # type2.setStored(False)
    # type2.setTokenized(True)
    # type2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXIDR = indir
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    # construct a new index writer with the specified analyzer
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')):
        print "Indexing:", tfile
        document = Document()
        content = open(tfile, 'r').read()
        # how the FieldTypes above would be used:
        # doc.add(Field("path", tfile, type1))
        # add a Field to the document {name: "text", store: YES, index: ANALYZED}
        document.add(Field("text", content, Field.Store.YES, Field.Index.ANALYZED))
        document.add(Field("path", tfile, Field.Store.YES, Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
def initDummyStore(self, directory):
    """Open a dummy ramdirectory for testing."""
    writer = IndexWriter(directory, SimpleAnalyzer(), True)
    doc = Document()
    doc.add(Field("name", 'dummy.txt',
                  Field.Store.YES, Field.Index.UN_TOKENIZED))
    doc.add(Field("path", '/path/to/dummy.txt',
                  Field.Store.YES, Field.Index.UN_TOKENIZED))
    doc.add(Field("path", '/path/to/another/dummy.txt',
                  Field.Store.YES, Field.Index.UN_TOKENIZED))
    doc.add(Field("contents", "foo dummy bar",
                  Field.Store.YES, Field.Index.TOKENIZED))
    writer.addDocument(doc)
    writer.optimize()
    writer.close()
def createIndex(cls, dataDir, indexDir, useCompound):
    indexDir = SimpleFSDirectory(File(indexDir))
    writer = IndexWriter(indexDir,
                         StandardAnalyzer(Version.LUCENE_CURRENT), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    writer.setUseCompoundFile(useCompound)

    for dir, dirnames, filenames in os.walk(dataDir):
        for filename in filenames:
            if filename.endswith(".properties"):
                cls.indexFile(writer, os.path.join(dir, filename), dataDir)

    writer.optimize()
    writer.close()
def main(cls, argv):
    if len(argv) != 3:
        print "Usage: T9er <WordNet index dir> <t9 index>"
        return

    for key in cls.keys:
        c = key[0]
        k = key[1:]
        for kc in k:
            cls.keyMap[kc] = c
            print kc, "=", c

    indexDir = argv[1]
    t9dir = argv[2]

    reader = IndexReader.open(indexDir)
    numDocs = reader.maxDoc()
    print "Processing", numDocs, "words"

    writer = IndexWriter(t9dir, WhitespaceAnalyzer(), True)

    for id in xrange(reader.maxDoc()):
        origDoc = reader.document(id)
        word = origDoc.get("word")
        if word is None or len(word) == 0:
            continue

        newDoc = Document()
        newDoc.add(Field("word", word,
                         Field.Store.YES, Field.Index.UN_TOKENIZED))
        newDoc.add(Field("t9", cls.t9(word),
                         Field.Store.YES, Field.Index.UN_TOKENIZED))
        newDoc.add(Field("length", str(len(word)),
                         Field.Store.NO, Field.Index.UN_TOKENIZED))
        writer.addDocument(newDoc)
        if id % 100 == 0:
            print "Document", id

    writer.optimize()
    writer.close()
    reader.close()
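# The class attributes and the t9() transform used in main() are elided;
# a hypothetical sketch of what they might look like (the exact keypad
# letter groups are an assumption):
class T9er(object):
    keys = ["2abc", "3def", "4ghi", "5jkl", "6mno", "7pqrs", "8tuv", "9wxyz"]
    keyMap = {}

    @classmethod
    def t9(cls, word):
        # map each letter to its keypad digit via the table built in main()
        return ''.join(cls.keyMap.get(c, c) for c in word.lower())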
def lucene_index(input_folder, output_folder):
    '''
    Indexes fresh text data using lucene 3.6. Doesn't support incremental
    generation of the index as of now. Currently crashes on neo by running
    out of heap space.
    Arguments: input folder for text files, output folder for the index.
    Returns: void. The index is stored if generated.
    '''
    # Setting up the log file
    logging.basicConfig(filename=os.path.join(output_folder, "lucene_index.log"))
    logger = logging.getLogger(__name__)
    logging.info("Input directory for logging: " + input_folder)
    logging.info("Output directory of index: " + output_folder)
    if not os.path.isdir(output_folder):
        logger.debug("Making output directory for index: " + output_folder)
        os.makedirs(output_folder)

    # Setting up lucene's heap size for the index and version of indexer
    lucene.initVM(initialheap='1024m', maxheap='2048m')
    index_folder = SimpleFSDirectory(File(output_folder))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(index_folder, analyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    # Optimization to reduce heap space usage during index generation:
    # merge segments every 15 segments and cap the RAM buffer at 32 MB.
    writer.setMergeFactor(15)
    writer.setRAMBufferSizeMB(32.0)

    # Search to find the files to index
    files_to_index = find_files_in_folder(input_folder)
    for input_file in files_to_index:
        doc = Document()
        content = open(input_file, 'r').read()
        # Do not store the text, only index it
        doc.add(Field("text", content, Field.Store.NO, Field.Index.ANALYZED))
        # Store the path to assist in retrieving the file
        doc.add(Field("path", input_file, Field.Store.YES, Field.Index.NO))
        writer.addDocument(doc)

    logger.info("Indexed lines from " + input_folder +
                " (%d documents in index)" % writer.numDocs())
    logger.info("About to optimize index of %d documents..." % writer.numDocs())
    writer.optimize()  # compress the index
    logger.info("...done optimizing index of %d documents" % writer.numDocs())
    logger.info("Closing index of %d documents..." % writer.numDocs())
    writer.close()
    logger.info("Closed index")
def index(self):
    dirPath = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
                           "verbose-index")
    dir = FSDirectory.getDirectory(dirPath, True)
    writer = IndexWriter(dir, SimpleAnalyzer(), True)
    writer.setInfoStream(System.out)

    for i in xrange(100):
        doc = Document()
        doc.add(Field("keyword", "goober",
                      Field.Store.YES, Field.Index.UN_TOKENIZED))
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
def Indexer(docdir, indir):
    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXDIR = indir
    # open the index directory (FSDirectory itself is abstract)
    indexdir = SimpleFSDirectory(File(INDEXDIR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')):
        print "Indexing ", tfile
        document = Document()
        content = open(tfile, 'r').read()
        document.add(Field("text", content, Field.Store.YES, Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done"
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
def index(self):
    dirPath = os.path.join(tempfile.gettempdir(), "verbose-index")
    dir = FSDirectory.open(File(dirPath))
    writer = IndexWriter(dir, SimpleAnalyzer(), True)
    writer.setInfoStream(InfoStreamOut())

    for i in xrange(100):
        doc = Document()
        doc.add(Field("keyword", "goober",
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
def index(cls, indexDir, dataDir):
    if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
        raise IOError, "%s does not exist or is not a directory" % dataDir

    dir = SimpleFSDirectory(File(indexDir))
    writer = IndexWriter(dir, StandardAnalyzer(Version.LUCENE_CURRENT),
                         True, IndexWriter.MaxFieldLength.LIMITED)
    writer.setUseCompoundFile(False)

    cls.indexDirectory(writer, dataDir)
    numIndexed = writer.numDocs()
    writer.optimize()
    writer.close()
    dir.close()

    return numIndexed
def index(string):
    lucene.initVM()
    indexDir = "REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()

    doc = Document()
    doc.add(Field("text", string, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)

    print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % writer.numDocs()
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
def luceneIndexer(docdir, indir):
    """
    Index documents from a directory
    """
    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXIDR = indir
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')):
        print "Indexing: ", tfile
        document = Document()
        content = open(tfile, 'r').read()
        document.add(Field("text", content, Field.Store.YES,
                           Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
def index(self, doc, title, department, url):
    indexdir = SimpleFSDirectory(File(self.indir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, self.init,
                               IndexWriter.MaxFieldLength(512))
    self.init = False

    # Initialize document and index it
    document = Document()
    document.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
    document.add(Field("url", url, Field.Store.YES, Field.Index.ANALYZED))
    document.add(Field("department", department, Field.Store.YES, Field.Index.ANALYZED))
    document.add(Field("text", doc, Field.Store.YES, Field.Index.ANALYZED))
    index_writer.addDocument(document)

    index_writer.optimize()
    index_writer.close()
def _createIndex(self, inputDF, colname):
    """
    function to create a lucene index; iterates over inputDF row by row
    and indexes the relevant column. By default WhitespaceAnalyzer is
    used; other Analyzers are also available.
    """
    # Create the index directory
    directory = RAMDirectory()
    writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.LIMITED)

    # Inline indexing of column data
    inputDF.apply(lambda x: self._addDoc(x[colname], writer), axis=1)

    # Optimize, close and return
    writer.optimize()
    writer.close()
    return directory
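# `_addDoc` is elided above; a hypothetical sketch matching the call
# self._addDoc(x[colname], writer), one single-field document per
# dataframe row (the field name is an assumption):
def _addDoc(self, text, writer):
    doc = Document()
    doc.add(Field("field", text, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)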
def update_index_withLine(self, line, x, y, uri):
    """
    Parsed sentence is added to the index, with the corresponding two
    entities (x, y) and the DBpedia URI
    """
    line = line.replace("\t", " ")
    line = line.replace("\n", " ")
    line = line.replace("  ", " ")
    try:
        writer = IndexWriter(index_directory, analyzer, False,
                             IndexWriter.MaxFieldLength(512))
        doc = Document()
        doc.add(Field("Sentence", line, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("X", x, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("Y", y, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("URI", uri, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        writer.optimize()
        writer.close()
    except Exception:
        print "Unexpected error:", sys.exc_info()[0]
        raw_input("Error in updating the Sentences")
def addDocuments(self, dir, maxFieldLength):
    writer = IndexWriter(dir, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength(maxFieldLength))
    for keyword, unindexed, unstored, text in \
            izip(self.keywords, self.unindexed, self.unstored, self.text):
        doc = Document()
        doc.add(Field("id", keyword,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("country", unindexed,
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("contents", unstored,
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("city", text,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def addDocuments(self, dir):
    writer = IndexWriter(dir, self.getAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    writer.setUseCompoundFile(self.isCompound())
    for i in xrange(len(self.keywords)):
        doc = Document()
        doc.add(Field("id", self.keywords[i],
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("country", self.unindexed[i],
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("contents", self.unstored[i],
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("city", self.text[i],
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def testDeleteAfterIndexMerge(self):
    reader = IndexReader.open(self.dir, False)
    self.assertEqual(2, reader.maxDoc())
    self.assertEqual(2, reader.numDocs())
    reader.deleteDocument(1)
    reader.close()

    writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    writer.optimize()
    writer.close()

    reader = IndexReader.open(self.dir, True)
    self.assert_(not reader.isDeleted(1))
    self.assert_(not reader.hasDeletions())
    self.assertEqual(1, reader.maxDoc())
    self.assertEqual(1, reader.numDocs())
    reader.close()
class IndexCorpus(object):

    def __init__(self, index_dir, analyzer, compress=False):
        self.metadata = True
        if not os.path.exists(index_dir):
            os.mkdir(index_dir)
        self.compress = compress
        store = SimpleFSDirectory(File(index_dir))
        self.writer = IndexWriter(store, analyzer, True,
                                  IndexWriter.MaxFieldLength.LIMITED)
        self.writer.setMaxFieldLength(1048576)
        if self.compress:
            self.compressor = self.get_compressor()

    def get_compressor(self):
        path = '/Users/tal/corpus/analyzed/5/5344'
        training_data = codecs.open(path, encoding='utf8').read()
        return trained_short_string_compressor(training_data.encode('utf8'))

    def finalize(self):
        self.writer.optimize()
        self.writer.close()

    def index(self, directory):
        files = [x for x in os.listdir(directory) if x.isdigit()]
        for filename in sorted(files, key=int):
            path = os.path.join(directory, filename)
            if os.path.isdir(path):
                self.index(path)
            else:
                if int(filename) % 100 == 0:
                    print datetime.now().ctime(), filename
                try:
                    self.index_file(path)
                except Exception, e:
                    print "Indexing exception:", e
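# `index_file` is called above but not defined in this snippet; a
# hypothetical sketch, assuming one utf-8 text document per file and
# leaving the optional compressor aside (the field names are assumptions):
def index_file(self, path):
    content = codecs.open(path, encoding='utf8').read()
    doc = Document()
    doc.add(Field("path", path, Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("contents", content, Field.Store.YES, Field.Index.ANALYZED))
    self.writer.addDocument(doc)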
def luceneIndexer(docdir, indir):
    """
    Index documents from a directory.
    para: {
        docdir: the path of the txt files
        indir: the path of the index generated by the code below
    }
    """
    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXIDR = indir
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for name in os.listdir(DIRTOINDEX):
        tfile = os.path.join(DIRTOINDEX, name)
        if os.path.isfile(tfile):
            print ("Indexing: ", tfile)
            document = Document()
            content = open(tfile, 'r').read()
            document.add(Field("text", content, Field.Store.YES,
                               Field.Index.ANALYZED))
            # use the file name without its extension as the title
            document.add(Field("title", os.path.splitext(name)[0],
                               Field.Store.YES, Field.Index.ANALYZED))
            index_writer.addDocument(document)
            print ("Done: ", tfile)
    index_writer.optimize()
    print (index_writer.numDocs())
    index_writer.close()
def index(self, path_to_index, sentencearray):
    'indexes wikipedia sentences'
    lucene.initVM()
    indexDir = path_to_index
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_35)
    writer = IndexWriter(dir, analyzer, True,
                         IndexWriter.MaxFieldLength(512))
    counter = 0
    for file_name in sentencearray:
        print file_name
        f = open(file_name, "r")
        for line in f:
            counter += 1
            line = line.replace("\n", "")
            if "\t" in line:
                tmp = line.split("\t")
            else:
                tmp = [line, "0"]
            doc = Document()
            doc.add(Field("sentence", tmp[0], Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("key", tmp[1], Field.Store.YES, Field.Index.ANALYZED))
            writer.addDocument(doc)
            if counter % 1000000 == 0:
                writer.optimize()
                print counter
        writer.optimize()
        f.close()
    writer.close()
    print "Done"
    print counter
def main(cls, argv):
    if len(argv) < 2:
        print "Usage: BerkeleyDbIndexer <index dir> -create"
        return

    dbHome = argv[1]
    create = len(argv) > 2 and argv[2] == "-create"

    if not os.path.exists(dbHome):
        os.makedirs(dbHome)
    elif create:
        for name in os.listdir(dbHome):
            if name.startswith('__'):
                os.remove(os.path.join(dbHome, name))

    env = DBEnv()
    env.set_flags(DB_LOG_INMEMORY, 1)
    if os.name == 'nt':
        env.set_cachesize(0, 0x4000000, 1)
    elif os.name == 'posix':
        from commands import getstatusoutput
        if getstatusoutput('uname') == (0, 'Linux'):
            env.set_cachesize(0, 0x4000000, 1)
    env.open(dbHome, (DB_CREATE | DB_THREAD |
                      DB_INIT_MPOOL | DB_INIT_LOCK | DB_INIT_TXN), 0)

    index = DB(env)
    blocks = DB(env)
    txn = None

    try:
        txn = env.txn_begin(None)
        index.open(filename='__index__', dbtype=DB_BTREE,
                   flags=DB_CREATE | DB_THREAD, txn=txn)
        blocks.open(filename='__blocks__', dbtype=DB_BTREE,
                    flags=DB_CREATE | DB_THREAD, txn=txn)
    except:
        if txn is not None:
            txn.abort()
            txn = None
        raise
    else:
        txn.commit()
        txn = None

    try:
        txn = env.txn_begin(None)
        directory = DbDirectory(txn, index, blocks, 0)
        writer = IndexWriter(directory, StandardAnalyzer(), create,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        writer.setUseCompoundFile(False)

        doc = Document()
        doc.add(Field("contents", "The quick brown fox...",
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        writer.optimize()
        writer.close()
    except:
        if txn is not None:
            txn.abort()
            txn = None
        raise
    else:
        txn.commit()
        index.close()
        blocks.close()
        env.close()

    print "Indexing Complete"
        return StopFilter(LowerCaseFilter(LetterTokenizer(reader)),
                          self.stopWords)


if __name__ == '__main__':
    analyzer = CJKAnalyzer()
    directory = RAMDirectory()
    iwriter = IndexWriter(directory, StandardAnalyzer(), True)
    ts = ["javasd。 $##open所大家教唆犯地方地方即可解放大家空间艰苦奋斗矿井口地方",
          "所看看对抗赛不久交会法觉得拮抗剂"]
    for t in ts:
        doc = Document()
        doc.add(Field("fieldname", t, Field.Store.YES, Field.Index.TOKENIZED,
                      Field.TermVector.WITH_POSITIONS_OFFSETS))
        iwriter.addDocument(doc)
    iwriter.optimize()
    iwriter.close()

    ireader = IndexReader.open(directory)
    tpv = TermPositionVector.cast_(ireader.getTermFreqVector(0, 'fieldname'))

    for (t, f, i) in zip(tpv.getTerms(), tpv.getTermFrequencies(),
                         xrange(100000)):
        print 'term %s' % t
        print '  freq: %i' % f
        try:
            print '  pos: ' + str([p for p in tpv.getTermPositions(i)])
        except:
            print '  no pos'
        try:
            print '  off: ' + \
                  str(["%i-%i" % (o.getStartOffset(), o.getEndOffset())
                       for o in tpv.getOffsets(i)])
        except:
            print '  no off'
doc.add( Field("keywords", ' '.join((command, name, synopsis, description)), Field.Store.NO, Field.Index.ANALYZED)) doc.add( Field("filename", os.path.abspath(path), Field.Store.YES, Field.Index.NOT_ANALYZED)) writer.addDocument(doc) if __name__ == '__main__': if len(sys.argv) != 2: print "Usage: python manindex.py <index dir>" else: initVM() indexDir = sys.argv[1] writer = IndexWriter(SimpleFSDirectory(File(indexDir)), StandardAnalyzer(Version.LUCENE_CURRENT), True, IndexWriter.MaxFieldLength.LIMITED) manpath = os.environ.get('MANPATH', '/usr/share/man').split(os.pathsep) for dir in manpath: print "Crawling", dir for name in os.listdir(dir): path = os.path.join(dir, name) if os.path.isdir(path): indexDirectory(path) writer.optimize() writer.close()