def index(source, indexName): if(not os.path.exists(indexName)): os.mkdir(indexName) indexDir = File(indexName) writer = IndexWriter(SimpleFSDirectory(File(indexName)),StandardAnalyzer(Version.LUCENE_CURRENT), True,IndexWriter.MaxFieldLength.LIMITED) p = re.compile("(GH\d+\-\d+)\n(.*?)\n+", re.DOTALL) res = p.findall(source) i = 0 for pair in res: i += 1 doc = Document() doc.add(Field("id", pair[0], Field.Store.YES, Field.Index.NO)) for t in pair[1].split(): doc.add(Field("content", t.replace("-","_"), Field.Store.NO, Field.Index.NOT_ANALYZED)); #doc.add(Field("content", pair[1], Field.Store.NO, Field.Index.ANALYZED)); writer.addDocument(doc) writer.close() print str(i)+ " docs indexed"
def setUp(self):
    """Split the animal list across two RAM indexes (a-m / n-z) and open a
    searcher on each."""
    animals = ["aardvark", "beaver", "coati", "dog", "elephant", "frog",
               "gila monster", "horse", "iguana", "javelina", "kangaroo",
               "lemur", "moose", "nematode", "orca", "python", "quokka",
               "rat", "scorpion", "tarantula", "uromastyx", "vicuna",
               "walrus", "xiphias", "yak", "zebra"]
    analyzer = WhitespaceAnalyzer()
    aTOmDirectory = RAMDirectory()
    nTOzDirectory = RAMDirectory()
    aTOmWriter = IndexWriter(aTOmDirectory, analyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
    nTOzWriter = IndexWriter(nTOzDirectory, analyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
    for name in animals:
        doc = Document()
        doc.add(Field("animal", name,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        # Route by first letter: anything before 'n' goes to the a-m index.
        target = aTOmWriter if name[0].lower() < "n" else nTOzWriter
        target.addDocument(doc)
    aTOmWriter.close()
    nTOzWriter.close()
    self.searchers = [IndexSearcher(aTOmDirectory),
                      IndexSearcher(nTOzDirectory)]
def open(self, name, txn, **kwds):
    """Open the container; when created, bootstrap an empty Lucene index."""
    super(IndexContainer, self).open(name, txn, **kwds)
    if not kwds.get('create', False):
        return
    # Opening an IndexWriter with create=True writes the empty index files.
    directory = self.getDirectory()
    IndexWriter(directory, StandardAnalyzer(), True).close()
    directory.close()
def index_files(files, index_directory):
    """Create a new index at *index_directory* and feed every file through
    parse_file."""
    lucene.initVM()
    store = SimpleFSDirectory(File(index_directory))
    writer = IndexWriter(store,
                         StandardAnalyzer(Version.LUCENE_30),
                         True,
                         IndexWriter.MaxFieldLength(512))
    for path in files:
        parse_file(path, writer)
    writer.optimize()
    writer.close()
def index(self,path_to_index,path_files): 'indexes anchor texts from a given folder' #lucene.initVM() indexDir = path_to_index directory_index = SimpleFSDirectory(File(indexDir)) analyzer = StandardAnalyzer(Version.LUCENE_35) writer = IndexWriter(directory_index, analyzer, True, IndexWriter.MaxFieldLength(512)) listOfPathes = [] listOfPathes.extend(glob.glob(path_files+"*.txt")) counter = 0 for path_to_file in listOfPathes: print path_to_file f = open(path_to_file,"r") for line in f: entry = line.split("\t") counter+=1 """ optimizes index after a certain amount of added documents """ if counter%500000==0: print counter writer.optimize() doc = Document() doc.add(Field("anchor", entry[0], Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("anchor_uri", entry[1], Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("dbpedia_uri", entry[2], Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("number", entry[3].replace("\n",""), Field.Store.YES, Field.Index.ANALYZED)) writer.addDocument(doc) writer.optimize() f.close() writer.close() print counter print "done"
def createIndex():
    """Index the text of every file under html_files/ into
    /Tmp/REMOVEME.index-dir."""
    print("started indexer")
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"
    writer = IndexWriter(lucene.SimpleFSDirectory(lucene.File(indexDir)),
                         StandardAnalyzer(Version.LUCENE_30),
                         True, IndexWriter.MaxFieldLength(512))
    src_dir = 'html_files'
    count = 0
    for name in os.listdir(src_dir):
        path = os.path.join(src_dir, name)
        with open(path, 'r') as fh:
            raw = fh.read()
        count += 1
        document, errors = parsehtml(raw)
        doc = Document()
        doc.add(Field("text", document,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def addDocuments(self, dir, isCompound):
    """Populate *dir* with one four-field document per word in self.docs."""
    writer = IndexWriter(dir, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.LIMITED)
    writer.setUseCompoundFile(isCompound)
    # Tuning knobs (mergeFactor / maxMergeDocs / minMergeDocs) left at
    # their defaults.
    field_specs = [
        ("keyword", Field.Store.YES, Field.Index.NOT_ANALYZED),
        ("unindexed", Field.Store.YES, Field.Index.NO),
        ("unstored", Field.Store.NO, Field.Index.ANALYZED),
        ("text", Field.Store.YES, Field.Index.ANALYZED),
    ]
    for word in self.docs:
        doc = Document()
        for fname, store, idx in field_specs:
            doc.add(Field(fname, word, store, idx))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def addContents(self,contents): try: #iwconfig = IndexWriterConfig(SimpleAnalyzer(),IndexWriter.MaxFieldLength.LIMITED) writer = IndexWriter(self.ramIndex,SimpleAnalyzer(Version.LUCENE_CURRENT),True,IndexWriter.MaxFieldLength.LIMITED) for content in contents: doc = Document() doc.add(Field("contents",content[1],Field.Store.NO,Field.Index.ANALYZED,Field.TermVector.YES)) writer.addDocument(doc) writer.close() except Exception,e: print 'Unable to add content to RAM index'
def setUp(self):
    """Index one sample sentence and hold a searcher on it."""
    directory = RAMDirectory()
    writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    sample = Document()
    sample.add(Field("field",
                     "the quick brown fox jumped over the lazy dog",
                     Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(sample)
    writer.close()
    self.searcher = IndexSearcher(directory)
def does_index_exists(self,path_to_index): """ Checks if Index already exists, returns True or False """ try: index_directory = SimpleFSDirectory(File(path_to_index)) analyzer = StandardAnalyzer(Version.LUCENE_35) writer = IndexWriter(index_directory, analyzer, False, IndexWriter.MaxFieldLength(512)) writer.close() print path_to_index+" exists" return True except: return False
def addDocuments(self, dir):
    """Write one document per entry of self.docs into *dir*."""
    writer = IndexWriter(dir, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    # Indexing-performance knobs (mergeFactor etc.) intentionally untouched.
    specs = (("keyword", Field.Store.YES, Field.Index.NOT_ANALYZED),
             ("unindexed", Field.Store.YES, Field.Index.NO),
             ("unstored", Field.Store.NO, Field.Index.ANALYZED),
             ("text", Field.Store.YES, Field.Index.ANALYZED))
    for value in self.docs:
        doc = Document()
        for fname, store, idx in specs:
            doc.add(Field(fname, value, store, idx))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def index(string):
    """Append *string* as one document to REMOVEME.index-dir, creating the
    index on first use."""
    lucene.initVM()
    indexDir = "REMOVEME.index-dir"
    store = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    try:
        # Prefer appending to an existing index ...
        writer = IndexWriter(store, analyzer, False,
                             IndexWriter.MaxFieldLength(512))
    except lucene.JavaError:
        # ... and fall back to creating a fresh one.
        writer = IndexWriter(store, analyzer, True,
                             IndexWriter.MaxFieldLength(512))
    doc = Document()
    doc.add(Field("text", string, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.optimize()
    writer.close()
def testUpdate(self):
    # Lucene has no in-place update here: the old document is deleted via
    # an IndexReader, then the replacement is added with an IndexWriter.
    self.assertEqual(1, self.getHitCount("city", "Amsterdam"))
    reader = IndexReader.open(self.dir, False)
    reader.deleteDocuments(Term("city", "Amsterdam"))
    reader.close()
    writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    doc = Document()
    doc.add(Field("id", "1", Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("country", "Russia", Field.Store.YES, Field.Index.NO))
    doc.add(
        Field("contents", "St. Petersburg has lots of bridges",
              Field.Store.NO, Field.Index.ANALYZED))
    doc.add(
        Field("city", "St. Petersburg", Field.Store.YES,
              Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.optimize()
    writer.close()
    # The old city is gone and the replacement is searchable.
    self.assertEqual(0, self.getHitCount("city", "Amsterdam"))
    self.assertEqual(1, self.getHitCount("city", "Petersburg"))
def setUp(self):
    """Index two owner/keywords documents (elwood, jake)."""
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    people = (("elwood", "elwoods sensitive info"),
              ("jake", "jakes sensitive info"))
    for owner, keywords in people:
        document = Document()
        document.add(Field("owner", owner,
                           Field.Store.YES, Field.Index.NOT_ANALYZED))
        document.add(Field("keywords", keywords,
                           Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(document)
    writer.close()
def indexDocuments(): # empty index directory indexDir = Wikipedia.directory + 'index/' for filename in os.listdir(indexDir): os.remove(indexDir + filename) # index documents lucene.initVM() version = Version.LUCENE_CURRENT analyzer = EnglishAnalyzer(version) writer = IndexWriter(SimpleFSDirectory(File(indexDir)), analyzer, True, IndexWriter.MaxFieldLength.LIMITED) for article in Wikipedia(): doc = Document() doc.add( Field('id', str(article['id'][0]), Field.Store.YES, Field.Index.NOT_ANALYZED)) doc.add( Field('title', article['url'], Field.Store.YES, Field.Index.NOT_ANALYZED)) doc.add( Field('content', article['text'], Field.Store.NO, Field.Index.ANALYZED)) writer.addDocument(doc) print 'Optimization' writer.optimize() writer.close()
def setUp(self):
    """Two fox sentences indexed; a SpanTermQuery is prebuilt per word."""
    self.directory = RAMDirectory()
    self.analyzer = WhitespaceAnalyzer()
    writer = IndexWriter(self.directory, self.analyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    for sentence in ("the quick brown fox jumps over the lazy dog",
                     "the quick red fox jumps over the sleepy cat"):
        doc = Document()
        doc.add(Field("f", sentence,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.close()
    self.searcher = IndexSearcher(self.directory, True)
    self.reader = IndexReader.open(self.directory, True)
    # One attribute per query term: self.quick, self.brown, ...
    for word in ("quick", "brown", "red", "fox",
                 "lazy", "sleepy", "dog", "cat"):
        setattr(self, word, SpanTermQuery(Term("f", word)))
def setUp(self):
    """One part document (partnum + description) behind self.searcher."""
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    part = Document()
    part.add(Field("partnum", "Q36",
                   Field.Store.YES, Field.Index.NOT_ANALYZED))
    part.add(Field("description", "Illidium Space Modulator",
                   Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(part)
    writer.close()
    self.searcher = self.getSearcher()
def setUp(self):
    """Index four restaurant locations and prepare a type:restaurant query."""
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    places = (("El Charro", 1, 2),
              ("Cafe Poca Cosa", 5, 9),
              ("Los Betos", 9, 6),
              ("Nico's Taco Shop", 3, 8))
    for name, x, y in places:
        self.addPoint(writer, name, "restaurant", x, y)
    writer.close()
    self.searcher = IndexSearcher(self.directory, True)
    self.query = TermQuery(Term("type", "restaurant"))
def create_index(self,path_to_index): """ Creates new Index """ print "Create new Index" path = SimpleFSDirectory(File(path_to_index)) analyzer = StandardAnalyzer(Version.LUCENE_35) writer = IndexWriter(path, analyzer, True, IndexWriter.MaxFieldLength(512)) doc = Document() doc.add(Field("Sentence", "Hello World", Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("X", "x", Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("Y", "y", Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("URI", "uri", Field.Store.YES, Field.Index.ANALYZED)) writer.addDocument(doc) writer.close()
def testWriteLock(self):
    """A second IndexWriter on the same directory must fail to get the
    write lock."""
    writer1 = IndexWriter(self.dir, SimpleAnalyzer(),
                          IndexWriter.MaxFieldLength.UNLIMITED)
    writer2 = None
    try:
        try:
            writer2 = IndexWriter(self.dir, SimpleAnalyzer(),
                                  IndexWriter.MaxFieldLength.UNLIMITED)
            self.fail("We should never reach this point")
        except AssertionError:
            # Fix: the bare 'except:' used to swallow the AssertionError
            # raised by self.fail(), defeating that failure path.
            raise
        except:
            # Expected path: constructing writer2 raises while the lock
            # is held by writer1.
            pass
    finally:
        writer1.close()
    self.assert_(writer2 is None)
def index(self):
    # Write a throwaway index under the JVM temp dir with Lucene's verbose
    # infoStream wired to System.out, so merge activity is visible.
    dirPath = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
                           "verbose-index")
    dir = FSDirectory.getDirectory(dirPath, True)
    writer = IndexWriter(dir, SimpleAnalyzer(), True)
    writer.setInfoStream(System.out)
    # 100 identical single-field documents are enough to trigger merges.
    for i in xrange(100):
        doc = Document()
        doc.add(Field("keyword", "goober",
                      Field.Store.YES, Field.Index.UN_TOKENIZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def __enter__(self):
    """Open a writer over the shared crowd index; create it on first run."""
    self.searcher = CrowdSearcher()
    # create=True only when the index directory is missing on disk.
    create = not os.path.exists(settings.lucene_index_dir)
    self.writer = IndexWriter(self.searcher.index,
                              self.searcher.analyzer,
                              create,
                              IndexWriter.MaxFieldLength.UNLIMITED)
    return self
def luceneIndexer(docdir,indir): """ IndexDocuments from a directory. Args: docdir:文档所在文件夹 indir:索引存放文件夹 Returns: 无返回值 说明: FieldType().setStored=as-is value stored in the Lucene index FieldType().setTokenized=field is analyzed using the specified Analyzer - the tokens emitted are indexed FieldType().Indexed = the text (either as-is with keyword fields, or the tokens from tokenized fields) is made searchable (aka inverted) FieldType().Vectored = term frequency per document is stored in the index in an easily retrievable fashion. """ """#类型1属性:对于需要检索,需要返回显示setStored(True) type1 = FieldType() type1.setIndexed(True) type1.setStored(True) type1.setTokenized(False) type1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) #类型2属性:对于不用返回显示,但是需要进行检索的字段。这里我认为文本内容(content)是这一种的,通常例如文件的META信息。 type2 = FieldType() type2.setIndexed(True) type2.setStored(False) type2.setTokenized(True) type2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)""" lucene.initVM() DIRTOINDEX= docdir INDEXIDR= indir indexdir= SimpleFSDirectory(File(INDEXIDR)) analyzer= StandardAnalyzer(Version.LUCENE_30) #用指定的语言分析器构造一个新的写索引器. index_writer= IndexWriter(indexdir,analyzer,True,IndexWriter.MaxFieldLength(512)) for tfile in glob.glob(os.path.join(DIRTOINDEX,'*.txt')): #print "Indexing: " print "Indexing:", tfile; document = Document() content = open(tfile,'r').read() #类型使用方式 #doc.add(Field("path", tfile, type1)) #文档新增字段(Field){字段名:"text",存储:“YES”,索引:"YES"} document.add(Field("text",content,Field.Store.YES,Field.Index.ANALYZED)) document.add(Field("path",tfile,Field.Store.YES,Field.Index.ANALYZED)) index_writer.addDocument(document) print "Done: ", tfile index_writer.optimize() print index_writer.numDocs() index_writer.close()
def do_index():
    """Index recent Doc rows (dateCreated > "20121220") from the database
    into a filesystem Lucene index at a hard-coded path."""
    initVM()
    indexDir = "/home/william/woyaoo/luceneindex"
    version = Version.LUCENE_CURRENT
    standardAnalyzer = StandardAnalyzer(version)
    # chineseAnalyzer = CJKAnalyzer(version)
    engine = data.engine_from_config("indexdb.config")
    # engine = data.engine_from_config()
    db = data.init_datafactory(engine)
    docs = dbfactory.Session().query(doc_model.Doc).filter(doc_model.Doc.dateCreated > "20121220").all()
    print len(docs)
    idxDir = SimpleFSDirectory(File(indexDir))
    # NOTE(review): perIndexCount is never used — looks like a leftover
    # batching knob; confirm before removing.
    perIndexCount = 5000
    writer = IndexWriter(idxDir, standardAnalyzer, True,
                         IndexWriter.MaxFieldLength(512))
    # add field
    for doc in docs:
        # print repr(doc.description)
        lucenedoc = Document()
        # Strip CR/LF padding and index the description as UTF-8 bytes.
        descriptionValue = doc.description.strip("\r\n").encode("UTF-8")
        # descriptionValue ='中国 abc'
        print repr(descriptionValue)
        lucenedoc.add(Field("url", doc.url,
                            Field.Store.YES, Field.Index.NOT_ANALYZED))
        lucenedoc.add(Field("intent", doc.intent,
                            Field.Store.YES, Field.Index.NOT_ANALYZED))
        # lucenedoc.add(Field('description', doc.description, Field.Store.YES, Field.Index.ANALYZED))
        lucenedoc.add(Field("description", descriptionValue,
                            Field.Store.YES, Field.Index.ANALYZED))
        lucenedoc.add(Field("title", doc.title,
                            Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(lucenedoc)
    writer.optimize()
    writer.close()
    print "index finished"
def update_index_withLineArray(self, array):
    """
    Parsed sentences (given in an array) are added to the index, with the
    corresponding two entities (x,y) and the DBpedia URI
    """
    print "start adding sentences"
    # NOTE(review): index_directory and analyzer are not defined locally —
    # presumably module-level globals; confirm before reuse.
    writer = IndexWriter(index_directory, analyzer, False,
                         IndexWriter.MaxFieldLength(512))
    for item in array:
        line = item[0]
        x = item[1]
        y = item[2]
        uri = item[3]
        # Normalize whitespace so the stored sentence is single-spaced.
        line = line.replace("\t", " ")
        line = line.replace("\n", " ")
        line = line.replace("  ", " ")
        try:
            doc = Document()
            doc.add(Field("Sentence", line,
                          Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("X", x, Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("Y", y, Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("URI", uri,
                          Field.Store.YES, Field.Index.ANALYZED))
            writer.addDocument(doc)
        except Exception:
            # Interactive stop on a bad record: report and wait for Enter.
            print "Unexpected error:", sys.exc_info()[0]
            raw_input("Error in updating the Sentences")
    try:
        writer.optimize()
    except:
        print "Unexpected error:", sys.exc_info()[0]
        print ("could not optimize index")
    writer.close()
    print "all sentences added"
def index(self):
    """Index all Article elements from the .xml files in self.DOC_DIR.

    Per article: id, stemmed title/abstract/content, keyword, authors, and
    a combined 'article' field for full-text search.
    """
    lucene.initVM()
    indexdir = SimpleFSDirectory(File(self.INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for in_file in glob.glob(os.path.join(self.DOC_DIR, '*.xml')):
        # Fix: close the input file instead of leaking the handle.
        with codecs.open(in_file, encoding='utf-8') as fh:
            corpus = fh.read()
        d = pq(corpus, parser='html')
        for text in d('Article').items():
            document = Document()
            # find ID ('+' normalized to '-')
            art_id = str(text.attr('articleid').encode('utf-8')).replace('+', '-')
            # find Title
            art_title = self.stem(str(text.attr('title').encode('utf-8')))
            # find Abstract
            art_abstract = self.stem(str(text.find('Abstract').html().encode('utf-8')))
            # find Keyword
            art_keyword = text.find('Keyword').html().encode('utf-8')
            # find Content
            art_content = self.stem(str(text.find('Content').html().encode('utf-8')))
            # find Authors
            art_authors = text.find('Authors').html().encode('utf-8')
            document.add(Field('id', art_id,
                               Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('title', art_title,
                               Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('abstract', art_abstract,
                               Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('keyword', art_keyword,
                               Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('content', art_content,
                               Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('authors', art_authors,
                               Field.Store.YES, Field.Index.ANALYZED))
            # Combined field so one query can hit everything at once.
            document.add(Field('article',
                               art_title + art_abstract + art_keyword + art_content,
                               Field.Store.YES,
                               Field.Index.ANALYZED))
            index_writer.addDocument(document)
    index_writer.optimize()
    index_writer.close()
def setUp(self):
    """Index the two fox sentences and prepare one SpanTermQuery per word."""
    self.directory = RAMDirectory()
    self.analyzer = WhitespaceAnalyzer()
    writer = IndexWriter(self.directory, self.analyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    first = Document()
    first.add(Field("f", "the quick brown fox jumps over the lazy dog",
                    Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(first)
    second = Document()
    second.add(Field("f", "the quick red fox jumps over the sleepy cat",
                     Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(second)
    writer.close()
    self.searcher = IndexSearcher(self.directory, True)
    self.reader = IndexReader.open(self.directory, True)
    span = lambda w: SpanTermQuery(Term("f", w))
    self.quick = span("quick")
    self.brown = span("brown")
    self.red = span("red")
    self.fox = span("fox")
    self.lazy = span("lazy")
    self.sleepy = span("sleepy")
    self.dog = span("dog")
    self.cat = span("cat")
class CrowdIndexer():
    """Context manager that opens an IndexWriter over the crowd index and
    upserts (id, text) documents."""

    def __enter__(self):
        self.searcher = CrowdSearcher()
        # Create the index only if it does not exist on disk yet.
        mkdir = not os.path.exists(settings.lucene_index_dir)
        self.writer = IndexWriter(self.searcher.index,
                                  self.searcher.analyzer,
                                  mkdir,
                                  IndexWriter.MaxFieldLength.UNLIMITED)
        return self

    def addCrowd(self, id, text):
        """Insert or replace the document whose id field equals *id*."""
        doc = Document()
        doc.add(Field(CrowdFields.id, id,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field(CrowdFields.text, text,
                      Field.Store.YES, Field.Index.ANALYZED))
        # updateDocument = atomic delete-by-term followed by add.
        self.writer.updateDocument(Term(CrowdFields.id, id), doc)

    def __exit__(self, type, value, traceback):
        # Fix: parameter typo 'traecback' corrected; __exit__ is invoked
        # positionally by the interpreter, so the rename is safe.
        self.writer.close()
def commitIndexWriter(self, writer):
    # Merge a per-transaction index into the database-backed master index:
    # close the incoming writer first so its segments are flushed, then
    # bulk-add its directory into the master via addIndexes.
    directory = writer.getDirectory()
    writer.close()
    dbDirectory = self.getDirectory()
    dbWriter = IndexWriter(dbDirectory, StandardAnalyzer(), False)
    # Compound files disabled — presumably to match the master index's
    # existing segment layout; confirm before changing.
    dbWriter.setUseCompoundFile(False)
    dbWriter.addIndexes([directory])
    directory.close()
    dbWriter.close()
    dbDirectory.close()
def indexSingleFieldDocs(self, fields):
    """Write one single-field document per entry of *fields*."""
    writer = IndexWriter(self.directory, WhitespaceAnalyzer(),
                         True, IndexWriter.MaxFieldLength.UNLIMITED)
    for single in fields:
        wrapper = Document()
        wrapper.add(single)
        writer.addDocument(wrapper)
    writer.optimize()
    writer.close()
def __init__(self, root, storeDir, analyzer):
    # Build a filesystem index of everything under *root* (the actual
    # per-file work happens in self.indexDocs).
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    store = FSDirectory.getDirectory(storeDir, True)
    writer = IndexWriter(store, analyzer, True)
    # Allow very large documents (the default field length is much smaller).
    writer.setMaxFieldLength(1048576)
    self.indexDocs(root, writer)
    # Trailing comma keeps 'done' on the same console line.
    print 'optimizing index',
    writer.optimize()
    writer.close()
    print 'done'
def index_files(files, index_directory):
    """Build a brand-new index at *index_directory* from the given files."""
    lucene.initVM()
    writer = IndexWriter(SimpleFSDirectory(File(index_directory)),
                         StandardAnalyzer(Version.LUCENE_30),
                         True,
                         IndexWriter.MaxFieldLength(512))
    for source in files:
        parse_file(source, writer)
    writer.optimize()
    writer.close()
class CrowdIndexer():
    """with-statement helper that upserts crowd documents keyed by id."""

    def __enter__(self):
        self.searcher = CrowdSearcher()
        needs_create = not os.path.exists(settings.lucene_index_dir)
        self.writer = IndexWriter(self.searcher.index,
                                  self.searcher.analyzer,
                                  needs_create,
                                  IndexWriter.MaxFieldLength.UNLIMITED)
        return self

    def addCrowd(self, id, text):
        """Add or overwrite the document keyed by *id*."""
        doc = Document()
        for fld, val, idx in ((CrowdFields.id, id, Field.Index.NOT_ANALYZED),
                              (CrowdFields.text, text, Field.Index.ANALYZED)):
            doc.add(Field(fld, val, Field.Store.YES, idx))
        self.writer.updateDocument(Term(CrowdFields.id, id), doc)

    def __exit__(self, type, value, traecback):
        self.writer.close()
def someMethod(self):
    # Demonstration of the pre-2.x Lucene API surface: Field.Text /
    # Field.UnStored factory methods and the static QueryParser.parse.
    directory = RAMDirectory()
    analyzer = StandardAnalyzer()
    writer = IndexWriter(directory, analyzer, True)
    doc = Document()
    doc.add(Field.Text("title", "This is the title"))
    doc.add(Field.UnStored("contents", "...document contents..."))
    writer.addDocument(doc)
    # NOTE(review): the same document is added a second time, here with an
    # explicit per-document analyzer — presumably intentional for the demo.
    writer.addDocument(doc, analyzer)
    expression = "some query"
    query = QueryParser.parse(expression, "contents", analyzer)
    # Instance-based parsing alternative; 'query' is rebound.
    parser = QueryParser("contents", analyzer)
    query = parser.parseQuery(expression)
def __init__(self, storeDir, analyzer):
    # Modern (Lucene 3.x-style) writer setup via IndexWriterConfig.
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    store = SimpleFSDirectory(File(storeDir))
    # Cap tokens per field to avoid unbounded memory on huge documents.
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)
    self.indexDocs(writer)
    ticker = Ticker()
    print( 'optimizing index' ),
    # The Ticker runs on a background thread while commit does the work;
    # clearing ticker.tick afterwards stops it.
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    ticker.tick = False
    print( 'done' )
def Indexer(docdir,indir): lucene.initVM() DIRTOINDEX = docdir INDEXDIR = indir indexdir = FSDirectory(File(INDEXDIR)) analyzer = StandardAnalyzer(VERSION.LUCENE_30) index_writer = IndexWriter(indexdir,analyzer,True,IndexWriter.MaxFieldLength(512)) for tfile in glob.glob(os.path.join(DIRTOINDEX,'*.txt')): print "Indexing ",tfile document=Document() content = open(tfile,'r').read() document.add(Field("text",content,Field.Store.YES,Field.Index.ANALYZED)) index_writer.addDocument(document) print "Done" index_writer.optimize() print index_writer.numDocs() index_writer.close()
def index(self):
    """Write 100 identical docs to a temp-dir index with verbose logging."""
    target = os.path.join(tempfile.gettempdir(), "verbose-index")
    store = FSDirectory.open(target)
    writer = IndexWriter(store, SimpleAnalyzer(), True)
    writer.setInfoStream(InfoStreamOut())
    for _ in xrange(100):
        entry = Document()
        entry.add(Field("keyword", "goober",
                        Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(entry)
    writer.optimize()
    writer.close()
def Index(): field_list, conn, _config_dict = _InitIndexer() indexDir = _config_dict["indexDir"] if not os.path.exists(indexDir): os.mkdir(indexDir) store = SimpleFSDirectory(lucene.File(indexDir)) # print store writer = IndexWriter( store, SmartChineseAnalyzer(lucene.Version.LUCENE_CURRENT), True, IndexWriter.MaxFieldLength.LIMITED ) writer.setMaxFieldLength(1048576) try: ticker = Ticker() ticker.start() _IndexDocs(writer, field_list, conn) ticker.end() ticker.TimeCost() except Exception, e: print "Failed in Indexing...", e traceback.print_exc()
def Index():
    """Index database documents with the SmartChineseAnalyzer.

    Configuration (index directory), DB connection and field list come from
    _InitIndexer(); the actual per-document work happens in _IndexDocs.
    """
    field_list, conn, _config_dict = _InitIndexer()
    indexDir = _config_dict['indexDir']
    if not os.path.exists(indexDir):
        os.mkdir(indexDir)
    store = SimpleFSDirectory(lucene.File(indexDir))
    #print store
    writer = IndexWriter(store,
                         SmartChineseAnalyzer(lucene.Version.LUCENE_CURRENT),
                         True, IndexWriter.MaxFieldLength.LIMITED)
    # Raise the per-field token cap well above the default.
    writer.setMaxFieldLength(1048576)
    try:
        ticker = Ticker()
        ticker.start()
        _IndexDocs(writer, field_list, conn)
        ticker.end()
        ticker.TimeCost()
    except Exception, e:
        # Report and continue; NOTE(review): the writer is left unclosed on
        # failure — consider a finally: writer.close().
        print "Failed in Indexing...", e
        traceback.print_exc()
class BuildIndex:
    """Incremental index builder for Chinese text (id/title/content)."""

    def __init__(self, indir):
        lucene.initVM()
        store = SimpleFSDirectory(File(indir))
        self.index_writer = IndexWriter(store, self.getAnalyzer(), True,
                                        IndexWriter.MaxFieldLength(512))

    def getAnalyzer(self):
        """Analyzer shared by the title and content fields."""
        return ChineseAnalyzer(lucene.Version.LUCENE_CURRENT)

    def addDocuments(self, _id, title, content):
        """Add one document; empty or None title/content fields are skipped."""
        doc = Document()
        doc.add(Field("id", _id, Field.Store.YES, Field.Index.NOT_ANALYZED))
        if title:
            doc.add(Field("titleKeyword", title,
                          Field.Store.NO, Field.Index.ANALYZED))
        if content:
            doc.add(Field("contentKeyword", content,
                          Field.Store.NO, Field.Index.ANALYZED))
        self.index_writer.addDocument(doc)

    def close(self):
        """Optimize and release the underlying writer."""
        self.index_writer.optimize()
        self.index_writer.close()
def testDeleteAfterIndexMerge(self):
    # Delete doc 1 through a reader, then open a writer and commit; the
    # assertions below verify the deletion has been fully applied.
    reader = IndexReader.open(self.dir, False)
    self.assertEqual(2, reader.maxDoc())
    self.assertEqual(2, reader.numDocs())
    reader.deleteDocument(1)
    reader.close()
    writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    writer.commit()
    writer.close()
    reader = IndexReader.open(self.dir, True)
    deletedDocs = MultiFields.getDeletedDocs(reader)
    # No deletions remain and the doc-id space has been compacted to 1.
    self.assert_(deletedDocs is None or not deletedDocs.get(1))
    self.assert_(not reader.hasDeletions())
    self.assertEqual(1, reader.maxDoc())
    self.assertEqual(1, reader.numDocs())
    reader.close()
def createIndex(cls, dataDir, indexDir, useCompound):
    """Index every .properties file under *dataDir* into *indexDir*."""
    store = SimpleFSDirectory(File(indexDir))
    writer = IndexWriter(store,
                         StandardAnalyzer(Version.LUCENE_CURRENT),
                         True, IndexWriter.MaxFieldLength.UNLIMITED)
    writer.setUseCompoundFile(useCompound)
    for root, _dirnames, filenames in os.walk(dataDir):
        for name in (n for n in filenames if n.endswith('.properties')):
            cls.indexFile(writer, os.path.join(root, name), dataDir)
    writer.optimize()
    writer.close()
def index(cls, indexDir, dataDir): if not (os.path.exists(dataDir) and os.path.isdir(dataDir)): raise IOError, "%s does not exist or is not a directory" % ( dataDir) dir = SimpleFSDirectory(File(indexDir)) writer = IndexWriter(dir, StandardAnalyzer(Version.LUCENE_CURRENT), True, IndexWriter.MaxFieldLength.LIMITED) writer.setUseCompoundFile(False) cls.indexDirectory(writer, dataDir) numIndexed = writer.numDocs() writer.optimize() writer.close() dir.close() return numIndexed
def addDocuments(self, dir, maxFieldLength):
    """Index the parallel keyword/unindexed/unstored/text lists into *dir*,
    truncating fields at *maxFieldLength* tokens."""
    writer = IndexWriter(dir, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength(maxFieldLength))
    rows = izip(self.keywords, self.unindexed, self.unstored, self.text)
    for keyword, unindexed, unstored, text in rows:
        doc = Document()
        doc.add(Field("id", keyword,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("country", unindexed,
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("contents", unstored,
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("city", text,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def index(self, doc, title, department, url):
    """Add one page (title/url/department/text) to the index; the first
    call creates the index, later calls append."""
    store = SimpleFSDirectory(File(self.indir))
    writer = IndexWriter(store, StandardAnalyzer(Version.LUCENE_30),
                         self.init, IndexWriter.MaxFieldLength(512))
    self.init = False
    page = Document()
    for fname, value in (("title", title),
                         ("url", url),
                         ("department", department),
                         ("text", doc)):
        page.add(Field(fname, value, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(page)
    writer.optimize()
    writer.close()
def addDocuments(self, dir):
    """Populate *dir* from the parallel keywords/unindexed/unstored/text
    lists."""
    writer = IndexWriter(dir, self.getAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    writer.setUseCompoundFile(self.isCompound())
    for i, keyword in enumerate(self.keywords):
        doc = Document()
        doc.add(Field("id", keyword,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("country", self.unindexed[i],
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("contents", self.unstored[i],
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("city", self.text[i],
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def luceneIndexer(docdir, indir):
    """Index every regular file directly under *docdir* into a new index
    at *indir*.

    Stores two analyzed fields per file: 'text' (contents) and 'title'
    (file path without its extension).
    """
    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXIDR = indir
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True, \
                               IndexWriter.MaxFieldLength(512))
    # Iterate directory entries directly (the old code shadowed the builtin
    # 'list' and looped over range(len(...))).
    for entry in os.listdir(DIRTOINDEX):
        tfile = os.path.join(DIRTOINDEX, entry)
        if not os.path.isfile(tfile):
            continue
        print ("Indexing: ", tfile)
        print ('okokokook')
        document = Document()
        # Fix: close the file handle instead of leaking it.
        with open(tfile, 'r') as fh:
            content = fh.read()
        document.add(Field("text", content, Field.Store.YES, \
                           Field.Index.ANALYZED))
        # Fix: str.strip('.txt') strips any of the characters '.', 't', 'x'
        # from both ends (e.g. 'text.txt' -> 'e'); splitext removes only
        # the extension.
        document.add(Field("title", str(os.path.splitext(tfile)[0]),
                           Field.Store.YES, Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print ("Done: ", tfile)
    index_writer.optimize()
    print (index_writer.numDocs())
    index_writer.close()
def main(cls, argv):
    # Build a T9 lookup index from an existing WordNet word index: each
    # word gets its digit-key encoding ('t9') and its length.
    if len(argv) != 3:
        print "Usage: T9er <WordNet index dir> <t9 index>"
        return
    # Flatten cls.keys into a char -> digit map; presumably entries look
    # like "2abc" (digit followed by its letters) — confirm against cls.keys.
    for key in cls.keys:
        c = key[0]
        k = key[1:]
        for kc in k:
            cls.keyMap[kc] = c
            print kc, "=", c
    indexDir = argv[1]
    t9dir = argv[2]
    reader = IndexReader.open(indexDir)
    numDocs = reader.maxDoc()
    print "Processing", numDocs, "words"
    writer = IndexWriter(t9dir, WhitespaceAnalyzer(), True)
    for id in xrange(reader.maxDoc()):
        origDoc = reader.document(id)
        word = origDoc.get("word")
        # Skip documents without a usable word.
        if word is None or len(word) == 0:
            continue
        newDoc = Document()
        newDoc.add(
            Field("word", word,
                  Field.Store.YES, Field.Index.UN_TOKENIZED))
        newDoc.add(
            Field("t9", cls.t9(word),
                  Field.Store.YES, Field.Index.UN_TOKENIZED))
        newDoc.add(
            Field("length", str(len(word)),
                  Field.Store.NO, Field.Index.UN_TOKENIZED))
        writer.addDocument(newDoc)
        # Progress marker every 100 documents.
        if id % 100 == 0:
            print "Document", id
    writer.optimize()
    writer.close()
    reader.close()
def __init__(self):
    self.save = Field.Store.YES
    self.ddict = dict()
    lvm.attachCurrentThread()
    # The store/writer pair is shared class-wide; only the first instance
    # opens them.
    if not WorkflowIndexer.store and not WorkflowIndexer.writer:
        try:
            # open lucene index
            WorkflowIndexer.store = FSDirectory.open(File(location))
            # if the index directory does not exist, create it
            # (create=True exactly when the directory listing is empty).
            WorkflowIndexer.writer = IndexWriter(
                WorkflowIndexer.store, vistrailAnalyzer(),
                not len(WorkflowIndexer.store.list()))
        except Exception, e:
            # Release whatever was opened before re-raising.
            print "EXCEPTION", e
            self.close()
            raise
def setUp(self):
    """One document run through the Porter analyzer, left in
    self.directory."""
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, self.porterAnalyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    sample = Document()
    sample.add(Field("contents",
                     "The quick brown fox jumps over the lazy dogs",
                     Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(sample)
    writer.close()
def setUp(self):
    """Two fox sentences in a RAM index with a searcher over them."""
    directory = RAMDirectory()
    writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    for sentence in ("the quick brown fox jumped over the lazy dog",
                     "the fast fox hopped over the hound"):
        entry = Document()
        entry.add(Field("field", sentence,
                        Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(entry)
    writer.close()
    self.searcher = IndexSearcher(directory, True)
def setUp(self):
    """Single-document fixture over the classic fox sentence."""
    directory = RAMDirectory()
    writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    fixture = Document()
    fixture.add(Field("field",
                      "the quick brown fox jumped over the lazy dog",
                      Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(fixture)
    writer.close()
    self.searcher = IndexSearcher(directory)
def setUp(self):
    """Index 500 documents whose ids are the zero-padded numbers 1..500."""
    self.analyzer = WhitespaceAnalyzer()
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, self.analyzer, True,
                         IndexWriter.MaxFieldLength.LIMITED)
    for n in xrange(1, 501):
        entry = Document()
        entry.add(Field("id", NumberUtils.pad(n),
                        Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(entry)
    writer.close()
def setUp(self):
    """One partnum/description document searchable via self.searcher."""
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    item = Document()
    item.add(Field("partnum", "Q36",
                   Field.Store.YES, Field.Index.NOT_ANALYZED))
    item.add(Field("description", "Illidium Space Modulator",
                   Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(item)
    writer.close()
    self.searcher = IndexSearcher(self.directory, True)