def addDocuments(self, dir):
    writer = IndexWriter(dir, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)

    #
    # change to adjust performance of indexing with FSDirectory
    # writer.mergeFactor = writer.mergeFactor
    # writer.maxMergeDocs = writer.maxMergeDocs
    # writer.minMergeDocs = writer.minMergeDocs
    #

    for word in self.docs:
        doc = Document()
        doc.add(Field("keyword", word,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("unindexed", word,
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("unstored", word,
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("text", word,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
def do_index():
    initVM()
    indexDir = "/home/william/woyaoo/luceneindex"
    version = Version.LUCENE_CURRENT
    standardAnalyzer = StandardAnalyzer(version)
    # chineseAnalyzer = CJKAnalyzer(version)
    engine = data.engine_from_config("indexdb.config")
    # engine = data.engine_from_config()
    db = data.init_datafactory(engine)
    docs = dbfactory.Session().query(doc_model.Doc).filter(
        doc_model.Doc.dateCreated > "20121220").all()
    print len(docs)
    idxDir = SimpleFSDirectory(File(indexDir))
    perIndexCount = 5000
    writer = IndexWriter(idxDir, standardAnalyzer, True,
                         IndexWriter.MaxFieldLength(512))
    # add fields
    for doc in docs:
        # print repr(doc.description)
        lucenedoc = Document()
        descriptionValue = doc.description.strip("\r\n").encode("UTF-8")
        # descriptionValue = '中国 abc'
        print repr(descriptionValue)
        lucenedoc.add(Field("url", doc.url,
                            Field.Store.YES, Field.Index.NOT_ANALYZED))
        lucenedoc.add(Field("intent", doc.intent,
                            Field.Store.YES, Field.Index.NOT_ANALYZED))
        # lucenedoc.add(Field('description', doc.description, Field.Store.YES, Field.Index.ANALYZED))
        lucenedoc.add(Field("description", descriptionValue,
                            Field.Store.YES, Field.Index.ANALYZED))
        lucenedoc.add(Field("title", doc.title,
                            Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(lucenedoc)
    writer.optimize()
    writer.close()
    print "index finished"
def index(string):
    lucene.initVM()
    indexDir = "REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    try:
        # append to the existing index if there is one ...
        writer = IndexWriter(dir, analyzer, False,
                             IndexWriter.MaxFieldLength(512))
    except lucene.JavaError:
        # ... otherwise create a new index
        writer = IndexWriter(dir, analyzer, True,
                             IndexWriter.MaxFieldLength(512))

    doc = Document()
    doc.add(Field("text", string, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)

    writer.optimize()
    writer.close()
def indexDocuments():
    # empty the index directory
    indexDir = Wikipedia.directory + 'index/'
    for filename in os.listdir(indexDir):
        os.remove(indexDir + filename)

    # index documents
    lucene.initVM()
    version = Version.LUCENE_CURRENT
    analyzer = EnglishAnalyzer(version)
    writer = IndexWriter(SimpleFSDirectory(File(indexDir)), analyzer,
                         True, IndexWriter.MaxFieldLength.LIMITED)
    for article in Wikipedia():
        doc = Document()
        doc.add(Field('id', str(article['id'][0]),
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('title', article['url'],
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('content', article['text'],
                      Field.Store.NO, Field.Index.ANALYZED))
        writer.addDocument(doc)
    print 'Optimization'
    writer.optimize()
    writer.close()
class BuildIndex:

    def __init__(self, indir):
        lucene.initVM()
        indexdir = SimpleFSDirectory(File(indir))
        self.index_writer = IndexWriter(indexdir, self.getAnalyzer(), True,
                                        IndexWriter.MaxFieldLength(512))

    def getAnalyzer(self):
        return ChineseAnalyzer(lucene.Version.LUCENE_CURRENT)

    def addDocuments(self, _id, title, content):
        doc = Document()
        doc.add(Field("id", _id, Field.Store.YES, Field.Index.NOT_ANALYZED))
        if title is not None and len(title) > 0:
            doc.add(Field("titleKeyword", title,
                          Field.Store.NO, Field.Index.ANALYZED))
        if content is not None and len(content) > 0:
            doc.add(Field("contentKeyword", content,
                          Field.Store.NO, Field.Index.ANALYZED))
        self.index_writer.addDocument(doc)

    def close(self):
        self.index_writer.optimize()
        self.index_writer.close()
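# A minimal usage sketch for the BuildIndex class above; the index path
# and the field values are illustrative assumptions, not part of the
# original snippet.
builder = BuildIndex("/tmp/zh-index")
builder.addDocuments("1", u"中文标题", u"中文正文")
builder.close()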
def index(self):
    lucene.initVM()
    indexdir = SimpleFSDirectory(File(self.INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))

    # read input files (.xml)
    for in_file in glob.glob(os.path.join(self.DOC_DIR, '*.xml')):
        corpus = codecs.open(in_file, encoding='utf-8').read()
        d = pq(corpus, parser='html')
        for text in d('Article').items():
            document = Document()
            # find ID
            art_id = str(text.attr('articleid').encode('utf-8')).replace('+', '-')
            # find Title
            art_title = self.stem(str(text.attr('title').encode('utf-8')))
            # find Abstract
            art_abstract = self.stem(str(text.find('Abstract').html().encode('utf-8')))
            # find Keyword
            art_keyword = text.find('Keyword').html().encode('utf-8')
            # find Content
            art_content = self.stem(str(text.find('Content').html().encode('utf-8')))
            # find Authors
            art_authors = text.find('Authors').html().encode('utf-8')

            document.add(Field('id', art_id, Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('title', art_title, Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('abstract', art_abstract, Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('keyword', art_keyword, Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('content', art_content, Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('authors', art_authors, Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('article',
                               art_title + art_abstract + art_keyword + art_content,
                               Field.Store.YES, Field.Index.ANALYZED))
            index_writer.addDocument(document)

    index_writer.optimize()
    index_writer.close()
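# `self.stem` is called above but not shown; a hypothetical sketch using
# NLTK's Porter stemmer (an assumption, any callable mapping a string to
# a stemmed string would fit the call sites):
from nltk.stem.porter import PorterStemmer

def stem(self, text):
    stemmer = PorterStemmer()
    # stem each whitespace-separated token and rejoin
    return ' '.join(stemmer.stem(w) for w in text.split())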
def update_index_withLineArray(self, array):
    """
    Parsed sentences (given in an array) are added to the index, with the
    corresponding two entities (x, y) and the DBpedia URI
    """
    print "start adding sentences"
    writer = IndexWriter(index_directory, analyzer, False,
                         IndexWriter.MaxFieldLength(512))
    for item in array:
        line = item[0]
        x = item[1]
        y = item[2]
        uri = item[3]
        line = line.replace("\t", " ")
        line = line.replace("\n", " ")
        line = line.replace("  ", " ")
        try:
            doc = Document()
            doc.add(Field("Sentence", line, Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("X", x, Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("Y", y, Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("URI", uri, Field.Store.YES, Field.Index.ANALYZED))
            writer.addDocument(doc)
        except Exception:
            print "Unexpected error:", sys.exc_info()[0]
            raw_input("Error in updating the Sentences")
    try:
        writer.optimize()
    except:
        print "Unexpected error:", sys.exc_info()[0]
        print "could not optimize index"
    writer.close()
    print "all sentences added"
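# The method above relies on module-level `index_directory` and `analyzer`
# objects that are not shown; a hypothetical setup consistent with the
# Lucene 3.x calls it makes (the path is an assumption):
index_directory = SimpleFSDirectory(File("/path/to/sentence-index"))
analyzer = StandardAnalyzer(Version.LUCENE_35)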
def index(self, path_to_index, path_files):
    'indexes anchor texts from a given folder'
    # lucene.initVM()
    indexDir = path_to_index
    directory_index = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_35)
    writer = IndexWriter(directory_index, analyzer, True,
                         IndexWriter.MaxFieldLength(512))
    listOfPathes = []
    listOfPathes.extend(glob.glob(path_files + "*.txt"))
    counter = 0
    for path_to_file in listOfPathes:
        print path_to_file
        f = open(path_to_file, "r")
        for line in f:
            entry = line.split("\t")
            counter += 1
            # optimize the index after a certain amount of added documents
            if counter % 500000 == 0:
                print counter
                writer.optimize()
            doc = Document()
            doc.add(Field("anchor", entry[0], Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("anchor_uri", entry[1], Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("dbpedia_uri", entry[2], Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("number", entry[3].replace("\n", ""), Field.Store.YES, Field.Index.ANALYZED))
            writer.addDocument(doc)
        writer.optimize()
        f.close()
    writer.close()
    print counter
    print "done"
def configure_lucene():
    f = open('clique.txt', 'r')
    lucene.initVM()
    print 'Inside Function'
    indexDir = "/tmp/luceneindex"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()
    print >> sys.stderr, "Reading lines from sys.stdin..."
    for line in f:
        line = line.replace('\t', '')
        line = line.replace('\r', '')
        line = line.replace('\n', '')
        line = line.replace('^', '')
        line = line.strip()
        doc = Document()
        doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % writer.numDocs()
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
def luceneIndexer(contents):
    lucene.initVM()
    INDEXIDR = settings.INDEX_DIR
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in contents:
        print "Indexing: ", tfile
        document = Document()
        content = tfile.getvalue()
        document.add(Field("text", content, Field.Store.YES,
                           Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
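# A usage sketch: luceneIndexer expects file-like objects that expose
# getvalue(), e.g. StringIO buffers (an assumption based on the
# tfile.getvalue() call above); the strings are illustrative only.
from StringIO import StringIO
luceneIndexer([StringIO("first document"), StringIO("second document")])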
def testUpdate(self):
    self.assertEqual(1, self.getHitCount("city", "Amsterdam"))

    reader = IndexReader.open(self.dir, False)
    reader.deleteDocuments(Term("city", "Amsterdam"))
    reader.close()

    writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    doc = Document()
    doc.add(Field("id", "1", Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("country", "Russia", Field.Store.YES, Field.Index.NO))
    doc.add(Field("contents", "St. Petersburg has lots of bridges",
                  Field.Store.NO, Field.Index.ANALYZED))
    doc.add(Field("city", "St. Petersburg",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.optimize()
    writer.close()

    self.assertEqual(0, self.getHitCount("city", "Amsterdam"))
    self.assertEqual(1, self.getHitCount("city", "Petersburg"))
def addDocuments(self, dir, isCompound):
    writer = IndexWriter(dir, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.LIMITED)
    writer.setUseCompoundFile(isCompound)

    # change to adjust performance of indexing with FSDirectory
    # writer.mergeFactor = writer.mergeFactor
    # writer.maxMergeDocs = writer.maxMergeDocs
    # writer.minMergeDocs = writer.minMergeDocs

    for word in self.docs:
        doc = Document()
        doc.add(Field("keyword", word,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("unindexed", word,
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("unstored", word,
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("text", word,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
def createIndex():
    # initialize lucene and the JVM
    print("started indexer")
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"

    # get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_30)

    # get the index storage
    dir = lucene.SimpleFSDirectory(lucene.File(indexDir))
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    src_dir = 'html_files'
    i = 0
    for l in os.listdir(src_dir):
        l = os.path.join(src_dir, l)
        with open(l, 'r') as myfile:
            data = myfile.read()
        i += 1
        document, errors = parsehtml(data)
        doc = Document()
        doc.add(Field("text", document, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def index_files(files, index_directory):
    lucene.initVM()
    d = SimpleFSDirectory(File(index_directory))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(d, analyzer, True, IndexWriter.MaxFieldLength(512))
    for f in files:
        parse_file(f, writer)
    writer.optimize()
    writer.close()
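# `parse_file` is not defined in the snippet above; a hypothetical sketch
# of what it might look like, assuming one document per text file (the
# field names are assumptions):
def parse_file(path, writer):
    content = open(path, 'r').read()
    doc = Document()
    doc.add(Field("text", content, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("path", path, Field.Store.YES, Field.Index.NOT_ANALYZED))
    writer.addDocument(doc)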
def indexSingleFieldDocs(self, fields):
    writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    for field in fields:
        doc = Document()
        doc.add(field)
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
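# A minimal usage sketch for indexSingleFieldDocs: one single-field
# document is created per Field. The field values are illustrative
# assumptions, and `helper` stands in for the enclosing object.
helper.indexSingleFieldDocs([
    Field("title", "PyLucene in a nutshell", Field.Store.YES, Field.Index.ANALYZED),
    Field("title", "Indexing basics", Field.Store.YES, Field.Index.ANALYZED),
])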
def __init__(self, root, storeDir, analyzer):
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    store = FSDirectory.getDirectory(storeDir, True)
    writer = IndexWriter(store, analyzer, True)
    writer.setMaxFieldLength(1048576)
    self.indexDocs(root, writer)
    print 'optimizing index',
    writer.optimize()
    writer.close()
    print 'done'
def index(cls, indexDir, dataDir):
    if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
        raise IOError, "%s does not exist or is not a directory" % dataDir

    writer = IndexWriter(indexDir, StandardAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    writer.setUseCompoundFile(False)

    numIndexed = cls.indexDirectory(writer, dataDir)
    writer.optimize()
    writer.close()

    return numIndexed
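# `cls.indexDirectory` is elided above; a hypothetical sketch consistent
# with the call site (recursively indexes *.txt files and returns the
# number of documents added; field names are assumptions):
def indexDirectory(cls, writer, dataDir):
    count = 0
    for name in os.listdir(dataDir):
        path = os.path.join(dataDir, name)
        if os.path.isdir(path):
            count += cls.indexDirectory(writer, path)
        elif name.endswith('.txt'):
            doc = Document()
            doc.add(Field("contents", open(path).read(),
                          Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("filename", path,
                          Field.Store.YES, Field.Index.NOT_ANALYZED))
            writer.addDocument(doc)
            count += 1
    return count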
def luceneIndexer(docdir, indir):
    """
    Index documents from a directory.
    Args:
        docdir: folder containing the documents
        indir: folder where the index is stored
    Returns:
        nothing
    Notes:
        FieldType().setStored = as-is value stored in the Lucene index
        FieldType().setTokenized = field is analyzed using the specified
            Analyzer; the tokens emitted are indexed
        FieldType().Indexed = the text (either as-is with keyword fields,
            or the tokens from tokenized fields) is made searchable
            (aka inverted)
        FieldType().Vectored = term frequency per document is stored in
            the index in an easily retrievable fashion
    """
    # Type 1 attributes: for fields that must be searchable and also
    # returned for display, use setStored(True):
    # type1 = FieldType()
    # type1.setIndexed(True)
    # type1.setStored(True)
    # type1.setTokenized(False)
    # type1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
    #
    # Type 2 attributes: for fields that do not need to be returned for
    # display but must still be searchable. I consider the text content
    # (content) to be of this kind; file META information is a typical
    # example:
    # type2 = FieldType()
    # type2.setIndexed(True)
    # type2.setStored(False)
    # type2.setTokenized(True)
    # type2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXIDR = indir
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    # construct a new index writer with the specified analyzer
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')):
        print "Indexing:", tfile
        document = Document()
        content = open(tfile, 'r').read()
        # how the FieldTypes above would be used:
        # doc.add(Field("path", tfile, type1))
        # add a Field to the document {name: "text", store: YES, index: ANALYZED}
        document.add(Field("text", content, Field.Store.YES, Field.Index.ANALYZED))
        document.add(Field("path", tfile, Field.Store.YES, Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
def initDummyStore(self, directory):
    """Open a dummy ramdirectory for testing."""
    writer = IndexWriter(directory, SimpleAnalyzer(), True)
    doc = Document()
    doc.add(Field("name", 'dummy.txt',
                  Field.Store.YES, Field.Index.UN_TOKENIZED))
    doc.add(Field("path", '/path/to/dummy.txt',
                  Field.Store.YES, Field.Index.UN_TOKENIZED))
    doc.add(Field("path", '/path/to/another/dummy.txt',
                  Field.Store.YES, Field.Index.UN_TOKENIZED))
    doc.add(Field("contents", "foo dummy bar",
                  Field.Store.YES, Field.Index.TOKENIZED))
    writer.addDocument(doc)
    writer.optimize()
    writer.close()
def createIndex(cls, dataDir, indexDir, useCompound):
    indexDir = SimpleFSDirectory(File(indexDir))
    writer = IndexWriter(indexDir,
                         StandardAnalyzer(Version.LUCENE_CURRENT), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    writer.setUseCompoundFile(useCompound)

    for dir, dirnames, filenames in os.walk(dataDir):
        for filename in filenames:
            if filename.endswith(".properties"):
                cls.indexFile(writer, os.path.join(dir, filename), dataDir)

    writer.optimize()
    writer.close()
def main(cls, argv):
    if len(argv) != 3:
        print "Usage: T9er <WordNet index dir> <t9 index>"
        return

    for key in cls.keys:
        c = key[0]
        k = key[1:]
        for kc in k:
            cls.keyMap[kc] = c
            print kc, "=", c

    indexDir = argv[1]
    t9dir = argv[2]

    reader = IndexReader.open(indexDir)
    numDocs = reader.maxDoc()
    print "Processing", numDocs, "words"

    writer = IndexWriter(t9dir, WhitespaceAnalyzer(), True)

    for id in xrange(reader.maxDoc()):
        origDoc = reader.document(id)
        word = origDoc.get("word")
        if word is None or len(word) == 0:
            continue

        newDoc = Document()
        newDoc.add(Field("word", word,
                         Field.Store.YES, Field.Index.UN_TOKENIZED))
        newDoc.add(Field("t9", cls.t9(word),
                         Field.Store.YES, Field.Index.UN_TOKENIZED))
        newDoc.add(Field("length", str(len(word)),
                         Field.Store.NO, Field.Index.UN_TOKENIZED))
        writer.addDocument(newDoc)
        if id % 100 == 0:
            print "Document", id

    writer.optimize()
    writer.close()
    reader.close()
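# The class attributes and the t9() transform used in main() are elided;
# a hypothetical sketch of what they might look like (the exact keypad
# letter groups are an assumption):
class T9er(object):
    keys = ["2abc", "3def", "4ghi", "5jkl", "6mno", "7pqrs", "8tuv", "9wxyz"]
    keyMap = {}

    @classmethod
    def t9(cls, word):
        # map each letter to its keypad digit via the table built in main()
        return ''.join(cls.keyMap.get(c, c) for c in word.lower())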
def lucene_index(input_folder, output_folder):
    '''
    Indexes fresh text data using lucene 3.6. Doesn't support incremental
    generation of the index as of now. Currently crashes on neo by running
    out of heap space.
    Arguments: input folder for text files, output folder for the index.
    Returns: void. The index is stored if generated.
    '''
    # Setting up the log file
    logging.basicConfig(filename=os.path.join(output_folder, "lucene_index.log"))
    logger = logging.getLogger(__name__)
    logging.info("Input directory for logging: " + input_folder)
    logging.info("Output directory of index: " + output_folder)
    if not os.path.isdir(output_folder):
        logger.debug("Making output directory for index: " + output_folder)
        os.makedirs(output_folder)

    # Setting up lucene's heap size for the index and version of indexer
    lucene.initVM(initialheap='1024m', maxheap='2048m')
    index_folder = SimpleFSDirectory(File(output_folder))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(index_folder, analyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    # Optimization to reduce heap space usage during index generation:
    # merge segments every 15 segments and cap the RAM buffer at 32 MB.
    writer.setMergeFactor(15)
    writer.setRAMBufferSizeMB(32.0)

    # Search to find the files to index
    files_to_index = find_files_in_folder(input_folder)
    for input_file in files_to_index:
        doc = Document()
        content = open(input_file, 'r').read()
        # Do not store the text, only index it
        doc.add(Field("text", content, Field.Store.NO, Field.Index.ANALYZED))
        # Store the path to assist in retrieving the file
        doc.add(Field("path", input_file, Field.Store.YES, Field.Index.NO))
        writer.addDocument(doc)

    logger.info("Indexed lines from " + input_folder +
                " (%d documents in index)" % writer.numDocs())
    logger.info("About to optimize index of %d documents..." % writer.numDocs())
    writer.optimize()  # compress the index
    logger.info("...done optimizing index of %d documents" % writer.numDocs())
    logger.info("Closing index of %d documents..." % writer.numDocs())
    writer.close()
    logger.info("Closed index")
def index(self):
    dirPath = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
                           "verbose-index")
    dir = FSDirectory.getDirectory(dirPath, True)
    writer = IndexWriter(dir, SimpleAnalyzer(), True)
    writer.setInfoStream(System.out)

    for i in xrange(100):
        doc = Document()
        doc.add(Field("keyword", "goober",
                      Field.Store.YES, Field.Index.UN_TOKENIZED))
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
def Indexer(docdir, indir):
    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXDIR = indir
    # open the index directory (FSDirectory itself is abstract)
    indexdir = SimpleFSDirectory(File(INDEXDIR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')):
        print "Indexing ", tfile
        document = Document()
        content = open(tfile, 'r').read()
        document.add(Field("text", content, Field.Store.YES, Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done"
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
def index(self):
    dirPath = os.path.join(tempfile.gettempdir(), "verbose-index")
    dir = FSDirectory.open(File(dirPath))
    writer = IndexWriter(dir, SimpleAnalyzer(), True)
    writer.setInfoStream(InfoStreamOut())

    for i in xrange(100):
        doc = Document()
        doc.add(Field("keyword", "goober",
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
def index(cls, indexDir, dataDir):
    if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
        raise IOError, "%s does not exist or is not a directory" % dataDir

    dir = SimpleFSDirectory(File(indexDir))
    writer = IndexWriter(dir, StandardAnalyzer(Version.LUCENE_CURRENT),
                         True, IndexWriter.MaxFieldLength.LIMITED)
    writer.setUseCompoundFile(False)

    cls.indexDirectory(writer, dataDir)
    numIndexed = writer.numDocs()
    writer.optimize()
    writer.close()
    dir.close()

    return numIndexed
def index(string):
    lucene.initVM()
    indexDir = "REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()

    doc = Document()
    doc.add(Field("text", string, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)

    print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % writer.numDocs()
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
def luceneIndexer(docdir, indir):
    """
    Index documents from a directory
    """
    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXIDR = indir
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')):
        print "Indexing: ", tfile
        document = Document()
        content = open(tfile, 'r').read()
        document.add(Field("text", content, Field.Store.YES,
                           Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
def index(self, doc, title, department, url):
    indexdir = SimpleFSDirectory(File(self.indir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, self.init,
                               IndexWriter.MaxFieldLength(512))
    self.init = False

    # Initialize document and index it
    document = Document()
    document.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
    document.add(Field("url", url, Field.Store.YES, Field.Index.ANALYZED))
    document.add(Field("department", department, Field.Store.YES, Field.Index.ANALYZED))
    document.add(Field("text", doc, Field.Store.YES, Field.Index.ANALYZED))
    index_writer.addDocument(document)

    index_writer.optimize()
    index_writer.close()
def _createIndex(self, inputDF, colname):
    """
    function to create a lucene index; iterates over inputDF row by row
    and indexes the relevant column. By default WhitespaceAnalyzer is
    used; other Analyzers are also available.
    """
    # Create the index directory
    directory = RAMDirectory()
    writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.LIMITED)

    # Inline indexing of column data
    inputDF.apply(lambda x: self._addDoc(x[colname], writer), axis=1)

    # Optimize, close and return
    writer.optimize()
    writer.close()
    return directory
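# `_addDoc` is elided above; a hypothetical sketch matching the call
# self._addDoc(x[colname], writer), one single-field document per
# dataframe row (the field name is an assumption):
def _addDoc(self, text, writer):
    doc = Document()
    doc.add(Field("field", text, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)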
def update_index_withLine(self, line, x, y, uri):
    """
    Parsed sentence is added to the index, with the corresponding two
    entities (x, y) and the DBpedia URI
    """
    line = line.replace("\t", " ")
    line = line.replace("\n", " ")
    line = line.replace("  ", " ")
    try:
        writer = IndexWriter(index_directory, analyzer, False,
                             IndexWriter.MaxFieldLength(512))
        doc = Document()
        doc.add(Field("Sentence", line, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("X", x, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("Y", y, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("URI", uri, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        writer.optimize()
        writer.close()
    except Exception:
        print "Unexpected error:", sys.exc_info()[0]
        raw_input("Error in updating the Sentences")
def addDocuments(self, dir, maxFieldLength):
    writer = IndexWriter(dir, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength(maxFieldLength))
    for keyword, unindexed, unstored, text in \
            izip(self.keywords, self.unindexed, self.unstored, self.text):
        doc = Document()
        doc.add(Field("id", keyword,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("country", unindexed,
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("contents", unstored,
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("city", text,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def addDocuments(self, dir):
    writer = IndexWriter(dir, self.getAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    writer.setUseCompoundFile(self.isCompound())
    for i in xrange(len(self.keywords)):
        doc = Document()
        doc.add(Field("id", self.keywords[i],
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("country", self.unindexed[i],
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("contents", self.unstored[i],
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("city", self.text[i],
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def testDeleteAfterIndexMerge(self):
    reader = IndexReader.open(self.dir, False)
    self.assertEqual(2, reader.maxDoc())
    self.assertEqual(2, reader.numDocs())
    reader.deleteDocument(1)
    reader.close()

    writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    writer.optimize()
    writer.close()

    reader = IndexReader.open(self.dir, True)
    self.assert_(not reader.isDeleted(1))
    self.assert_(not reader.hasDeletions())
    self.assertEqual(1, reader.maxDoc())
    self.assertEqual(1, reader.numDocs())
    reader.close()
class IndexCorpus(object):

    def __init__(self, index_dir, analyzer, compress=False):
        self.metadata = True
        if not os.path.exists(index_dir):
            os.mkdir(index_dir)
        self.compress = compress
        store = SimpleFSDirectory(File(index_dir))
        self.writer = IndexWriter(store, analyzer, True,
                                  IndexWriter.MaxFieldLength.LIMITED)
        self.writer.setMaxFieldLength(1048576)
        if self.compress:
            self.compressor = self.get_compressor()

    def get_compressor(self):
        path = '/Users/tal/corpus/analyzed/5/5344'
        training_data = codecs.open(path, encoding='utf8').read()
        return trained_short_string_compressor(training_data.encode('utf8'))

    def finalize(self):
        self.writer.optimize()
        self.writer.close()

    def index(self, directory):
        files = [x for x in os.listdir(directory) if x.isdigit()]
        for filename in sorted(files, key=int):
            path = os.path.join(directory, filename)
            if os.path.isdir(path):
                self.index(path)
            else:
                if int(filename) % 100 == 0:
                    print datetime.now().ctime(), filename
                try:
                    self.index_file(path)
                except Exception, e:
                    print "Indexing exception:", e
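# `index_file` is called above but not defined in this snippet; a
# hypothetical sketch, assuming one utf-8 text document per file and
# leaving the optional compressor aside (the field names are assumptions):
def index_file(self, path):
    content = codecs.open(path, encoding='utf8').read()
    doc = Document()
    doc.add(Field("path", path, Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("contents", content, Field.Store.YES, Field.Index.ANALYZED))
    self.writer.addDocument(doc)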
def luceneIndexer(docdir, indir):
    """
    Index documents from a directory.
    para: {
        docdir: the path of the txt files
        indir: the path of the index generated by the code below
    }
    """
    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXIDR = indir
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for name in os.listdir(DIRTOINDEX):
        tfile = os.path.join(DIRTOINDEX, name)
        if os.path.isfile(tfile):
            print ("Indexing: ", tfile)
            document = Document()
            content = open(tfile, 'r').read()
            document.add(Field("text", content, Field.Store.YES,
                               Field.Index.ANALYZED))
            # use the file name without its extension as the title
            document.add(Field("title", os.path.splitext(name)[0],
                               Field.Store.YES, Field.Index.ANALYZED))
            index_writer.addDocument(document)
            print ("Done: ", tfile)
    index_writer.optimize()
    print (index_writer.numDocs())
    index_writer.close()
def index(self, path_to_index, sentencearray):
    'indexes wikipedia sentences'
    lucene.initVM()
    indexDir = path_to_index
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_35)
    writer = IndexWriter(dir, analyzer, True,
                         IndexWriter.MaxFieldLength(512))
    counter = 0
    for file_name in sentencearray:
        print file_name
        f = open(file_name, "r")
        for line in f:
            counter += 1
            line = line.replace("\n", "")
            if "\t" in line:
                tmp = line.split("\t")
            else:
                tmp = [line, "0"]
            doc = Document()
            doc.add(Field("sentence", tmp[0], Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("key", tmp[1], Field.Store.YES, Field.Index.ANALYZED))
            writer.addDocument(doc)
            if counter % 1000000 == 0:
                writer.optimize()
                print counter
        writer.optimize()
        f.close()
    writer.close()
    print "Done"
    print counter
def main(cls, argv):
    if len(argv) < 2:
        print "Usage: BerkeleyDbIndexer <index dir> -create"
        return

    dbHome = argv[1]
    create = len(argv) > 2 and argv[2] == "-create"

    if not os.path.exists(dbHome):
        os.makedirs(dbHome)
    elif create:
        for name in os.listdir(dbHome):
            if name.startswith('__'):
                os.remove(os.path.join(dbHome, name))

    env = DBEnv()
    env.set_flags(DB_LOG_INMEMORY, 1)
    if os.name == 'nt':
        env.set_cachesize(0, 0x4000000, 1)
    elif os.name == 'posix':
        from commands import getstatusoutput
        if getstatusoutput('uname') == (0, 'Linux'):
            env.set_cachesize(0, 0x4000000, 1)
    env.open(dbHome, (DB_CREATE | DB_THREAD |
                      DB_INIT_MPOOL | DB_INIT_LOCK | DB_INIT_TXN), 0)

    index = DB(env)
    blocks = DB(env)
    txn = None

    try:
        txn = env.txn_begin(None)
        index.open(filename='__index__', dbtype=DB_BTREE,
                   flags=DB_CREATE | DB_THREAD, txn=txn)
        blocks.open(filename='__blocks__', dbtype=DB_BTREE,
                    flags=DB_CREATE | DB_THREAD, txn=txn)
    except:
        if txn is not None:
            txn.abort()
            txn = None
        raise
    else:
        txn.commit()
        txn = None

    try:
        txn = env.txn_begin(None)
        directory = DbDirectory(txn, index, blocks, 0)
        writer = IndexWriter(directory, StandardAnalyzer(), create,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        writer.setUseCompoundFile(False)

        doc = Document()
        doc.add(Field("contents", "The quick brown fox...",
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        writer.optimize()
        writer.close()
    except:
        if txn is not None:
            txn.abort()
            txn = None
        raise
    else:
        txn.commit()
        index.close()
        blocks.close()
        env.close()

    print "Indexing Complete"
        return StopFilter(LowerCaseFilter(LetterTokenizer(reader)),
                          self.stopWords)


if __name__ == '__main__':
    analyzer = CJKAnalyzer()
    directory = RAMDirectory()
    iwriter = IndexWriter(directory, StandardAnalyzer(), True)
    ts = ["javasd。 $##open所大家教唆犯地方地方即可解放大家空间艰苦奋斗矿井口地方",
          "所看看对抗赛不久交会法觉得拮抗剂"]
    for t in ts:
        doc = Document()
        doc.add(Field("fieldname", t, Field.Store.YES, Field.Index.TOKENIZED,
                      Field.TermVector.WITH_POSITIONS_OFFSETS))
        iwriter.addDocument(doc)
    iwriter.optimize()
    iwriter.close()

    ireader = IndexReader.open(directory)
    tpv = TermPositionVector.cast_(ireader.getTermFreqVector(0, 'fieldname'))

    for (t, f, i) in zip(tpv.getTerms(), tpv.getTermFrequencies(),
                         xrange(100000)):
        print 'term %s' % t
        print '  freq: %i' % f
        try:
            print '  pos: ' + str([p for p in tpv.getTermPositions(i)])
        except:
            print '  no pos'
        try:
            print '  off: ' + \
                  str(["%i-%i" % (o.getStartOffset(), o.getEndOffset())
                       for o in tpv.getOffsets(i)])
        except:
            print '  no off'
doc.add( Field("keywords", ' '.join((command, name, synopsis, description)), Field.Store.NO, Field.Index.ANALYZED)) doc.add( Field("filename", os.path.abspath(path), Field.Store.YES, Field.Index.NOT_ANALYZED)) writer.addDocument(doc) if __name__ == '__main__': if len(sys.argv) != 2: print "Usage: python manindex.py <index dir>" else: initVM() indexDir = sys.argv[1] writer = IndexWriter(SimpleFSDirectory(File(indexDir)), StandardAnalyzer(Version.LUCENE_CURRENT), True, IndexWriter.MaxFieldLength.LIMITED) manpath = os.environ.get('MANPATH', '/usr/share/man').split(os.pathsep) for dir in manpath: print "Crawling", dir for name in os.listdir(dir): path = os.path.join(dir, name) if os.path.isdir(path): indexDirectory(path) writer.optimize() writer.close()