Code Example #1
File: pylucene_test.py Project: SamChen1981/spider-1
def luceneIndexer(contents):
    lucene.initVM()

    INDEXIDR = settings.INDEX_DIR

    indexdir = SimpleFSDirectory(File(INDEXIDR))

    analyzer = StandardAnalyzer(Version.LUCENE_30)

    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in contents:
        print "Indexing: ", tfile
        document = Document()
        content = tfile.getvalue()
        document.add(Field("text", content, Field.Store.YES,
                           Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
    # Optimize once after all documents are added, not per document.
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
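Note: like most excerpts on this page, the function above is shown without its imports. A preamble along the following lines is assumed, matching the flat PyLucene 3.x imports shown explicitly in Code Example #15; the settings module is project-specific and its definition is not part of the excerpt:

import lucene
from lucene import (SimpleFSDirectory, File, Document, Field,
                    StandardAnalyzer, IndexWriter, Version)
# "settings" must come from the surrounding project and provide INDEX_DIR
# (an assumption; its real definition is not shown above).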
Code Example #2
File: app.py Project: avinashkoulavkar/GUI
def configure_lucene():

    f = open('clique.txt', 'r')
    lucene.initVM()
    print 'Inside Function'
    indexDir = "/tmp/luceneindex"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()

    print >> sys.stderr, "Reading lines from clique.txt..."
    for line in f:
        line = line.replace('\t', '')
        line = line.replace('\r', '')
        line = line.replace('\n', '')
        line = line.replace('^', '')
        line = line.strip()
        doc = Document()
        doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    f.close()

    print >> sys.stderr, "Indexed lines from clique.txt (%d documents in index)" % writer.numDocs()
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
Code Example #3
File: luceneInx.py Project: qiugen/pylucene_demo
def luceneIndexer(docdir, indir):
    """Index documents from a directory.
    Args:
        docdir: directory containing the documents to index
        indir: directory where the index will be stored
    Returns:
        None
    Notes:
        FieldType().setStored = as-is value stored in the Lucene index
        FieldType().setTokenized = field is analyzed using the specified Analyzer - the tokens emitted are indexed
        FieldType().Indexed = the text (either as-is with keyword fields, or the tokens from tokenized fields) is made searchable (aka inverted)
        FieldType().Vectored = term frequency per document is stored in the index in an easily retrievable fashion.
    """

    """# Type 1 attributes: for fields that must be searchable and also returned for display, use setStored(True).
    type1 = FieldType()
    type1.setIndexed(True)
    type1.setStored(True)
    type1.setTokenized(False)
    type1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
    # Type 2 attributes: for fields that need to be searchable but not returned for display.
    # I consider the text content (content) to be of this kind; file META information is a typical example.
    type2 = FieldType()
    type2.setIndexed(True)
    type2.setStored(False)
    type2.setTokenized(True)
    type2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)"""

    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXIDR = indir
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    # Construct a new IndexWriter with the given analyzer.
    index_writer = IndexWriter(indexdir, analyzer, True, IndexWriter.MaxFieldLength(512))
    for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')):
        print "Indexing:", tfile
        document = Document()
        content = open(tfile, 'r').read()
        # With the FieldTypes above, the path field would be added as:
        # document.add(Field("path", tfile, type1))

        # Add a field to the document: {name: "text", store: YES, index: ANALYZED}
        document.add(Field("text", content, Field.Store.YES, Field.Index.ANALYZED))
        document.add(Field("path", tfile, Field.Store.YES, Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
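The commented-out FieldType block above targets the newer PyLucene 4.x API, while the live code uses the 3.x flat namespace. For reference, here is a minimal sketch of the same loop under the 4.x API; this is an untested sketch assuming a 4.x build where these classes are importable from the org.apache.lucene.* packages:

import glob
import os
import lucene
from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.index import FieldInfo, IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

def luceneIndexer4(docdir, indir):
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    writer = IndexWriter(SimpleFSDirectory(File(indir)), config)
    # type1: stored, not tokenized -- searchable as-is and returned for display.
    type1 = FieldType()
    type1.setIndexed(True)
    type1.setStored(True)
    type1.setTokenized(False)
    type1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
    # type2: tokenized, not stored -- full-text searchable, not returned.
    type2 = FieldType()
    type2.setIndexed(True)
    type2.setStored(False)
    type2.setTokenized(True)
    type2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    for tfile in glob.glob(os.path.join(docdir, '*.txt')):
        document = Document()
        document.add(Field("path", tfile, type1))
        document.add(Field("text", open(tfile).read(), type2))
        writer.addDocument(document)
    writer.close()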
Code Example #4
File: indexer.py Project: liuyang1/test
def Indexer(docdir, indir):
    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXDIR = indir
    # FSDirectory is abstract; use SimpleFSDirectory as in the other examples.
    indexdir = SimpleFSDirectory(File(INDEXDIR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True, IndexWriter.MaxFieldLength(512))
    for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')):
        print "Indexing ", tfile
        document = Document()
        content = open(tfile, 'r').read()
        document.add(Field("text", content, Field.Store.YES, Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done"
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
Code Example #5
def lucene_index(input_folder, output_folder):
    '''
    Indexes fresh text data using lucene 3.6.
    Doesn't support incremental generation of the index as of now.
    Currently crashes on neo by running out of heap space.
    Arguments: input folder for text files; output folder for the index location.
    Returns: void. The index is stored if generated.
    '''

    # Setting up log file
    logging.basicConfig(filename=os.path.join(output_folder, "lucene_index.log"))
    logger = logging.getLogger(__name__)
    logger.info("Input directory for logging: " + input_folder)
    logger.info("Output directory of index: " + output_folder)
    if not os.path.isdir(output_folder):
        logger.debug("Making output directory for index: " + output_folder)
        os.makedirs(output_folder)

    # Setting up lucene's heap size for indexing and the indexer version
    lucene.initVM(initialheap='1024m', maxheap='2048m')
    index_folder = SimpleFSDirectory(File(output_folder))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(index_folder, analyzer, True, IndexWriter.MaxFieldLength.UNLIMITED)

    # Optimization to reduce heap space usage during index generation:
    # merge the in-memory buffer with the on-disk index after 15 docs.
    writer.setMergeFactor(15)
    writer.setRAMBufferSizeMB(32.0)

    # Search to find the files to index
    files_to_index = find_files_in_folder(input_folder)
    for input_file in files_to_index:
        doc = Document()
        content = open(input_file, 'r').read()
        doc.add(Field("text", content, Field.Store.NO, Field.Index.ANALYZED))  # Do not store the text; only index it.
        doc.add(Field("path", input_file, Field.Store.YES, Field.Index.NO))  # Store the path to assist in retrieving the file.
        writer.addDocument(doc)  # Index

    logger.info("Indexed lines from " + input_folder + " (%d documents in index)" % writer.numDocs())
    logger.info("About to optimize index of %d documents..." % writer.numDocs())
    writer.optimize()  # Compress the index
    logger.info("...done optimizing index of %d documents" % writer.numDocs())
    logger.info("Closing index of %d documents..." % writer.numDocs())
    writer.close()
    logger.info("Closed index")
Code Example #6
File: pylucene_build_idx.py Project: ri0day/gangster
def luceneIndexer(docdir, indir):
    """
    Index documents from a directory.
    """
    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXIDR = indir
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')):
        print "Indexing: ", tfile
        document = Document()
        content = open(tfile, 'r').read()
        document.add(Field("text", content, Field.Store.YES,
                           Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
Code Example #7
File: indexL.py Project: kansal/Sub-Event-Detection
def index(string):
    lucene.initVM()
    indexDir = "REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()

    doc = Document()
    doc.add(Field("text", string, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)

    print >> sys.stderr, "Indexed the given string (%d documents in index)" % writer.numDocs()
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
Code Example #8
File: Indexer.py Project: qiugen/pylucene-trunk
    def index(cls, indexDir, dataDir):

        if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
            raise IOError, "%s does not exist or is not a directory" %(dataDir)

        dir = SimpleFSDirectory(File(indexDir))
        writer = IndexWriter(dir, StandardAnalyzer(Version.LUCENE_CURRENT),
                             True, IndexWriter.MaxFieldLength.LIMITED)
        writer.setUseCompoundFile(False)

        cls.indexDirectory(writer, dataDir)

        numIndexed = writer.numDocs()
        writer.commit()
        writer.close()
        dir.close()

        return numIndexed
Code Example #9
    def index(cls, indexDir, dataDir):

        if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
            raise IOError, "%s does not exist or is not a directory" % (
                dataDir)

        dir = SimpleFSDirectory(File(indexDir))
        writer = IndexWriter(dir, StandardAnalyzer(Version.LUCENE_CURRENT),
                             True, IndexWriter.MaxFieldLength.LIMITED)
        writer.setUseCompoundFile(False)

        cls.indexDirectory(writer, dataDir)

        numIndexed = writer.numDocs()
        writer.optimize()
        writer.close()
        dir.close()

        return numIndexed
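Both versions of index() above are classmethods that delegate to a cls.indexDirectory() not shown in the excerpts. A hypothetical driver, assuming the enclosing class is named Indexer as the file name Indexer.py suggests:

if __name__ == '__main__':
    import sys
    import lucene
    lucene.initVM()
    # Hypothetical usage: python Indexer.py <indexDir> <dataDir>
    print "%d docs indexed" % Indexer.index(sys.argv[1], sys.argv[2])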
Code Example #10
File: indexize.py Project: wncios/Readers
def luceneIndexer(docdir, indir):
    """
    Index documents from a directory.

    Args:
        docdir: the path of the directory holding the txt files
        indir: the path of the index that this function generates
    """

    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXIDR = indir
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    #for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')):
    for name in os.listdir(DIRTOINDEX):
        tfile = os.path.join(DIRTOINDEX, name)
        if os.path.isfile(tfile):
            print("Indexing: ", tfile)
            document = Document()
            content = open(tfile, 'r').read()
            document.add(Field("text", content, Field.Store.YES,
                               Field.Index.ANALYZED))
            # str.strip('.txt') would strip characters, not the suffix;
            # os.path.splitext drops the extension correctly.
            document.add(Field("title", os.path.splitext(tfile)[0],
                               Field.Store.YES, Field.Index.ANALYZED))
            index_writer.addDocument(document)
            print("Done: ", tfile)
    index_writer.optimize()
    print(index_writer.numDocs())
    index_writer.close()
Code Example #11
    def testIndexWriter(self):

        writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        self.assertEqual(len(self.keywords), writer.numDocs())
        writer.close()
Code Example #12
from htmlparser import parsehtml
import lucene
import os
from lucene import SimpleFSDirectory, System, File, Document, Field, StandardAnalyzer, IndexWriter, Version
if __name__ == "__main__":
    lucene.initVM()
    src_dir = "html_files"
    indexDir = "index"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
    print ("Currently there are %d documents in the index..." % writer.numDocs())
    print ("Reading lines from directory...")
    i = 0
    for l in os.listdir(src_dir):
        l = os.path.join(src_dir, l)
        with open(l, 'r') as myfile:
            data=myfile.read()
        document, errors = parsehtml(data)
        print(l)
        i += 1
        doc = Document()
        doc.add(Field("text", document, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    print ("Indexed lines from stdin (%d documents in index)" % (writer.numDocs()))
    print ("About to optimize index of %d documents..." % writer.numDocs())
    writer.optimize()
    print ("...done optimizing index of %d documents" % writer.numDocs())
    print ("Closing index of %d documents..." % writer.numDocs())
    writer.close()
    print ("...done closing index of %d documents" % writer.numDocs())
Code Example #13
File: create_index.py Project: swasheck/bible
INDEXDIR = "texts.index"
dir = SimpleFSDirectory(File(INDEXDIR))
analyzer = SimpleAnalyzer(Version.LUCENE_35)
writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
conn = psycopg2.connect("dbname=texts user=swasheck")
cur = conn.cursor()
cur.execute("select reference, version_id, analysis_text from verse;")
for verse in cur.fetchall():
	print "Adding %s (version=%s)" % (verse[0],verse[1])
	doc = Document()
	doc.add(Field("reference", verse[0], Field.Store.YES, Field.Index.ANALYZED))
	doc.add(Field("version", str(verse[1]), Field.Store.YES, Field.Index.ANALYZED))
	doc.add(Field("text", verse[2], Field.Store.YES, Field.Index.ANALYZED))
	writer.addDocument(doc)	
print 'Optimizing the index of %d documents...' % writer.numDocs()
writer.optimize()
print 'Closing the index'
writer.close()
'''
INDEXDIR = "greek.texts.index"
dir = SimpleFSDirectory(File(INDEXDIR))
el_analyzer = GreekAnalyzer(Version.LUCENE_35)
analyzer = SimpleAnalyzer(Version.LUCENE_35)
writer = IndexWriter(dir, el_analyzer, True, IndexWriter.MaxFieldLength(512))
conn = psycopg2.connect("dbname=texts user=swasheck")
cur = conn.cursor()
cur.execute("select reference, version_id, analysis_text from verse where version_id in (2,3);")
for verse in cur.fetchall():
	print "Adding %s (version=%s)" % (verse[0],verse[1])
	doc = Document()
Code Example #14
    indexDir = "/Tmp/REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    #    # set variables that affect speed of indexing
    #    writer.setMergeFactor(int(argv[2]))
    #    writer.setMaxMergeDocs(int(argv[3]))
    #    writer.setMaxBufferedDocs(int(argv[4]))
    #    # writer.infoStream = System.out
    #
    #    print "Merge factor:  ", writer.getMergeFactor()
    #    print "Max merge docs:", writer.getMaxMergeDocs()
    #    print "Max buffered docs:", writer.getMaxBufferedDocs()

    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs(
    )

    i = 0
    print >> sys.stderr, "Reading lines from sys.stdin..."
    for l in sys.stdin:
        i += 1

        if l.strip() == "": continue

        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        if i % 10000 == 0:
            print >> sys.stderr, "Read %d lines from stdin (%d documents in index)..." % (
                i, writer.numDocs())
Code Example #15
File: pylucene_test.py Project: danieldmm/minerva
from lucene import (SimpleFSDirectory, System, File,
    Document, Field, StandardAnalyzer, IndexWriter, IndexSearcher, QueryParser)

if __name__ == "__main__":
    lucene.initVM()
    fullIndexDir = r"c:\NLP\PhD\bob\fileDB\LuceneFullIndex"

    print "lucene version is:", lucene.VERSION

    fullIndex = SimpleFSDirectory(File(fullIndexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(fullIndex, analyzer, True, IndexWriter.MaxFieldLength(20000000))
##    writer = IndexWriter(store, analyzer, True, IndexWriter.MaxFieldLength(512))

    print  "Currently there are %d documents in the index..." % writer.numDocs()

##    print  "Reading lines from sys.stdin..."
    lines=["bla bla bla bla bla","Erase una vez que se era", "En un lugar de La Mancha de cuyo nombre no quiero acordarme, no ha mucho que vivia un hidalgo de los de lanza en ristre", "Manchame mancha mancha que te mancha la mancha"]

    for l in lines:
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        metadata={"asdfa":"asdfa"}
        json_metadata=json.dumps(metadata)
        doc.add(Field("metadata", json_metadata, Field.Store.YES, Field.Index.NO))
        writer.addDocument(doc)

    print "Indexed lines from stdin (%d documents in index)" % (writer.numDocs())
    print "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
Code Example #16
File: pylucene_test.py Project: danduma/minerva
from lucene import (SimpleFSDirectory, System, File, Document, Field,
                    StandardAnalyzer, IndexWriter, IndexSearcher, QueryParser)

if __name__ == "__main__":
    lucene.initVM()
    fullIndexDir = r"c:\NLP\PhD\bob\fileDB\LuceneFullIndex"

    print("lucene version is:", lucene.VERSION)

    fullIndex = SimpleFSDirectory(File(fullIndexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(fullIndex, analyzer, True,
                         IndexWriter.MaxFieldLength(20000000))
    ##    writer = IndexWriter(store, analyzer, True, IndexWriter.MaxFieldLength(512))

    print("Currently there are %d documents in the index..." %
          writer.numDocs())

    ##    print  "Reading lines from sys.stdin..."
    lines = [
        "bla bla bla bla bla", "Erase una vez que se era",
        "En un lugar de La Mancha de cuyo nombre no quiero acordarme, no ha mucho que vivia un hidalgo de los de lanza en ristre",
        "Manchame mancha mancha que te mancha la mancha"
    ]

    for l in lines:
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        metadata = {"asdfa": "asdfa"}
        json_metadata = json.dumps(metadata)
        doc.add(
            Field("metadata", json_metadata, Field.Store.YES, Field.Index.NO))
Code Example #17
    indexDir = "/Tmp/REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

#    # set variables that affect speed of indexing
#    writer.setMergeFactor(int(argv[2]))
#    writer.setMaxMergeDocs(int(argv[3]))
#    writer.setMaxBufferedDocs(int(argv[4]))
#    # writer.infoStream = System.out
#
#    print "Merge factor:  ", writer.getMergeFactor()
#    print "Max merge docs:", writer.getMaxMergeDocs()
#    print "Max buffered docs:", writer.getMaxBufferedDocs()

    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()

    i = 0
    print >> sys.stderr, "Reading lines from sys.stdin..."
    for l in sys.stdin:
        i += 1

        if l.strip() == "": continue

        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        if i % 10000 == 0:
            print >> sys.stderr, "Read %d lines from stdin (%d documents in index)..." % (i, writer.numDocs())
            print >> sys.stderr, stats()
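All of the examples above only build an index. To check what they produced, here is a minimal search-side sketch, assuming the same flat PyLucene 3.x API that the writers use; the index path and query string are placeholders:

import lucene
from lucene import (SimpleFSDirectory, File, StandardAnalyzer,
                    IndexSearcher, QueryParser, Version)

if __name__ == "__main__":
    lucene.initVM()
    directory = SimpleFSDirectory(File("REMOVEME.index-dir"))  # placeholder path
    searcher = IndexSearcher(directory, True)  # True = open read-only
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    # Query the "text" field that every writer above populates.
    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse("hello")
    hits = searcher.search(query, 10)
    print "Found %d document(s)" % hits.totalHits
    for scoreDoc in hits.scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        print scoreDoc.score, doc.get("text")
    searcher.close()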