コード例 #1
0
ファイル: indexer.py プロジェクト: mefagan/relevancefeedback-
def createIndex():
    #initialize lucene and jvm
    print("started indexer")
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"

    
    #get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    
    #get index storage
    dir = lucene.SimpleFSDirectory(lucene.File(indexDir))
   
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    src_dir = 'html_files'
    i = 0
    for l in os.listdir(src_dir):
        l = os.path.join(src_dir, l)
        with open(l, 'r') as myfile:
            data=myfile.read()
        i += 1
        document, errors = parsehtml(data)
        doc = Document()
        doc.add(Field("text", document, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
コード例 #2
0
ファイル: pylucene_test.py プロジェクト: SamChen1981/spider-1
def luceneIndexer(contents):
    lucene.initVM()
    

    INDEXIDR= settings.INDEX_DIR

    indexdir= SimpleFSDirectory(File(INDEXIDR))
    
    analyzer= StandardAnalyzer(Version.LUCENE_30)

    index_writer= IndexWriter(indexdir,analyzer,True,\

    IndexWriter.MaxFieldLength(512))
    for tfile in contents:
        print"Indexing: ", tfile

        document= Document()

        content= tfile.getvalue()

        document.add(Field("text",content,Field.Store.YES,\
                           Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print"Done: ", tfile
        index_writer.optimize()
        print index_writer.numDocs()
    index_writer.close()
コード例 #3
0
ファイル: app.py プロジェクト: ProjectLISM/GUI
def configure_lucene():

    f = open('clique.txt', 'r')
    lucene.initVM()
    print 'Inside Function'
    #indexDir = "/tmp/luceneindex"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs(
    )

    print >> sys.stderr, "Reading lines from sys.stdin..."
    for line in f:
        line = line.replace('\t', '')
        line = line.replace('\r', '')
        line = line.replace('\n', '')
        line = line.replace('^', '')
        line = line.strip()
        doc = Document()
        doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % (
        writer.numDocs())
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs(
    )
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs(
    )
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
コード例 #4
0
def index_files(files, index_directory):
    lucene.initVM()
    d = SimpleFSDirectory(File(index_directory))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(d, analyzer, True, IndexWriter.MaxFieldLength(512))
    for f in files:
        parse_file(f, writer)
    writer.optimize()
    writer.close()
コード例 #5
0
def luceneIndexer(docdir, indir):
    """

         IndexDocuments from a directory

         """

    lucene.initVM()

    DIRTOINDEX = docdir

    INDEXIDR = indir

    indexdir = SimpleFSDirectory(File(INDEXIDR))

    analyzer = StandardAnalyzer(Version.LUCENE_30)

    index_writer= IndexWriter(indexdir,analyzer,True,\

    IndexWriter.MaxFieldLength(512))

    for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')):

        print "Indexing: ", tfile

        document = Document()

        content = open(tfile, 'r').read()

        document.add(Field("text",content,Field.Store.YES,\

                 Field.Index.ANALYZED))

        index_writer.addDocument(document)

        print "Done: ", tfile

    index_writer.optimize()

    print index_writer.numDocs()

    index_writer.close()
コード例 #6
0
    def addDocuments(self, dir, maxFieldLength):

        writer = IndexWriter(dir, SimpleAnalyzer(), True,
                             IndexWriter.MaxFieldLength(maxFieldLength))
        
        for keyword, unindexed, unstored, text in \
                izip(self.keywords, self.unindexed, self.unstored, self.text):
            doc = Document()
            doc.add(Field("id", keyword,
                          Field.Store.YES, Field.Index.NOT_ANALYZED))
            doc.add(Field("country", unindexed,
                          Field.Store.YES, Field.Index.NO))
            doc.add(Field("contents", unstored,
                          Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("city", text,
                          Field.Store.YES, Field.Index.ANALYZED))
            writer.addDocument(doc)

        writer.optimize()
        writer.close()
コード例 #7
0
    def index(self, doc, title, department, url):
        indexdir = SimpleFSDirectory(File(self.indir))
        analyzer = StandardAnalyzer(Version.LUCENE_30)
        index_writer = IndexWriter(indexdir, analyzer, self.init,
                                   IndexWriter.MaxFieldLength(512))
        self.init = False

        # Initialize document and index it
        document = Document()
        document.add(
            Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
        document.add(Field("url", url, Field.Store.YES, Field.Index.ANALYZED))
        document.add(
            Field("department", department, Field.Store.YES,
                  Field.Index.ANALYZED))
        document.add(Field("text", doc, Field.Store.YES, Field.Index.ANALYZED))
        index_writer.addDocument(document)

        index_writer.optimize()
        index_writer.close()
コード例 #8
0
ファイル: indexize.py プロジェクト: wncios/Readers
def luceneIndexer(docdir, indir):
    """frpFile
    IndexDocuments from a directory

    para:{
        docdir: the path of the txt file
        indir: the path of the index file which is generated by the following code
        }
    """

    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXIDR = indir
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True, \
                               IndexWriter.MaxFieldLength(512))
    #for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')):
    list = os.listdir(DIRTOINDEX)
    for i in range(len(list)):
        tfile = os.path.join(DIRTOINDEX, list[i])
        if os.path.isfile(tfile):
            print ("Indexing: ", tfile)
            print ('okokokook')
            document = Document()
            content = open(tfile, 'r').read()
            document.add(Field("text", content, Field.Store.YES, \
                               Field.Index.ANALYZED))
            document.add(Field("title",str(tfile.strip('.txt')),Field.Store.YES,\
                               Field.Index.ANALYZED))
            index_writer.addDocument(document)
            #print (document)
            print ("Done: ", tfile)
    index_writer.optimize()
    print (index_writer.numDocs())
    index_writer.close()
コード例 #9
0
 def __init__(self, indir):
     lucene.initVM()
     indexdir = SimpleFSDirectory(File(indir))
     self.index_writer = IndexWriter(indexdir, self.getAnalyzer(), True,
                                     IndexWriter.MaxFieldLength(512))
コード例 #10
0
from common.stats import stats

import lucene
from lucene import \
    SimpleFSDirectory, System, File, \
    Document, Field, StandardAnalyzer, IndexWriter, Version

if __name__ == "__main__":
    lucene.initVM()
    # create an index called 'index-dir' in a temp directory
    #    indexDir = os.path.join(System.getProperty('java.io.tmpdir', 'tmp'),
    #                            'index-dir')
    indexDir = "/Tmp/REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    #    # set variables that affect speed of indexing
    #    writer.setMergeFactor(int(argv[2]))
    #    writer.setMaxMergeDocs(int(argv[3]))
    #    writer.setMaxBufferedDocs(int(argv[4]))
    #    # writer.infoStream = System.out
    #
    #    print "Merge factor:  ", writer.getMergeFactor()
    #    print "Max merge docs:", writer.getMaxMergeDocs()
    #    print "Max buffered docs:", writer.getMaxBufferedDocs()

    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs(
    )

    i = 0
コード例 #11
0
 def indexFile(self):
     self._th=lucene.initVM()
     self._analyzer = StandardAnalyzer(Version.LUCENE_36)
     self._dir = RAMDirectory()
     self._writer = IndexWriter(self._dir, self._analyzer, True, IndexWriter.MaxFieldLength(25000))
コード例 #12
0
ファイル: pylucene_test.py プロジェクト: danduma/minerva
import lucene
import json

from lucene import (SimpleFSDirectory, System, File, Document, Field,
                    StandardAnalyzer, IndexWriter, IndexSearcher, QueryParser)

if __name__ == "__main__":
    lucene.initVM()
    fullIndexDir = r"c:\NLP\PhD\bob\fileDB\LuceneFullIndex"

    print("lucene version is:", lucene.VERSION)

    fullIndex = SimpleFSDirectory(File(fullIndexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(fullIndex, analyzer, True,
                         IndexWriter.MaxFieldLength(20000000))
    ##    writer = IndexWriter(store, analyzer, True, IndexWriter.MaxFieldLength(512))

    print("Currently there are %d documents in the index..." %
          writer.numDocs())

    ##    print  "Reading lines from sys.stdin..."
    lines = [
        "bla bla bla bla bla", "Erase una vez que se era",
        "En un lugar de La Mancha de cuyo nombre no quiero acordarme, no ha mucho que vivia un hidalgo de los de lanza en ristre",
        "Manchame mancha mancha que te mancha la mancha"
    ]

    for l in lines:
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))