def reindex(self):
    """Rebuild the whole index using the corpus' current analyzer, then
    report success back through the parent channel."""
    store = SimpleFSDirectory(File(self.corpus.path))
    # False => open the existing index for writing, do not recreate it.
    writer = IndexWriter(store, self.corpus.analyzer, False,
                         IndexWriter.MaxFieldLength.LIMITED)
    indexutils.reindex_all(self.reader, writer, self.corpus.analyzer)
    writer.optimize()
    writer.close()

    done_msg = "Reindex successful. Corpus analyzer is now set to %s." % (self.corpus.analyzer_str,)
    self.parent.write({'message': done_msg})
    self.parent.write({'status': "Ready!"})
Exemple #2
0
    def removeindex(self, data):
        """Remove the record carried in *data* from the Lucene index.

        The record is matched by its "_id" field.
        """
        record_id = data['record']['_id']

        writer = IndexWriter(self.d, self.conf)
        writer.deleteDocuments(lucene.Term("_id", record_id))
        writer.optimize()
        writer.close()
Exemple #3
0
    def updateindex(self, data):
        """Re-index one record: replace the document whose "_id" matches
        data['record'] with a freshly built one."""
        record = data['record']
        doc = self.buildDocument(data['fields'], record)

        writer = IndexWriter(self.d, self.conf)
        # updateDocument = atomic delete-by-term + add.
        writer.updateDocument(lucene.Term("_id", record['_id']), doc)
        writer.optimize()
        writer.close()
Exemple #4
0
    def index(self):
        """Index every file under self._dataDir into self._indexDir.

        Returns the number of documents in the index after indexing.
        Raises IOError if the data directory is missing or not a directory.
        """
        # os.path.isdir() is already False for non-existent paths, so a
        # separate os.path.exists() check is redundant.
        if not os.path.isdir(self._dataDir):
            # Parenthesized raise works in both Python 2 and 3 (the original
            # `raise IOError, msg` form is Python-2-only).
            raise IOError("%s does not exist or is not a directory" % (
                self._dataDir,))

        # Don't shadow the `dir` builtin (original bound it to the directory).
        store = SimpleFSDirectory(Paths.get(self._indexDir))
        writer = IndexWriter(store, StandardAnalyzer(), True,
                             IndexWriter.MaxFieldLength.LIMITED)
        writer.setUseCompoundFile(False)
        try:
            self.indexDirectory(writer, self._dataDir)
            numIndexed = writer.numDocs()
            writer.optimize()
        finally:
            # Release the writer and directory even if indexing fails,
            # otherwise the index lock is leaked.
            writer.close()
            store.close()

        return numIndexed
Exemple #5
0
    def xmlrpc_indexDocument(self, instance, id, text):
        """Index a new document, replacing any previous one with the same id."""
        # Drop any stale copy first so the id stays unique in the index.
        self.xmlrpc_unindexDocument(instance, id)

        # Build the document from its three fields; only "text" is tokenized.
        doc = Document()
        for name, value, indexMode in (
                ('id', id, Field.Index.UN_TOKENIZED),
                ('text', text, Field.Index.TOKENIZED),
                ('instance', instance, Field.Index.UN_TOKENIZED)):
            doc.add(Field(name, value, Field.Store.YES, indexMode))

        # Third argument 0 (false): append to the existing index.
        writer = IndexWriter(self.indexPath, self.analyzer, 0)
        writer.addDocument(doc)
        writer.optimize()
        writer.close()

        log('Insert: Instance: %s Document: %s' % (instance, id))
        return 1
    def _init_index(self):
        """Open (creating it first if necessary) the Lucene index for
        self.corpus.

        Side effects: sets self.lucene_index, self.searcher, self.reader
        and self.analyzer.
        """
        if not os.path.exists(self.corpus.path):
            os.mkdir(self.corpus.path)

        try:
            # Probe whether a readable index already exists at the path.
            probe = IndexSearcher(SimpleFSDirectory(File(self.corpus.path)), True)
        except Exception:
            # Original used a bare `except:`; narrow it to Exception.
            # Expected failure is lucene.JavaError when no index exists yet
            # (see the commented-out clause in the original) — create an
            # empty index so the reader/searcher below can open it.
            writer = IndexWriter(SimpleFSDirectory(File(self.corpus.path)),
                                 self.corpus.analyzer, True,
                                 IndexWriter.MaxFieldLength.LIMITED)
            writer.setMaxFieldLength(1048576)
            writer.optimize()
            writer.close()
        else:
            # The probe searcher was only needed for detection; close it so
            # it doesn't leak (the real searcher is opened below).
            probe.close()

        self.lucene_index = SimpleFSDirectory(File(self.corpus.path))
        self.searcher = IndexSearcher(self.lucene_index, True)
        self.reader = IndexReader.open(self.lucene_index, True)
        self.analyzer = self.corpus.analyzer
Exemple #7
0
class FileIndexer:

    __fileList = []

    def __init__(self, dataDir, fileExtension, indexDir):
        os.path.walk(dataDir, self.__fileSearcher, fileExtension)
        self.__writer = IndexWriter(indexDir, StandardAnalyzer(), True)
        self.__writer.setUseCompoundFile(False)

    def __fileSearcher(self, fileExtension, dirname, filenames):
        #print "Directory:", dirname
        for filename in filenames:
            if filename.split('.')[-1] == fileExtension:
                self.__fileList.append(os.path.join(dirname, filename))

    def getNames(self):
        return self.__fileList

    def indexFiles(self):
        for filename in self.__fileList:
            print filename
            print File(filename).getCanonicalPath()
            doc = Document()
            doc.add(
                Field("contents",
                      open(filename, 'r').read(), Field.Store.YES,
                      Field.Index.TOKENIZED))
            doc.add(
                Field("path",
                      File(filename).getCanonicalPath(), Field.Store.YES,
                      Field.Index.UN_TOKENIZED))
            self.__writer.addDocument(doc)

    def optimizeAndClose(self):
        docCount = self.__writer.docCount()
        self.__writer.optimize()
        self.__writer.close()
        return docCount
Exemple #8
0
# update an existing index (adding a new document).

from java.net import URL
from java.util import Date

# from de.nava.informa.impl.basic import ChannelBuilder
# from de.nava.informa.utils import ChannelRegistry
from de.nava.informa.impl.basic import Item
from de.nava.informa.search import ItemDocument

from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexWriter

# update (3rd arg) index writer in directory (first arg)
start_time = Date()
writer = IndexWriter("index", StandardAnalyzer(), 0)

# create new (dummy) item
item = Item("Informa released", "blubb", URL("http://nava.de/news/2002/06/25"))
item.setFound(Date())

# add new item to index
writer.addDocument(ItemDocument.makeDocument(item))

writer.optimize()
writer.close()
end_time = Date()

print "updating the index took %d milliseconds in total." \
      % (end_time.getTime() - start_time.getTime())
from java.net import URL
from java.util import Date

# from de.nava.informa.impl.basic import ChannelBuilder
# from de.nava.informa.utils import ChannelRegistry
from de.nava.informa.impl.basic import Item
from de.nava.informa.search import ItemDocument

from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexWriter


# update (3rd arg) index writer in directory (first arg)
start_time = Date()
writer = IndexWriter("index", StandardAnalyzer(), 0)

# create new (dummy) item
item = Item("Informa released", "blubb",
            URL("http://nava.de/news/2002/06/25"))
item.setFound(Date())
	    
# add new item to index
writer.addDocument(ItemDocument.makeDocument(item))

writer.optimize()
writer.close()
end_time = Date()

print "updating the index took %d milliseconds in total." \
      % (end_time.getTime() - start_time.getTime())