Beispiel #1
0
    def index(self):
        """Index everything under ``self._dataDir`` into ``self._indexDir``.

        Opens a ``SimpleFSDirectory`` on ``self._indexDir``, creates an
        ``IndexWriter`` with a ``StandardAnalyzer`` on it, delegates the
        actual document creation to ``self.indexDirectory`` and finally
        optimizes the index.

        Returns:
            The value of ``writer.numDocs()`` read after
            ``self.indexDirectory`` has run and before ``optimize()``.

        Raises:
            IOError: if ``self._dataDir`` is not an existing directory.
        """
        # os.path.isdir() is already False for a missing path, so a separate
        # exists() check is not needed.
        if not os.path.isdir(self._dataDir):
            # Call-style raise is valid on both Python 2 and Python 3; the
            # one-element tuple keeps the formatting safe for any value.
            raise IOError("%s isn't existed or is not a directory" % (
                self._dataDir,))

        # NOTE(review): Paths.get(...) looks like the Lucene 5+ way to open a
        # directory, while the IndexWriter(..., MaxFieldLength) constructor
        # and optimize() look like the Lucene 3.x API -- confirm which
        # PyLucene version this is meant to run against.
        directory = SimpleFSDirectory(Paths.get(self._indexDir))
        try:
            # The True argument is presumably the "create" flag -- TODO confirm.
            writer = IndexWriter(directory, StandardAnalyzer(), True,
                                 IndexWriter.MaxFieldLength.LIMITED)
            try:
                writer.setUseCompoundFile(False)
                self.indexDirectory(writer, self._dataDir)
                numIndexed = writer.numDocs()
                writer.optimize()
            finally:
                # Close the writer even when indexing fails part-way.
                writer.close()
        finally:
            directory.close()

        return numIndexed
Beispiel #2
0
class FileIndexer:
    """Collect files with a given extension and add them to a Lucene index.

    On construction the tree below ``dataDir`` is walked and every file whose
    name ends in ``.<fileExtension>`` is remembered.  ``indexFiles`` then adds
    one document per remembered file, and ``optimizeAndClose`` finishes the
    index.
    """

    def __init__(self, dataDir, fileExtension, indexDir):
        """Scan ``dataDir`` and open an index writer on ``indexDir``.

        Args:
            dataDir: root directory that is walked recursively.
            fileExtension: extension to match, without the leading dot
                (compared case-sensitively).
            indexDir: passed as first argument to ``IndexWriter``.
        """
        # Per-instance list: a class-level list would be shared by every
        # FileIndexer and keep accumulating paths across instances.
        self.__fileList = []
        # os.walk exists on Python 2 and 3 (os.path.walk is Python 2 only)
        # and separates files from sub-directories, so only real files are
        # handed to the matcher.
        for dirname, _subdirs, filenames in os.walk(dataDir):
            self.__fileSearcher(fileExtension, dirname, filenames)
        # The True argument is presumably the "create" flag -- TODO confirm.
        self.__writer = IndexWriter(indexDir, StandardAnalyzer(), True)
        self.__writer.setUseCompoundFile(False)

    def __fileSearcher(self, fileExtension, dirname, filenames):
        """Append to the file list every name in ``filenames`` that matches.

        A name matches when it contains a dot and the text after its last dot
        equals ``fileExtension``; the stored value is ``dirname`` joined with
        the name.
        """
        for filename in filenames:
            # Require a dot so that a file literally named like the extension
            # (e.g. "txt") is not mistaken for a match.
            if '.' not in filename:
                continue
            if filename.rsplit('.', 1)[-1] == fileExtension:
                self.__fileList.append(os.path.join(dirname, filename))

    def getNames(self):
        """Return the list of file paths collected for indexing."""
        return self.__fileList

    def indexFiles(self):
        """Add one document per collected file to the index writer.

        Each document gets a stored, tokenized ``contents`` field holding the
        file's text and a stored, untokenized ``path`` field holding the
        file's canonical path.  The plain and canonical paths are printed as
        progress output.
        """
        for filename in self.__fileList:
            print(filename)
            # Resolve the canonical path once and reuse it below.
            canonicalPath = File(filename).getCanonicalPath()
            print(canonicalPath)
            # Context manager so the file handle is released right away.
            with open(filename, 'r') as handle:
                contents = handle.read()
            doc = Document()
            doc.add(
                Field("contents", contents, Field.Store.YES,
                      Field.Index.TOKENIZED))
            doc.add(
                Field("path", canonicalPath, Field.Store.YES,
                      Field.Index.UN_TOKENIZED))
            self.__writer.addDocument(doc)

    def optimizeAndClose(self):
        """Optimize the index, close the writer and return its doc count.

        Returns:
            The value of the writer's ``docCount()`` read before optimizing.
        """
        try:
            docCount = self.__writer.docCount()
            self.__writer.optimize()
        finally:
            # Always release the writer, even if optimize() fails.
            self.__writer.close()
        return docCount