Ejemplo n.º 1
0
class TermNumerator(Observable):
    """Assigns numbers to terms by storing them in a Lucene taxonomy.

    Every term is added as a single-component ``FacetLabel`` through a
    ``DirectoryTaxonomyWriter``; the value returned by ``addCategory`` is
    handed back to the caller as the term's number. ``getTerm`` performs the
    reverse lookup through a taxonomy reader opened on the writer.
    """

    def __init__(self, path, lruTaxonomyWriterCacheSize=100):
        """Open the taxonomy at ``path`` in CREATE_OR_APPEND mode.

        ``lruTaxonomyWriterCacheSize`` is passed to ``LruTaxonomyWriterCache``.
        """
        Observable.__init__(self)
        taxoDirectory = MMapDirectory(File(path))
        taxoDirectory.setUseUnmap(False)
        self._taxoWriter = DirectoryTaxonomyWriter(
            taxoDirectory,
            IndexWriterConfig.OpenMode.CREATE_OR_APPEND,
            LruTaxonomyWriterCache(lruTaxonomyWriterCacheSize))
        # Opened lazily by getTerm(); tracked from the start so that close()
        # can release it.
        self._taxoReader = None

    def numerateTerm(self, term):
        """Return the number for ``term``; return None for a falsy term."""
        if not term:
            return
        return self._taxoWriter.addCategory(FacetLabel([term]))

    def getTerm(self, nr):
        """Return the term that was stored under number ``nr``."""
        if getattr(self, "_taxoReader", None) is None:
            self._taxoReader = DirectoryTaxonomyReader(self._taxoWriter)
        # openIfChanged yields a new reader only when there is something newer
        # to see; in that case the previous reader is closed and replaced.
        tr = DirectoryTaxonomyReader.openIfChanged(self._taxoReader)
        if tr:
            self._taxoReader.close()
            self._taxoReader = tr
        return self._taxoReader.getPath(nr).components[0]

    def handleShutdown(self):
        """Announce the shutdown on stdout and commit the taxonomy writer."""
        import sys
        # Parenthesised single-argument form prints the same text on
        # Python 2 and Python 3.
        print('handle shutdown: saving TermNumerator')
        sys.stdout.flush()
        self._taxoWriter.commit()

    def close(self):
        """Close the lazily opened reader (if any) and then the writer."""
        reader = getattr(self, "_taxoReader", None)
        self._taxoReader = None
        try:
            if reader is not None:
                reader.close()
        finally:
            # The writer is closed even when closing the reader fails.
            self._taxoWriter.close()
Ejemplo n.º 2
0
class IndexAndTaxonomy(object):
    """Holds an index reader, a taxonomy reader and a lazily built searcher.

    The searcher is (re)built on first access after ``reopen`` picked up a new
    index reader or after the read settings were changed.
    """

    def __init__(self, settings, indexDirectory=None, taxoDirectory=None):
        """Open both readers and install the read-settings accessors.

        ``settings`` must provide ``similarity``, ``numberOfConcurrentTasks``
        and ``multithreaded``.
        """
        self._settings = settings
        self._similarity = settings.similarity
        self._numberOfConcurrentTasks = settings.numberOfConcurrentTasks
        self._reader = DirectoryReader.open(indexDirectory)
        self.taxoReader = DirectoryTaxonomyReader(taxoDirectory)
        self._readerSettingsWrapper = ReaderSettingsWrapper()
        self._readerSettingsWrapper.get = lambda: {
            "similarity": self.searcher.getSimilarity().toString(),
            "numberOfConcurrentTasks": self._numberOfConcurrentTasks,
        }
        self._readerSettingsWrapper.set = self._setReadSettings
        self._searcher = None
        self._executor = None
        self._reopenSearcher = True

    def reopen(self):
        """Swap in newer readers when the index has changed.

        The taxonomy reader is only checked when a new index reader was
        obtained; otherwise nothing is touched.
        """
        reader = DirectoryReader.openIfChanged(self._reader)
        if reader is None:
            return
        self._reader.close()
        self._reader = reader
        self._reopenSearcher = True
        taxoReader = DirectoryTaxonomyReader.openIfChanged(self.taxoReader)
        if taxoReader is None:
            return
        self.taxoReader.close()
        self.taxoReader = taxoReader

    @property
    def searcher(self):
        """Return the current searcher, rebuilding it when flagged stale."""
        if not self._reopenSearcher:
            return self._searcher

        if self._settings.multithreaded:
            # A previous pool is shut down before a new one is created.
            if self._executor:
                self._executor.shutdown()
            self._executor = Executors.newFixedThreadPool(self._numberOfConcurrentTasks)
            self._searcher = SuperIndexSearcher(self._reader, self._executor, self._numberOfConcurrentTasks)
        else:
            self._searcher = IndexSearcher(self._reader)
        self._searcher.setSimilarity(self._similarity)
        self._reopenSearcher = False
        return self._searcher

    def _setReadSettings(self, similarity=None, numberOfConcurrentTasks=None):
        """Update similarity and task count; ``None`` restores the default.

        ``similarity``, when given, is indexed with ``"k1"`` and ``"b"`` to
        build a ``BM25Similarity``. The searcher is flagged for rebuild.
        """
        # This method must be thread-safe
        if similarity is None:
            self._similarity = self._settings.similarity
        else:
            self._similarity = BM25Similarity(similarity["k1"], similarity["b"])

        if numberOfConcurrentTasks is None:
            self._numberOfConcurrentTasks = self._settings.numberOfConcurrentTasks
        else:
            self._numberOfConcurrentTasks = numberOfConcurrentTasks
        self._reopenSearcher = True

    def close(self):
        """Shut down the search thread pool (if any) and close both readers."""
        executor, self._executor = self._executor, None
        try:
            # Without this the pool created in `searcher` is never released.
            if executor is not None:
                executor.shutdown()
        finally:
            try:
                self.taxoReader.close()
            finally:
                # The index reader is closed even if the taxonomy close fails.
                self._reader.close()
Ejemplo n.º 3
0
 def getTerm(self, nr):
     """Look up the first path component stored under ordinal ``nr``.

     A taxonomy reader is created on the writer the first time this is
     called and replaced whenever ``openIfChanged`` hands back a newer one.
     """
     if not hasattr(self, "_taxoReader"):
         self._taxoReader = DirectoryTaxonomyReader(self._taxoWriter)
     refreshed = DirectoryTaxonomyReader.openIfChanged(self._taxoReader)
     if refreshed:
         # Release the outdated reader before switching to the new one.
         self._taxoReader.close()
         self._taxoReader = refreshed
     path = self._taxoReader.getPath(nr)
     return path.components[0]
Ejemplo n.º 4
0
 def runDrillDown(self):
     """Run the drill-down search and return its facet result.

     Opens a taxonomy reader on ``self.taxoDir`` and an index reader on
     ``self.indexDir``; both are closed again even when the search or the
     opening of the index reader raises.
     """
     # open readers
     taxo = DirectoryTaxonomyReader(self.taxoDir)
     indexReader = None
     try:
         indexReader = DirectoryReader.open(self.indexDir)
         return SimpleSearcher.searchWithDrillDown(indexReader, taxo)
     finally:
         # close readers (same order as before: taxonomy first)
         try:
             taxo.close()
         finally:
             if indexReader is not None:
                 indexReader.close()
Ejemplo n.º 5
0
 def runSimple(self):
     """Run the plain facet search and return its result.

     Opens a taxonomy reader on ``self.taxoDir`` and an index reader on
     ``self.indexDir``; both are closed again even when the search or the
     opening of the index reader raises.
     """
     # open readers
     taxo = DirectoryTaxonomyReader(self.taxoDir)
     indexReader = None
     try:
         indexReader = DirectoryReader.open(self.indexDir)
         # returns List<FacetResult>
         return SimpleSearcher.searchWithFacets(indexReader, taxo)
     finally:
         # close readers (same order as before: taxonomy first)
         try:
             taxo.close()
         finally:
             if indexReader is not None:
                 indexReader.close()
Ejemplo n.º 6
0
    def runDrillDown(self):
        """Run a drill-down search for every entry of ``drilldownCategories``.

        Returns the facet result of the last search, or ``None`` when
        ``drilldownCategories`` is empty (previously an ``UnboundLocalError``).
        Both readers are closed even when a search raises.
        """
        facetRes = None
        # open readers
        taxo = DirectoryTaxonomyReader(self.taxoDir)
        indexReader = None
        try:
            indexReader = DirectoryReader.open(self.indexDir)
            for drilldown in drilldownCategories:
                # Parenthesised form prints the same text on Python 2 and 3.
                print("search with drilldown: %s" % '/'.join(drilldown))
                facetRes = SimpleSearcher.searchWithDrillDown(
                    drilldown, indexReader, taxo, self.facets_config)
        finally:
            # close readers (same order as before: taxonomy first)
            try:
                taxo.close()
            finally:
                if indexReader is not None:
                    indexReader.close()
        # return result
        return facetRes
Ejemplo n.º 7
0
 def __init__(self, settings, indexDirectory=None, taxoDirectory=None):
     """Open the index and taxonomy readers and wire up the read settings.

     The searcher itself is not built here; it is flagged for creation on
     first access.
     """
     self._settings = settings
     self._similarity = settings.similarity
     self._numberOfConcurrentTasks = settings.numberOfConcurrentTasks
     self._reader = DirectoryReader.open(indexDirectory)
     self.taxoReader = DirectoryTaxonomyReader(taxoDirectory)

     def currentReadSettings():
         # Reports the similarity of the live searcher and the task count.
         similarityText = self.searcher.getSimilarity().toString()
         return {
             "similarity": similarityText,
             "numberOfConcurrentTasks": self._numberOfConcurrentTasks
         }

     self._readerSettingsWrapper = ReaderSettingsWrapper()
     self._readerSettingsWrapper.get = currentReadSettings
     self._readerSettingsWrapper.set = self._setReadSettings
     self._searcher = None
     self._executor = None
     self._reopenSearcher = True
Ejemplo n.º 8
0
    def runSimple(self):
        """Search once per entry of ``searchValues``, then over all documents.

        Only the result of the final all-documents facet search is returned;
        the per-term results were never used by the original either. Both
        readers are closed even when a search raises.
        """
        # open readers
        taxo = DirectoryTaxonomyReader(self.taxoDir)
        indexReader = None
        try:
            indexReader = DirectoryReader.open(self.indexDir)
            for term in searchValues:
                # Parenthesised form prints the same text on Python 2 and 3.
                print("\nsearch by term '%s' ..." % term)
                SimpleSearcher.searchWithTerm(term, indexReader, taxo,
                                              self.facets_config)
            print("\nsearch all documents  ...")
            facetRes = SimpleSearcher.searchWithFacets(indexReader, taxo,
                                                       self.facets_config)
        finally:
            # close readers (same order as before: taxonomy first)
            try:
                taxo.close()
            finally:
                if indexReader is not None:
                    indexReader.close()
        # return result
        return facetRes
Ejemplo n.º 9
0
 def reopen(self):
     """Replace the readers with newer ones when the index has changed.

     Nothing happens when ``openIfChanged`` reports no new index reader; the
     taxonomy reader is only looked at after the index reader was replaced.
     """
     freshIndexReader = DirectoryReader.openIfChanged(self._reader)
     if freshIndexReader is not None:
         self._reader.close()
         self._reader = freshIndexReader
         # The searcher wraps the old reader, so it has to be rebuilt.
         self._reopenSearcher = True
         freshTaxoReader = DirectoryTaxonomyReader.openIfChanged(self.taxoReader)
         if freshTaxoReader is not None:
             self.taxoReader.close()
             self.taxoReader = freshTaxoReader
Ejemplo n.º 10
0
 def reopen(self):
     """Pick up a changed index and, if it changed too, a changed taxonomy."""
     newer = DirectoryReader.openIfChanged(self._reader)
     if newer is None:
         # Index unchanged: keep both current readers as they are.
         return
     outdated = self._reader
     outdated.close()
     self._reader, self._reopenSearcher = newer, True

     newerTaxo = DirectoryTaxonomyReader.openIfChanged(self.taxoReader)
     if newerTaxo is not None:
         self.taxoReader.close()
         self.taxoReader = newerTaxo
Ejemplo n.º 11
0
 def __init__(self, settings, indexDirectory=None, taxoDirectory=None):
     """Store the settings, open both readers and expose the read settings.

     ``get`` on the settings wrapper reports the searcher's similarity and
     the task count; ``set`` is routed to ``_setReadSettings``.
     """
     self._settings = settings
     self._similarity = settings.similarity
     self._numberOfConcurrentTasks = settings.numberOfConcurrentTasks
     self._reader = DirectoryReader.open(indexDirectory)
     self.taxoReader = DirectoryTaxonomyReader(taxoDirectory)
     self._readerSettingsWrapper = ReaderSettingsWrapper()
     self._readerSettingsWrapper.get = lambda: dict([
         ("similarity", self.searcher.getSimilarity().toString()),
         ("numberOfConcurrentTasks", self._numberOfConcurrentTasks),
     ])
     self._readerSettingsWrapper.set = self._setReadSettings
     # No searcher or thread pool yet; both are created on first access.
     self._searcher = self._executor = None
     self._reopenSearcher = True
Ejemplo n.º 12
0
class IndexAndTaxonomy(object):
    """Pairs an index reader with a taxonomy reader and a cached searcher.

    The searcher is created on demand and recreated after ``reopen`` swapped
    the index reader or after ``_setReadSettings`` changed the read settings.
    """

    def __init__(self, settings, indexDirectory=None, taxoDirectory=None):
        """Open both readers and expose the read settings via a wrapper.

        ``settings`` must provide ``similarity``, ``numberOfConcurrentTasks``
        and ``multithreaded``.
        """
        self._settings = settings
        self._similarity = settings.similarity
        self._numberOfConcurrentTasks = settings.numberOfConcurrentTasks
        self._reader = DirectoryReader.open(indexDirectory)
        self.taxoReader = DirectoryTaxonomyReader(taxoDirectory)
        self._readerSettingsWrapper = ReaderSettingsWrapper()
        self._readerSettingsWrapper.get = lambda: {
            "similarity": self.searcher.getSimilarity().toString(),
            "numberOfConcurrentTasks": self._numberOfConcurrentTasks
        }
        self._readerSettingsWrapper.set = self._setReadSettings
        self._searcher = None
        self._executor = None
        self._reopenSearcher = True

    def reopen(self):
        """Replace the readers with newer ones when the index has changed.

        The taxonomy reader is only checked after a new index reader was
        obtained.
        """
        reader = DirectoryReader.openIfChanged(self._reader)
        if reader is None:
            return
        self._reader.close()
        self._reader = reader
        self._reopenSearcher = True
        taxoReader = DirectoryTaxonomyReader.openIfChanged(self.taxoReader)
        if taxoReader is None:
            return
        self.taxoReader.close()
        self.taxoReader = taxoReader

    @property
    def searcher(self):
        """Return the cached searcher, building a new one when flagged."""
        if not self._reopenSearcher:
            return self._searcher

        if self._settings.multithreaded:
            # Retire the previous pool before creating its replacement.
            if self._executor:
                self._executor.shutdown()
            self._executor = Executors.newFixedThreadPool(
                self._numberOfConcurrentTasks)
            self._searcher = SuperIndexSearcher(self._reader, self._executor,
                                                self._numberOfConcurrentTasks)
        else:
            self._searcher = IndexSearcher(self._reader)
        self._searcher.setSimilarity(self._similarity)
        self._reopenSearcher = False
        return self._searcher

    def _setReadSettings(self, similarity=None, numberOfConcurrentTasks=None):
        """Apply new read settings; ``None`` falls back to ``settings``.

        ``similarity``, when given, is indexed with ``"k1"`` and ``"b"`` to
        build a ``BM25Similarity``. The searcher is flagged for rebuild.
        """
        # This method must be thread-safe
        if similarity is None:
            self._similarity = self._settings.similarity
        else:
            self._similarity = BM25Similarity(similarity["k1"],
                                              similarity["b"])

        if numberOfConcurrentTasks is None:
            self._numberOfConcurrentTasks = self._settings.numberOfConcurrentTasks
        else:
            self._numberOfConcurrentTasks = numberOfConcurrentTasks
        self._reopenSearcher = True

    def close(self):
        """Release the thread pool (when one was created) and both readers."""
        pool = self._executor
        self._executor = None
        try:
            # The pool created by `searcher` would otherwise never be shut down.
            if pool is not None:
                pool.shutdown()
        finally:
            try:
                self.taxoReader.close()
            finally:
                # Always attempted, also when the taxonomy close raised.
                self._reader.close()