def test_persistentCacheUnderLoad(self):
        cppView1 = CumulusNative.PersistentCacheIndex(
            self.sharedState.newView(), callbackScheduler)

        t0 = time.time()
        #add 100k pages, which is enough for about 5 TB of data
        for index in range(100000):
            if index % 1000 == 0 and index > 0:
                print index, (time.time() - t0) / (index /
                                                   1000.0), " seconds per 1000"
            cppView1.addPage(sha1("page" + str(index)), HashSet(), 1, sha1(""))

        print "took ", time.time() - t0, " to add 100k."

        t1 = time.time()

        bytes0 = TCMallocNative.getBytesUsed()

        cppView2 = CumulusNative.PersistentCacheIndex(
            self.sharedState.newView(), callbackScheduler)

        while cppView2.totalBytesInCache() < 100000:
            time.sleep(.1)
            print cppView2.totalBytesInCache()

        print "took ", time.time() - t1, " to load 100k. Total RAM is ", (TCMallocNative.getBytesUsed() - bytes0) / 1024 / 1024.0, " MB per view"\
Example #2
0
def createWorker(machineId,
                 viewFactory,
                 callbackSchedulerToUse=None,
                 threadCount=2,
                 memoryLimitMb=100):
    if callbackSchedulerToUse is None:
        callbackSchedulerToUse = CallbackScheduler.singletonForTesting()

    vdm = ForaNative.VectorDataManager(callbackSchedulerToUse, 5 * 1024 * 1024)
    vdm.setMemoryLimit(
        int(memoryLimitMb * 1024 * 1024),
        min(int(memoryLimitMb * 1.25 * 1024 * 1024),
            int((memoryLimitMb + 1024 * 2) * 1024 * 1024)))

    vdm.setPersistentCacheIndex(
        CumulusNative.PersistentCacheIndex(viewFactory.createView(),
                                           callbackSchedulerToUse))

    cache = CumulusNative.SimpleOfflineCache(callbackSchedulerToUse,
                                             1000 * 1024 * 1024)

    eventHandler = CumulusNative.CumulusWorkerHoldEventsInMemoryEventHandler()

    return (CumulusNative.CumulusWorker(
        callbackSchedulerToUse,
        CumulusNative.CumulusWorkerConfiguration(
            machineId, threadCount,
            CumulusNative.CumulusCheckpointPolicy.None (),
            ExecutionContext.createContextConfiguration(), ""), vdm, cache,
        eventHandler), vdm, eventHandler)
    def circularPageReferenceTest(self, shouldBeInvalid):
        """Check invalid-object detection for a page/bigvec reference cycle.

        Builds a checkpoint whose file refers to ``bigvec1``, which in turn
        refers to ``page1``. When ``shouldBeInvalid`` is true, ``page1`` is
        registered with ``bigvec1`` in its own set, closing the cycle, and
        four objects are expected to be reported invalid. Otherwise ``page1``
        gets an empty set and none are expected.

        Args:
            shouldBeInvalid: whether to create the circular reference.
        """
        cppView1 = CumulusNative.PersistentCacheIndex(
            self.sharedState.newView(), callbackScheduler)
        computationId = CumulusNative.ComputationId.Root(sha1("computation"))

        checkpointRequest = CumulusNative.CheckpointRequest(
            0.0, True, computationId)

        cppView1.addBigvec(sha1("bigvec1"),
                           HashSet() + sha1("page1"), 2, sha1(""))
        cppView1.addPage(
            sha1("page1"),
            (HashSet() + sha1("bigvec1")) if shouldBeInvalid else HashSet(), 1,
            sha1(""))
        cppView1.addCheckpointFile(checkpointRequest, sha1("file"),
                                   HashSet() + sha1("bigvec1"), 2, sha1(""))
        cppView1.addCheckpoint(checkpointRequest,
                               HashSet() + sha1("file"), 2, sha1(""), True,
                               1.0, HashSet())

        expectedInvalidCount = 4 if shouldBeInvalid else 0

        # Query once so the value reported on failure is exactly the value
        # that was compared.
        invalidCount = len(cppView1.computeInvalidObjects())

        self.assertEqual(
            invalidCount, expectedInvalidCount,
            "%s != %s" % (invalidCount, expectedInvalidCount))
    def test_writing_while_disconnected(self):
        """Pages written while the view keeps being reset must not be lost.

        One thread adds 100 pages through ``cppView1`` while a second thread
        repeatedly disconnects the current view and installs a fresh one with
        ``resetView``. Afterwards more than 10 reconnects must have been
        counted, and all 100 pages must be visible both through ``cppView1``
        and through a newly created index.
        """
        # Held in a one-element list so the nested function below can replace
        # the current view.
        currentView = [self.sharedState.newView()]

        cppView1 = CumulusNative.PersistentCacheIndex(currentView[0],
                                                      callbackScheduler)

        def writeInLoop():
            for ix in range(100):
                time.sleep(0.01)
                cppView1.addPage(sha1("page" + str(ix)), HashSet(), ix,
                                 sha1(""))

        thread1 = threading.Thread(target=writeInLoop)
        thread1.start()

        def disconnectAndReconnectInLoop():
            # Keep cycling the connection for as long as the writer runs.
            while thread1.is_alive():
                time.sleep(0.004)
                currentView[0].disconnect()
                currentView[0] = self.sharedState.newView()
                cppView1.resetView(currentView[0])

        thread2 = threading.Thread(target=disconnectAndReconnectInLoop)
        thread2.start()

        thread1.join()
        thread2.join()

        self.assertGreater(cppView1.timesViewReconnected(), 10)

        cppView2 = CumulusNative.PersistentCacheIndex(
            self.sharedState.newView(), callbackScheduler)

        # Fixed pause before checking what the new index can see.
        # NOTE(review): presumably this gives cppView2 time to load; a polling
        # wait would be less timing-sensitive.
        time.sleep(2.0)

        count1 = 0
        count2 = 0
        for ix in range(100):
            if cppView1.pageExists(sha1("page" + str(ix))):
                count1 += 1

            if cppView2.pageExists(sha1("page" + str(ix))):
                count2 += 1

        # Comparing the pair shows both counts in the failure output.
        self.assertEqual((count1, count2), (100, 100))
def createWorker_(machineId,
                  viewFactory,
                  callbackSchedulerToUse,
                  threadCount,
                  memoryLimitMb,
                  cacheFunction,
                  pageSizeOverride,
                  disableEventHandler):
    if callbackSchedulerToUse is None:
        callbackSchedulerToUse = CallbackScheduler.singletonForTesting()

    vdm = ForaNative.VectorDataManager(
        callbackSchedulerToUse,
        pageSizeOverride if pageSizeOverride is not None else
        1 * 1024 * 1024 if memoryLimitMb < 1000 else
        5 * 1024 * 1024 if memoryLimitMb < 5000 else
        50 * 1024 * 1024
        )

    vdm.setMemoryLimit(
        int(memoryLimitMb * 1024 * 1024),
        min(int(memoryLimitMb * 1.25 * 1024 * 1024),
            int((memoryLimitMb + 1024 * 2) * 1024 * 1024))
        )

    vdm.setPersistentCacheIndex(
        CumulusNative.PersistentCacheIndex(
            viewFactory.createView(),
            callbackSchedulerToUse
            )
        )

    cache = cacheFunction()

    if disableEventHandler:
        eventHandler = CumulusNative.CumulusWorkerIgnoreEventHandler()
    else:
        eventHandler = CumulusNative.CumulusWorkerHoldEventsInMemoryEventHandler()

    return (
        CumulusNative.CumulusWorker(
            callbackSchedulerToUse,
            CumulusNative.CumulusWorkerConfiguration(
                machineId,
                threadCount,
                CumulusNative.CumulusCheckpointPolicy.None(),
                ExecutionContext.createContextConfiguration(),
                ""
                ),
            vdm,
            cache,
            eventHandler
            ),
        vdm,
        eventHandler
        )
Example #6
0
    def __init__(self, callbackScheduler, vdm, viewFactory):
        """Create the Cumulus client objects and start the message threads.

        Args:
            callbackScheduler: scheduler handed to the native objects created
                here; its factory is also used to create a separate scheduler
                for the CPU-assignment dependency graph.
            vdm: vector data manager; the persistent cache index created here
                is installed on it via ``setPersistentCacheIndex``.
            viewFactory: used to create the view backing the persistent cache
                index (``retrySeconds=10.0, numRetries=10``).
        """
        Stoppable.Stoppable.__init__(self)

        self.lock_ = threading.Lock()
        self.callbackScheduler = callbackScheduler
        self.definitionToIdMap_ = {}
        self.idToDefinitionMap_ = {}
        self.vdm = vdm

        self.onJsonViewOfSystemChanged = None

        self.persistentCacheIndex = CumulusNative.PersistentCacheIndex(
            viewFactory.createView(retrySeconds=10.0, numRetries=10),
            callbackScheduler)

        self.vdm.setPersistentCacheIndex(self.persistentCacheIndex)

        # The client id is derived from a fresh random UUID.
        self.cumulusClientId = CumulusNative.CumulusClientId(
            Hash.Hash.sha1(str(uuid.uuid4())))

        logging.info("CumulusClient created with %s", self.cumulusClientId)

        self.cumulusClient = CumulusNative.CumulusClient(
            vdm, self.cumulusClientId, self.callbackScheduler)

        self.finalResponses = Queue.Queue()

        self.cumulusClientListener = self.cumulusClient.createListener()

        self.cpuAssignmentDependencyGraph = CumulusNative.CpuAssignmentDependencyGraph(
            self.callbackScheduler.getFactory().createScheduler(
                self.callbackScheduler.getMetadata() + "_cpuAssignmentGraph",
                1), self.vdm)
        self.cpuAssignmentDependencyGraph.subscribeToCumulusClient(
            self.cumulusClient)

        self.pendingCallbacksByGuid = {}

        self.cpuAssignmentDependencyGraphListener = \
            self.cpuAssignmentDependencyGraph.createListener()

        # Assign every remaining attribute BEFORE the worker threads start, so
        # the thread targets can never run against a partially constructed
        # object.
        self.nextCpuUpdateTime = time.time()
        self.cpuMessagesSinceLastUpdate = 0
        self.lastSystemwideUpdateTime = time.time()

        self.threads = []

        self.threads.append(
            ManagedThread.ManagedThread(target=self.processClientMessages_,
                                        args=()))
        self.threads.append(
            ManagedThread.ManagedThread(
                target=self.processDependencyGraphMessages_, args=()))

        for t in self.threads:
            t.start()
    def test_basicPersistentCache(self):
        """Objects added through one index become visible through another.

        Two pages and two bigvecs (byte counts 1, 2, 3 and 4) are added via
        the first index. Once the second index reports that all four exist,
        both indices must return the same per-object byte counts and a total
        of 10 bytes.
        """
        writerIndex = CumulusNative.PersistentCacheIndex(
            self.sharedState.newView(), callbackScheduler)

        readerIndex = CumulusNative.PersistentCacheIndex(
            self.sharedState.newView(), callbackScheduler)

        # page1 <- bigvec1 <- page2 <- bigvec2: each entry is registered with
        # a set naming the previous one.
        writerIndex.addPage(sha1("page1"), HashSet(), 1, sha1(""))
        writerIndex.addBigvec(
            sha1("bigvec1"), HashSet() + sha1("page1"), 2, sha1(""))
        writerIndex.addPage(
            sha1("page2"), HashSet() + sha1("bigvec1"), 3, sha1(""))
        writerIndex.addBigvec(
            sha1("bigvec2"), HashSet() + sha1("page2"), 4, sha1(""))

        self.assertEqual(writerIndex.totalBytesInCache(), 10)

        def seesEverything():
            # True only once the reader reports all four objects.
            return bool(
                readerIndex.pageExists(sha1("page1"))
                and readerIndex.pageExists(sha1("page2"))
                and readerIndex.bigvecExists(sha1("bigvec1"))
                and readerIndex.bigvecExists(sha1("bigvec2")))

        self.waitForSync(seesEverything)

        for index in (writerIndex, readerIndex):
            expectations = (
                (index.pageBytecount, "page1", 1),
                (index.bigvecBytecount, "bigvec1", 2),
                (index.pageBytecount, "page2", 3),
                (index.bigvecBytecount, "bigvec2", 4),
            )
            for bytecountOf, objectName, expectedBytes in expectations:
                self.assertEqual(bytecountOf(sha1(objectName)), expectedBytes)
            self.assertEqual(index.totalBytesInCache(), 10)
Example #8
0
    def __init__(self,
                 ownAddress,
                 channelListener,
                 channelFactory,
                 eventHandler,
                 callbackScheduler,
                 diagnosticsDir,
                 config,
                 viewFactory,
                 s3InterfaceFactory=None,
                 objectStore=None):
        """Construct the service: VDM, caches, worker and optional dataset loader.

        Args:
            ownAddress: stored on the instance as ``self.ownAddress``.
            channelListener: must expose exactly two ``ports``; its ``start``
                method becomes the target of ``cumulusChannelFactoryThread``.
            channelFactory: stored on the instance.
            eventHandler: stored and passed on to the worker.
            callbackScheduler: scheduler passed to the native objects created
                here.
            diagnosticsDir: passed to the worker configuration; a falsy value
                is replaced by ``""``.
            config: source of the ``cumulus*`` settings read below (RAM cache
                sizes, thread count, tcmalloc tracking, disk cache location
                and size, checkpoint interval).
            viewFactory: used for ``CumulusActiveMachines`` and to create the
                view backing the persistent cache index.
            s3InterfaceFactory: optional; when truthy a
                ``PythonIoTaskService`` is created and started.
            objectStore: optional; passed to the ``PythonIoTaskService``.
        """
        Stoppable.Stoppable.__init__(self)

        # Acquire a machineId randomly, using uuid.
        self.machineId = CumulusNative.MachineId(
            Hash.Hash.sha1(str(uuid.uuid4()))
            )

        self.ownAddress = ownAddress
        self.callbackScheduler = callbackScheduler
        self.viewFactory = viewFactory
        self.s3InterfaceFactory = s3InterfaceFactory
        self.objectStore = objectStore
        self.threadsStarted_ = False
        self.connectedMachines = set()
        self.connectingMachines = set()  # machines we are in the process of connecting to
        self.droppedMachineIds = set()
        self.lock = threading.RLock()
        # The *MB config values are scaled by 1024*1024 (MB -> bytes).
        self.cumulusMaxRamCacheSizeOverride = config.cumulusMaxRamCacheMB * 1024*1024
        self.cumulusVectorRamCacheSizeOverride = config.cumulusVectorRamCacheMB * 1024*1024
        self.cumulusThreadCountOverride = config.cumulusServiceThreadCount
        self.cumulusTrackTcmalloc = config.cumulusTrackTcmalloc
        self.eventHandler = eventHandler

        self.reconnectPersistentCacheIndexViewThreads = []

        # A configured subdirectory is joined onto the base disk cache
        # directory and sets the deletion-on-teardown flag; without one the
        # base directory is used and the flag is cleared.
        if config.cumulusDiskCacheStorageSubdirectory is not None:
            self.cumulusDiskCacheWantsDeletionOnTeardown = True
            self.cumulusDiskCacheStorageDir = os.path.join(
                config.cumulusDiskCacheStorageDir,
                config.cumulusDiskCacheStorageSubdirectory
                )
        else:
            self.cumulusDiskCacheWantsDeletionOnTeardown = False
            self.cumulusDiskCacheStorageDir = config.cumulusDiskCacheStorageDir

        self._stopEvent = threading.Event()

        self._channelListener = channelListener
        # The listener is required to expose exactly two ports.
        assert len(self._channelListener.ports) == 2
        self._channelFactory = channelFactory

        Runtime.initialize()
        ModuleImporter.initialize()

        self.cumulusActiveMachines = CumulusActiveMachines.CumulusActiveMachines(
            self.viewFactory
            )

        # The thread object is created here but not started in this
        # constructor.
        self.cumulusChannelFactoryThread = ManagedThread.ManagedThread(
            target=self._channelListener.start
            )

        self.vdm = VectorDataManager.constructVDM(
            callbackScheduler,
            self.cumulusVectorRamCacheSizeOverride,
            self.cumulusMaxRamCacheSizeOverride
            )

        if self.cumulusTrackTcmalloc:
            self.vdm.getMemoryManager().enableCountTcMallocMemoryAsEcMemory()

        self.persistentCacheIndex = CumulusNative.PersistentCacheIndex(
            viewFactory.createView(retrySeconds=10.0, numRetries=10),
            callbackScheduler
            )

        self.vdm.setPersistentCacheIndex(self.persistentCacheIndex)

        # Runs before the disk-backed offline cache below is created.
        self.deleteCumulusDiskCacheIfNecessary()

        self.offlineCache = CumulusNative.DiskOfflineCache(
            callbackScheduler,
            self.cumulusDiskCacheStorageDir,
            config.cumulusDiskCacheStorageMB * 1024 * 1024,
            config.cumulusDiskCacheStorageFileCount
            )

        # If the "s3InterfaceFactory" is not in-memory, we use real
        # out-of-process python. It would be better if this were more explicit.
        outOfProcess = self.s3InterfaceFactory is not None and self.s3InterfaceFactory.isCompatibleWithOutOfProcessDownloadPool

        self.outOfProcessPythonTasks = OutOfProcessPythonTasks.OutOfProcessPythonTasks(outOfProcess=outOfProcess)

        self.vdm.initializeOutOfProcessPythonTasks(self.outOfProcessPythonTasks.nativeTasks)

        # An interval of 0 selects the None checkpoint policy; any other value
        # selects a periodic policy with that interval.
        # NOTE(review): the meaning of the second Periodic argument
        # (1024 * 1024) is not visible here - confirm against the native API.
        checkpointInterval = config.cumulusCheckpointIntervalSeconds
        if checkpointInterval == 0:
            checkpointPolicy = CumulusNative.CumulusCheckpointPolicy.None()
        else:
            checkpointPolicy = CumulusNative.CumulusCheckpointPolicy.Periodic(
                checkpointInterval,
                1024 * 1024
                )

        self.cumulusWorker = self.constructCumlusWorker(
            callbackScheduler,
            CumulusNative.CumulusWorkerConfiguration(
                self.machineId,
                self.cumulusThreadCountOverride,
                checkpointPolicy,
                ExecutionContext.createContextConfiguration(),
                diagnosticsDir or ""
                ),
            self.vdm,
            self.offlineCache,
            eventHandler
            )

        # The dataset load service exists only when an s3 interface factory
        # was supplied; it is fed from the worker's external dataset request
        # channel.
        self.datasetLoadService = None
        if self.s3InterfaceFactory:
            externalDatasetChannel = self.cumulusWorker.getExternalDatasetRequestChannel(
                callbackScheduler
                )
            self.datasetLoadService = PythonIoTaskService.PythonIoTaskService(
                self.s3InterfaceFactory,
                self.objectStore,
                self.vdm,
                externalDatasetChannel.makeQueuelike(callbackScheduler)
                )

        self.cumulusWorker.startComputations()

        if self.datasetLoadService:
            self.datasetLoadService.startService()
Example #9
0
    def __init__(self, ownAddress, channelListener, channelFactory,
                 eventHandler, callbackScheduler, diagnosticsDir, config,
                 viewFactory):
        """Construct the service: VDM, persistent/offline caches and worker.

        Args:
            ownAddress: stored on the instance as ``self.ownAddress``.
            channelListener: must expose exactly two ``ports``; its ``start``
                method becomes the target of ``cumulusChannelFactoryThread``.
            channelFactory: stored on the instance.
            eventHandler: passed on to the worker.
            callbackScheduler: scheduler passed to the native objects created
                here.
            diagnosticsDir: passed to the worker configuration; a falsy value
                is replaced by ``""``.
            config: source of the ``cumulus*`` settings read below (RAM cache
                sizes, thread count, tcmalloc tracking, disk cache location
                and size, checkpoint interval).
            viewFactory: used for ``CumulusActiveMachines`` and to create the
                view backing the persistent cache index.
        """
        Stoppable.Stoppable.__init__(self)

        # Acquire a machineId randomly, using uuid.
        self.machineId = CumulusNative.MachineId(
            Hash.Hash.sha1(str(uuid.uuid4())))

        self.ownAddress = ownAddress
        self.callbackScheduler = callbackScheduler
        self.viewFactory = viewFactory
        self.threadsStarted_ = False
        self.connectedMachines = set()
        self.connectingMachines = set(
        )  # machines we are in the process of connecting to
        self.droppedMachineIds = set()
        self.lock = threading.RLock()
        # The *MB config values are scaled by 1024 * 1024 (MB -> bytes).
        self.cumulusMaxRamCacheSizeOverride = config.cumulusMaxRamCacheMB * 1024 * 1024
        self.cumulusVectorRamCacheSizeOverride = config.cumulusVectorRamCacheMB * 1024 * 1024
        self.cumulusThreadCountOverride = config.cumulusServiceThreadCount
        self.cumulusTrackTcMalloc = config.cumulusTrackTcmalloc

        self.reconnectPersistentCacheIndexViewThreads = []

        # A configured subdirectory is joined onto the base disk cache
        # directory and sets the deletion-on-teardown flag; without one the
        # base directory is used and the flag is cleared.
        if config.cumulusDiskCacheStorageSubdirectory is not None:
            self.cumulusDiskCacheWantsDeletionOnTeardown = True
            self.cumulusDiskCacheStorageDir = os.path.join(
                config.cumulusDiskCacheStorageDir,
                config.cumulusDiskCacheStorageSubdirectory)
        else:
            self.cumulusDiskCacheWantsDeletionOnTeardown = False
            self.cumulusDiskCacheStorageDir = config.cumulusDiskCacheStorageDir

        # Sizes are converted back to MB for the log message.
        logging.info(
            "Creating a CumulusService with ram cache of %s / %s MB and %s threads",
            self.cumulusVectorRamCacheSizeOverride / 1024.0 / 1024.0,
            self.cumulusMaxRamCacheSizeOverride / 1024.0 / 1024.0,
            self.cumulusThreadCountOverride)

        self._stopEvent = threading.Event()

        self._channelListener = channelListener
        # The listener is required to expose exactly two ports.
        assert len(self._channelListener.ports) == 2
        self._channelFactory = channelFactory

        Runtime.initialize()
        ModuleImporter.initialize()

        self.cumulusActiveMachines = CumulusActiveMachines.CumulusActiveMachines(
            self.viewFactory)

        # The thread object is created here but not started in this
        # constructor.
        self.cumulusChannelFactoryThread = ManagedThread.ManagedThread(
            target=self._channelListener.start)

        self.vdm = VectorDataManager.constructVDM(
            callbackScheduler, self.cumulusVectorRamCacheSizeOverride,
            self.cumulusMaxRamCacheSizeOverride)

        if self.cumulusTrackTcMalloc:
            logging.info(
                "CumulusService enabling track-tc-malloc memory with a max cache of %s MB",
                self.cumulusMaxRamCacheSizeOverride / 1024 / 1024.0)
            self.vdm.getMemoryManager().enableCountTcMallocMemoryAsEcMemory()

        self.persistentCacheIndex = CumulusNative.PersistentCacheIndex(
            viewFactory.createView(retrySeconds=10.0, numRetries=10),
            callbackScheduler)

        self.vdm.setPersistentCacheIndex(self.persistentCacheIndex)

        # Runs before the disk-backed offline cache below is created.
        self.deleteCumulusDiskCacheIfNecessary()

        self.offlineCache = CumulusNative.DiskOfflineCache(
            callbackScheduler, self.cumulusDiskCacheStorageDir,
            config.cumulusDiskCacheStorageMB * 1024 * 1024,
            config.cumulusDiskCacheStorageFileCount)

        # An interval of 0 selects the None checkpoint policy; any other value
        # selects a periodic policy with that interval.
        # NOTE(review): the meaning of the second Periodic argument
        # (1024 * 1024) is not visible here - confirm against the native API.
        checkpointInterval = config.cumulusCheckpointIntervalSeconds
        if checkpointInterval == 0:
            checkpointPolicy = CumulusNative.CumulusCheckpointPolicy.None ()
        else:
            checkpointPolicy = CumulusNative.CumulusCheckpointPolicy.Periodic(
                checkpointInterval, 1024 * 1024)

        self.cumulusWorker = self.constructCumlusWorker(
            callbackScheduler,
            CumulusNative.CumulusWorkerConfiguration(
                self.machineId,
                self.cumulusThreadCountOverride, checkpointPolicy,
                ExecutionContext.createContextConfiguration(), diagnosticsDir
                or ""), self.vdm, self.offlineCache, eventHandler)

        # NOTE(review): the dataset load service wiring below is disabled. It
        # refers to a `settings` object that is not defined in this
        # constructor, so it cannot simply be uncommented.
        #externalDatasetChannel = self.cumulusWorker.getExternalDatasetRequestChannel(
        #callbackScheduler
        #)
        #self.datasetLoadService = PythonIoTaskService.PythonIoTaskService(
        #settings.s3InterfaceFactory,
        #settings.objectStore,
        #self.vdm,
        #externalDatasetChannel.makeQueuelike(callbackScheduler)
        #)

        self.cumulusWorker.startComputations()
 def test_orphanedPageIsCollected(self):
     """A lone page added to the index is reported as one invalid object.

     Adds a single page with an empty set and nothing else, then checks that
     ``computeInvalidObjects`` returns exactly one entry.
     """
     cppView1 = CumulusNative.PersistentCacheIndex(
         self.sharedState.newView(), callbackScheduler)
     cppView1.addPage(sha1("page1"), HashSet(), 1, sha1(""))
     # assertEqual shows the actual count when the check fails.
     self.assertEqual(len(cppView1.computeInvalidObjects()), 1)