Example #1
0
    def onWorkerAdd(self, ip, ports, machineIdAsString):
        """Handle the announcement of a newly visible worker.

        Connections are made in only one direction: the peer with the
        smaller machineId waits for the larger peer to dial in, so we only
        initiate a connection when the remote id is greater than ours.
        """
        remoteId = CumulusNative.MachineId(Hash.Hash.stringToHash(machineIdAsString))

        if remoteId <= self.machineId:
            # the remote side is responsible for connecting to us
            logging.info("Worker %s detected worker %s, and waiting for incoming connection",
                         self.machineId,
                         remoteId)
            return

        # a fresh guid identifies this particular connection attempt
        connectionGuid = Hash.Hash.sha1(str(uuid.uuid4()))

        logging.info(
            "Worker %s detected worker %s and initiating connection with guid %s",
            self.machineId,
            remoteId,
            connectionGuid
            )

        with self.lock:
            # record the in-flight attempt so other code can see we are connecting
            self.connectingMachines.add(remoteId)

        ManagedThread.ManagedThread(
            target=self.onWorkerAdd2,
            args=(remoteId, ip, ports, connectionGuid)
            ).start()
Example #2
0
 def logBadUforaVersionOnChannel(self, version):
     """Log a diagnostic for a handshake 'version' payload that did not match
     our ufora version.

     Some peers mistakenly send a serialized MachineId where the version is
     expected. If 'version' deserializes into a MachineId we log it as such;
     otherwise we log the raw payload. Never raises.
     """
     try:
         anId = CumulusNative.MachineId(Hash.Hash(0))
         anId.__setstate__(version)
         logging.error(
             "CumulusService %s received a bad version message that is, " \
                 "in fact, a machineId: %s",
             self.machineId,
             anId
             )
     except Exception:
         # 'version' was not a serialized MachineId; log the raw bytes.
         # Narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit
         # are not silently swallowed.
         logging.error(
             "CumulusService %s received a bad version message that is not a machineId: %s",
             self.machineId, repr(version))
Example #3
0
    def onWorkerDrop(self, machineIdAsString):
        """Handle notification that a worker has left the cluster.

        Records the machine as disconnected, removes it from the desired and
        connected sets, fires onMachineCountWentToZero when the last
        connected machine disappears, and finally tells the cumulus client
        to drop the machine. All bookkeeping happens under self.lock_.
        """
        with self.lock_:
            machineId = CumulusNative.MachineId(
                HashNative.Hash.stringToHash(machineIdAsString))

            self.disconnectedMachines_.add(machineId)

            # set.discard is a no-op when the element is absent, so no
            # membership check is needed (removes the redundant 'in' test)
            self.desiredMachines_.discard(machineId)

            if machineId not in self.connectedMachines_:
                return

            self.connectedMachines_.discard(machineId)

            if len(self.connectedMachines_) == 0:
                self.onMachineCountWentToZero()

            self.cumulusClient.dropMachine(machineId)
Example #4
0
    def onWorkerAdd(self, ip, ports, machineIdAsString):
        """Spawn a background thread that connects this client to a newly
        announced worker, tracking it in connectingThreads_."""
        machineId = CumulusNative.MachineId(
            HashNative.Hash.stringToHash(machineIdAsString))

        with self.lock_:
            # once torn down we must not spawn new connection threads
            if self.isTornDown_:
                return

            logging.info("CumulusClient %s preparing to connect to %s",
                         self.cumulusClientId, machineId)
            self.desiredMachines_.add(machineId)

            connectThread = ManagedThread.ManagedThread(
                target=self.addDesiredMachine, args=(machineId, ip, ports))

            # track the new thread, then prune any that are no longer alive
            self.connectingThreads_.append(connectThread)
            self.connectingThreads_ = [
                t for t in self.connectingThreads_ if t.isAlive()
            ]

            connectThread.start()
Example #5
0
    def isOwnHashInHandshakeMessage(self, message):
        """Return True iff 'message' deserializes to our own MachineId.

        Used during the channel handshake to verify that an incoming
        connection was really intended for this machine. Logs an error and
        returns False for a missing, malformed, or mismatched message.
        """
        if message is None:
            logging.error(
                "CumulusService %s didn't receive own Id in handshake.",
                self.machineId)
            return False

        try:
            machineId = CumulusNative.MachineId(Hash.Hash(0))
            machineId.__setstate__(message)
        except Exception:
            # Malformed payload: substitute a string marker so the mismatch
            # branch below reports it. Narrowed from a bare 'except:' so
            # KeyboardInterrupt/SystemExit are not swallowed.
            machineId = "not a valid machine ID"

        if isinstance(machineId, str) or machineId != self.machineId:
            logging.error(
                "CumulusWorker %s received connection intended for another machine (%s). %s != %s",
                self.machineId, machineId, repr(message),
                repr(self.machineId.__getstate__()))
            return False
        return True
Example #6
0
    def onWorkerDrop(self, machineIdAsString):
        """Handle notification that a worker left the cluster, updating the
        connection bookkeeping and informing the cumulus worker."""
        machineId = CumulusNative.MachineId(Hash.Hash.stringToHash(machineIdAsString))

        # drop notifications about ourselves are ignored
        if machineId == self.machineId:
            return

        logging.info("CumulusService %s dropped worker %s", self.machineId, machineId)

        try:
            with self.lock:
                # remember whether we were actually connected before mutating
                hadMachine = machineId in self.connectedMachines
                self.connectingMachines.discard(machineId)
                self.connectedMachines.discard(machineId)
                self.droppedMachineIds.add(machineId)

            # only machines we had connected need to be dropped in the worker;
            # call outside the lock, matching the original ordering
            if hadMachine:
                self.cumulusWorker.dropMachine(machineId)
        except:
            logging.error("Failed to drop worker: %s", traceback.format_exc())
            raise
def machineId(ix, seed = None):
    """Build a deterministic MachineId from integer 'ix'.

    When 'seed' is given, its sha1 hash is folded in so distinct seeds
    yield distinct machine ids for the same index.
    """
    baseHash = HashNative.Hash(ix)
    if seed is None:
        return CumulusNative.MachineId(baseHash)
    return CumulusNative.MachineId(baseHash + HashNative.Hash.sha1(seed))
Example #8
0
def machineId(ix):
    """Wrap integer 'ix' in a MachineId via its Hash representation."""
    ixHash = HashNative.Hash(ix)
    return CumulusNative.MachineId(ixHash)
Example #9
0
    def __init__(self,
                 ownAddress,
                 channelListener,
                 channelFactory,
                 eventHandler,
                 callbackScheduler,
                 diagnosticsDir,
                 config,
                 viewFactory,
                 s3InterfaceFactory=None,
                 objectStore=None):
        """Construct the cumulus worker service.

        Picks a random machineId, builds the vector data manager, the
        persistent cache index, the disk offline cache and the cumulus
        worker from 'config', then starts computations (plus the dataset
        load service when an s3InterfaceFactory is supplied).
        """
        Stoppable.Stoppable.__init__(self)

        #acquire a machineId randomly, using uuid
        self.machineId = CumulusNative.MachineId(
            Hash.Hash.sha1(str(uuid.uuid4()))
            )

        self.ownAddress = ownAddress
        self.callbackScheduler = callbackScheduler
        self.viewFactory = viewFactory
        self.s3InterfaceFactory = s3InterfaceFactory
        self.objectStore = objectStore
        self.threadsStarted_ = False
        self.connectedMachines = set()
        self.connectingMachines = set()  # machines we are in the process of connecting to
        self.droppedMachineIds = set()
        self.lock = threading.RLock()
        # cache sizes come from config in MB; convert to bytes here
        self.cumulusMaxRamCacheSizeOverride = config.cumulusMaxRamCacheMB * 1024*1024
        self.cumulusVectorRamCacheSizeOverride = config.cumulusVectorRamCacheMB * 1024*1024
        self.cumulusThreadCountOverride = config.cumulusServiceThreadCount
        self.cumulusTrackTcmalloc = config.cumulusTrackTcmalloc
        self.eventHandler = eventHandler

        self.reconnectPersistentCacheIndexViewThreads = []

        # a configured subdirectory means the disk cache is private to this
        # service instance and should be deleted when we tear down
        if config.cumulusDiskCacheStorageSubdirectory is not None:
            self.cumulusDiskCacheWantsDeletionOnTeardown = True
            self.cumulusDiskCacheStorageDir = os.path.join(
                config.cumulusDiskCacheStorageDir,
                config.cumulusDiskCacheStorageSubdirectory
                )
        else:
            self.cumulusDiskCacheWantsDeletionOnTeardown = False
            self.cumulusDiskCacheStorageDir = config.cumulusDiskCacheStorageDir

        self._stopEvent = threading.Event()

        self._channelListener = channelListener
        # exactly two listening ports are expected -- presumably one per
        # channel type; TODO confirm against the listener's construction site
        assert len(self._channelListener.ports) == 2
        self._channelFactory = channelFactory

        Runtime.initialize()
        ModuleImporter.initialize()

        self.cumulusActiveMachines = CumulusActiveMachines.CumulusActiveMachines(
            self.viewFactory
            )

        # thread object for accepting incoming channels; note it is created
        # here but not started in this constructor
        self.cumulusChannelFactoryThread = ManagedThread.ManagedThread(
            target=self._channelListener.start
            )

        self.vdm = VectorDataManager.constructVDM(
            callbackScheduler,
            self.cumulusVectorRamCacheSizeOverride,
            self.cumulusMaxRamCacheSizeOverride
            )

        if self.cumulusTrackTcmalloc:
            self.vdm.getMemoryManager().enableCountTcMallocMemoryAsEcMemory()

        self.persistentCacheIndex = CumulusNative.PersistentCacheIndex(
            viewFactory.createView(retrySeconds=10.0, numRetries=10),
            callbackScheduler
            )

        self.vdm.setPersistentCacheIndex(self.persistentCacheIndex)

        # clear any stale on-disk cache before constructing the new one
        self.deleteCumulusDiskCacheIfNecessary()

        self.offlineCache = CumulusNative.DiskOfflineCache(
            callbackScheduler,
            self.cumulusDiskCacheStorageDir,
            config.cumulusDiskCacheStorageMB * 1024 * 1024,
            config.cumulusDiskCacheStorageFileCount
            )

        #If the "s3InterfaceFactory" is not in-memory, we use real out of process python.
        #it would be better if this were more explicit
        outOfProcess = self.s3InterfaceFactory is not None and self.s3InterfaceFactory.isCompatibleWithOutOfProcessDownloadPool

        self.outOfProcessPythonTasks = OutOfProcessPythonTasks.OutOfProcessPythonTasks(outOfProcess=outOfProcess)

        self.vdm.initializeOutOfProcessPythonTasks(self.outOfProcessPythonTasks.nativeTasks)

        # an interval of 0 disables periodic checkpointing entirely
        checkpointInterval = config.cumulusCheckpointIntervalSeconds
        if checkpointInterval == 0:
            checkpointPolicy = CumulusNative.CumulusCheckpointPolicy.None()
        else:
            checkpointPolicy = CumulusNative.CumulusCheckpointPolicy.Periodic(
                checkpointInterval,
                1024 * 1024
                )

        self.cumulusWorker = self.constructCumlusWorker(
            callbackScheduler,
            CumulusNative.CumulusWorkerConfiguration(
                self.machineId,
                self.cumulusThreadCountOverride,
                checkpointPolicy,
                ExecutionContext.createContextConfiguration(),
                diagnosticsDir or ""
                ),
            self.vdm,
            self.offlineCache,
            eventHandler
            )

        # the dataset load service only exists when an s3 interface is available
        self.datasetLoadService = None
        if self.s3InterfaceFactory:
            externalDatasetChannel = self.cumulusWorker.getExternalDatasetRequestChannel(
                callbackScheduler
                )
            self.datasetLoadService = PythonIoTaskService.PythonIoTaskService(
                self.s3InterfaceFactory,
                self.objectStore,
                self.vdm,
                externalDatasetChannel.makeQueuelike(callbackScheduler)
                )

        self.cumulusWorker.startComputations()

        if self.datasetLoadService:
            self.datasetLoadService.startService()
Example #10
0
    def doChannelHandshake(self, channel):
        """Run the accepting side of the connection handshake on 'channel'.

        Reads, in order: the peer's ufora version, an echo of our own
        machineId, the peer's identity (client-or-machine), and a connection
        GUID. On success, writes back the builtin module hash and registers
        the channel with the listener under (clientOrMachine, hashGuid).
        Any failure disconnects the channel.
        """
        try:
            logging.debug("Worker %s beginning channel handshake", self.machineId)
            version = channel.getTimeout(HANDSHAKE_TIMEOUT)
            if version is None:
                # NOTE(review): there is no early return here -- a None
                # version falls through and is rejected by the version
                # comparison below, so we end up logging twice. Confirm
                # whether an immediate disconnect/return was intended.
                logging.error(
                    "CAN'T ACCEPT CONNECTION!\n"
                    "CumulusService %s couldn't read client version within the configured timeout",
                    self.machineId
                    )

            if version != ufora.version:
                self.logBadUforaVersionOnChannel(version)
                channel.disconnect()
                return

            logging.debug(
                "CumulusService %s accepted connection from client with version %s",
                self.machineId,
                version
                )

            # the peer must echo our own machineId to prove the connection
            # was intended for this machine
            msgThatShouldBeMyOwnHash = channel.getTimeout(HANDSHAKE_TIMEOUT)
            if not self.isOwnHashInHandshakeMessage(msgThatShouldBeMyOwnHash):
                channel.disconnect()
                return

            msg = channel.getTimeout(HANDSHAKE_TIMEOUT)
            if msg is None:
                logging.error(
                    "CAN'T ACCEPT CONNECTION!\n"
                    "Worker %s didn't received remote machine ID during handshake",
                    self.machineId
                    )
                channel.disconnect()
                return

            # deserialize the peer's identity into a fresh client-or-machine
            # wrapper (constructed with a placeholder zero hash)
            clientOrMachine = CumulusNative.CumulusClientOrMachine.Machine(
                CumulusNative.MachineId(
                    Hash.Hash(0)
                    )
                )
            clientOrMachine.__setstate__(msg)

            hashGuid = Hash.Hash(0)
            msg = channel.getTimeout(HANDSHAKE_TIMEOUT)
            if msg is None:
                logging.error(
                    "CAN'T ACCEPT CONNECTION!\n"
                    "Worker %s didn't received handshake GUID",
                    self.machineId
                    )
                channel.disconnect()
                return

            hashGuid.__setstate__(msg)
            logging.debug(
                "Worker %s accepted connection with guid %s from %s",
                self.machineId,
                hashGuid,
                clientOrMachine
                )

            # acknowledge the handshake with our builtin module hash
            channel.write(
                ModuleImporter.builtinModuleImplVal().hash.__getstate__()
                )

            with self.lock:
                self._channelListener.setGroupIdForAcceptedChannel(
                    channel,
                    (clientOrMachine, hashGuid)
                    )

            logging.debug("CumulusService %s added a channel to group %s",
                          self.machineId,
                          (clientOrMachine, hashGuid))
        except:
            logging.error("FAILED TO PROCESS INCOMING CONNECTION: %s", traceback.format_exc())
            channel.disconnect()
Example #11
0
    def __init__(self, ownAddress, channelListener, channelFactory,
                 eventHandler, callbackScheduler, diagnosticsDir, config,
                 viewFactory):
        """Construct the cumulus service.

        Picks a random machineId, builds the vector data manager, the
        persistent cache index, the disk offline cache and the cumulus
        worker from 'config', then starts computations.
        """
        Stoppable.Stoppable.__init__(self)

        #acquire a machineId randomly, using uuid
        self.machineId = CumulusNative.MachineId(
            Hash.Hash.sha1(str(uuid.uuid4())))

        self.ownAddress = ownAddress
        self.callbackScheduler = callbackScheduler
        self.viewFactory = viewFactory
        self.threadsStarted_ = False
        self.connectedMachines = set()
        self.connectingMachines = set(
        )  # machines we are in the process of connecting to
        self.droppedMachineIds = set()
        self.lock = threading.RLock()
        # cache sizes come from config in MB; convert to bytes here
        self.cumulusMaxRamCacheSizeOverride = config.cumulusMaxRamCacheMB * 1024 * 1024
        self.cumulusVectorRamCacheSizeOverride = config.cumulusVectorRamCacheMB * 1024 * 1024
        self.cumulusThreadCountOverride = config.cumulusServiceThreadCount
        self.cumulusTrackTcMalloc = config.cumulusTrackTcmalloc

        self.reconnectPersistentCacheIndexViewThreads = []

        # a configured subdirectory means the disk cache is private to this
        # service instance and should be deleted when we tear down
        if config.cumulusDiskCacheStorageSubdirectory is not None:
            self.cumulusDiskCacheWantsDeletionOnTeardown = True
            self.cumulusDiskCacheStorageDir = os.path.join(
                config.cumulusDiskCacheStorageDir,
                config.cumulusDiskCacheStorageSubdirectory)
        else:
            self.cumulusDiskCacheWantsDeletionOnTeardown = False
            self.cumulusDiskCacheStorageDir = config.cumulusDiskCacheStorageDir

        logging.info(
            "Creating a CumulusService with ram cache of %s / %s MB and %s threads",
            self.cumulusVectorRamCacheSizeOverride / 1024.0 / 1024.0,
            self.cumulusMaxRamCacheSizeOverride / 1024.0 / 1024.0,
            self.cumulusThreadCountOverride)

        self._stopEvent = threading.Event()

        self._channelListener = channelListener
        # exactly two listening ports are expected -- presumably one per
        # channel type; TODO confirm against the listener's construction site
        assert len(self._channelListener.ports) == 2
        self._channelFactory = channelFactory

        Runtime.initialize()
        ModuleImporter.initialize()

        self.cumulusActiveMachines = CumulusActiveMachines.CumulusActiveMachines(
            self.viewFactory)

        # thread object for accepting incoming channels; note it is created
        # here but not started in this constructor
        self.cumulusChannelFactoryThread = ManagedThread.ManagedThread(
            target=self._channelListener.start)

        self.vdm = VectorDataManager.constructVDM(
            callbackScheduler, self.cumulusVectorRamCacheSizeOverride,
            self.cumulusMaxRamCacheSizeOverride)

        if self.cumulusTrackTcMalloc:
            logging.info(
                "CumulusService enabling track-tc-malloc memory with a max cache of %s MB",
                self.cumulusMaxRamCacheSizeOverride / 1024 / 1024.0)
            self.vdm.getMemoryManager().enableCountTcMallocMemoryAsEcMemory()

        self.persistentCacheIndex = CumulusNative.PersistentCacheIndex(
            viewFactory.createView(retrySeconds=10.0, numRetries=10),
            callbackScheduler)

        self.vdm.setPersistentCacheIndex(self.persistentCacheIndex)

        # clear any stale on-disk cache before constructing the new one
        self.deleteCumulusDiskCacheIfNecessary()

        self.offlineCache = CumulusNative.DiskOfflineCache(
            callbackScheduler, self.cumulusDiskCacheStorageDir,
            config.cumulusDiskCacheStorageMB * 1024 * 1024,
            config.cumulusDiskCacheStorageFileCount)

        # an interval of 0 disables periodic checkpointing entirely
        checkpointInterval = config.cumulusCheckpointIntervalSeconds
        if checkpointInterval == 0:
            checkpointPolicy = CumulusNative.CumulusCheckpointPolicy.None ()
        else:
            checkpointPolicy = CumulusNative.CumulusCheckpointPolicy.Periodic(
                checkpointInterval, 1024 * 1024)

        self.cumulusWorker = self.constructCumlusWorker(
            callbackScheduler,
            CumulusNative.CumulusWorkerConfiguration(
                self.machineId,
                self.cumulusThreadCountOverride, checkpointPolicy,
                ExecutionContext.createContextConfiguration(), diagnosticsDir
                or ""), self.vdm, self.offlineCache, eventHandler)

        #externalDatasetChannel = self.cumulusWorker.getExternalDatasetRequestChannel(
        #callbackScheduler
        #)
        #self.datasetLoadService = PythonIoTaskService.PythonIoTaskService(
        #settings.s3InterfaceFactory,
        #settings.objectStore,
        #self.vdm,
        #externalDatasetChannel.makeQueuelike(callbackScheduler)
        #)

        self.cumulusWorker.startComputations()