Example 1
 def drain(self):
     if self.isDrainable():
         self.isOpen = False
         self.log("Draining...")
         drained = parallelExec(self._drainActor, self.actors.values())
         self.log("Drained.")
         gevent.spawn_later(2, _stopAllActors)
         return True
     else:
         self.log("Cannot drain, some Actors are not drainable.")
         return False
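
These snippets lean on a parallelExec helper to run a callable against many items concurrently. As a rough mental model only (not the project's actual implementation), a gevent-based parallel map along the following lines matches how the callers use it: each input yields one result, and a raised exception is returned in place of a result, which is why callers filter the output.

# Hypothetical sketch of a gevent-based parallelExec, for illustration only.
from gevent.pool import Pool

def parallelExec( f, items, maxConcurrent = None ):
    def _wrapped( item ):
        try:
            return f( item )
        except Exception as e:
            # Failures are returned in-line with the results, so callers can
            # filter them out (e.g. "if type( x ) is dict" in the draining loop below).
            return e
    pool = Pool( size = maxConcurrent )
    return list( pool.imap_unordered( _wrapped, items ) )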
Example 2
    def _svc_instance_draining(self):
        while not self.stopEvent.wait(60 * 1):
            now = int(time.time())

            # First we evaluate actors with a time_to_drain.
            for uid, info in self.actorInfo.items():
                if info['time_to_drain'] is None:
                    continue

                # This assumes the actor is running isolated; if it is not, the
                # entire actor host instance will be drained anyway.
                if now > info['start'] + info['time_to_drain']:
                    if self._isDrainable(info['instance']):
                        self._log(
                            'Actor %s has reached time_to_drain, draining.' %
                            info['name'])
                        self._doDrainInstance(info['instance'])
                        self.isInstanceChanged.set()
                    else:
                        self._log(
                            'Actor %s has reached time_to_drain, but instance marked undrainable.'
                            % info['name'])

            # Then we look at the general draining case.
            currentMemory = psutil.virtual_memory()
            # We only start draining once usage exceeds the high memory watermark (80% by default).
            if currentMemory.percent < self.highMemWatermark:
                #self._log( "Memory usage at %s percent, nothing to do." % currentMemory.percent )
                continue
            self._log(
                "High memory watermark reached, trying to drain some instances."
            )
            now = time.time()
            drainable = [
                x for x in parallelExec(self._isDrainable, self.processes[:])
                if type(x) is dict
            ]
            self._log("Found %d instances available for draining." %
                      len(drainable))
            oldest = None
            for instance in drainable:
                if instance['p'] is not None:
                    if oldest is None:
                        oldest = instance
                    elif oldest['start'] > instance['start']:
                        oldest = instance

            # Drain the oldest if we have one.
            if oldest is not None:
                self._log('Trying to drain %s' % oldest['id'])
                # Remove all actors in that instance from the directory before draining.
                self._doDrainInstance(oldest)
                self.isInstanceChanged.set()
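
The draining loop above reads a handful of fields from self.actorInfo entries and from the instance records it considers for draining. The shapes below are inferred from those accesses and are illustrative only, not an authoritative schema.

# Illustrative record shapes, inferred from the keys accessed in the loop above.
instance_record = {
    'id' : 'instance-uuid',   # logged when the oldest instance is drained
    'start' : 1500000000,     # epoch seconds, used to pick the oldest drainable instance
    'p' : None,               # underlying process handle; entries with p = None are skipped
}

actor_info_entry = {
    'name' : 'MyActor',       # used in log messages
    'start' : 1500000000,     # epoch seconds when the actor was started
    'time_to_drain' : 3600,   # seconds after 'start' before draining, or None to never auto-drain
    'instance' : instance_record,  # the hosting instance passed to _isDrainable()
}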
Example 3
    def _svc_receiveOpsTasks( self ):
        z = self.opsSocket.getChild()
        while not self.stopEvent.wait( 0 ):
            data = z.recv()
            if data is not False and 'req' in data:
                action = data[ 'req' ]
                #start = time.time()
                #self._log( "Received new ops request: %s" % action )
                if 'keepalive' == action:
                    z.send( successMessage() )
                    if 'from' in data and data[ 'from' ] not in self.nodes:
                        self._log( "Discovered new node: %s" % data[ 'from' ] )
                        self._connectToNode( data[ 'from' ] )
                    for other in data.get( 'others', [] ):
                        if other not in self.nodes:
                            self._log( "Discovered new node: %s" % other )
                            self._connectToNode( other )
                elif 'start_actor' == action:
                    if not self._isPrivileged( data ):
                        z.send( errorMessage( 'unprivileged' ) )
                    elif 'actor_name' not in data or 'cat' not in data:
                        z.send( errorMessage( 'missing information to start actor' ) )
                    else:
                        actorName = data[ 'actor_name' ]
                        categories = data[ 'cat' ]
                        realm = data.get( 'realm', 'global' )
                        parameters = data.get( 'parameters', {} )
                        resources = data.get( 'resources', {} )
                        ident = data.get( 'ident', None )
                        trusted = data.get( 'trusted', [] )
                        n_concurrent = data.get( 'n_concurrent', 1 )
                        is_drainable = data.get( 'is_drainable', False )
                        time_to_drain = data.get( 'time_to_drain', None )
                        owner = data.get( 'owner', None )
                        isIsolated = data.get( 'isolated', False )
                        log_level = data.get( 'loglevel', None )
                        log_dest = data.get( 'logdest', None )
                        uid = str( uuid.uuid4() )
                        port = self._getAvailablePortForUid( uid )
                        instance = self._getInstanceForActor( isIsolated )
                        if instance is not None:
                            self._setActorMtd( uid, instance, actorName, realm, isIsolated, owner, parameters, resources, time_to_drain )
                            newMsg = instance[ 'socket' ].request( { 'req' : 'start_actor',
                                                                     'actor_name' : actorName,
                                                                     'realm' : realm,
                                                                     'uid' : uid,
                                                                     'ip' : self.ifaceIp4,
                                                                     'port' : port,
                                                                     'parameters' : parameters,
                                                                     'resources' : resources,
                                                                     'ident' : ident,
                                                                     'trusted' : trusted,
                                                                     'n_concurrent' : n_concurrent,
                                                                     'is_drainable' : is_drainable,
                                                                     'isolated' : isIsolated,
                                                                     'loglevel' : log_level,
                                                                     'logdest' : log_dest },
                                                                   timeout = 30 )
                        else:
                            newMsg = False

                        if isMessageSuccess( newMsg ):
                            self._log( "New actor loaded (isolation = %s, concurrent = %d), adding to directory" % ( isIsolated, n_concurrent ) )
                            # We always add a hardcoded special category _ACTORS/actorUid to provide a way for certain special actors
                            # to talk to specific instances directly, but this is discouraged.
                            with self.dirLock.writer():
                                self.reverseDir[ uid ] = 'tcp://%s:%d' % ( self.ifaceIp4, port )
                                self.directory.setdefault( realm,
                                                           PrefixDict() ).setdefault( '_ACTORS/%s' % ( uid, ),
                                                                                      {} )[ uid ] = 'tcp://%s:%d' % ( self.ifaceIp4,
                                                                                                                      port )
                                self.nonOptDir.setdefault( realm,
                                                           {} ).setdefault( '_ACTORS/%s' % ( uid, ),
                                                                            {} )[ uid ] = 'tcp://%s:%d' % ( self.ifaceIp4,
                                                                                                            port )
                                for category in categories:
                                    self.directory.setdefault( realm,
                                                               PrefixDict() ).setdefault( category,
                                                                                          {} )[ uid ] = 'tcp://%s:%d' % ( self.ifaceIp4,
                                                                                                                          port )
                                    self.nonOptDir.setdefault( realm,
                                                               {} ).setdefault( category,
                                                                                {} )[ uid ] = 'tcp://%s:%d' % ( self.ifaceIp4,
                                                                                                                port )
                            self.isActorChanged.set()
                        else:
                            self._logCritical( 'Error loading actor %s: %s.' % ( actorName, newMsg ) )
                            self._removeUidFromDirectory( uid )
                            self._addTombstone( uid )
                            self._removeInstanceIfIsolated( instance )
                        z.send( newMsg )
                elif 'kill_actor' == action:
                    if not self._isPrivileged( data ):
                        z.send( errorMessage( 'unprivileged' ) )
                    elif 'uid' not in data:
                        z.send( errorMessage( 'missing information to stop actor' ) )
                    else:
                        uids = data[ 'uid' ]
                        if not isinstance( uids, ( tuple, list ) ):
                            uids = ( uids, )

                        failed = []

                        for uid in uids:
                            instance = self.actorInfo.get( uid, {} ).get( 'instance', None )

                            if instance is None:
                                failed.append( errorMessage( 'actor not found' ) )
                            else:
                                newMsg = instance[ 'socket' ].request( { 'req' : 'kill_actor',
                                                                         'uid' : uid },
                                                                       timeout = 20 )
                                if not isMessageSuccess( newMsg ):
                                    self._log( "failed to kill actor %s: %s" % ( uid, str( newMsg ) ) )
                                    failed.append( newMsg )

                                if not self._removeUidFromDirectory( uid ):
                                    failed.append( errorMessage( 'error removing actor from directory after stop' ) )

                                self._addTombstone( uid )

                                self._removeInstanceIfIsolated( instance )

                        self.isActorChanged.set()

                        if 0 != len( failed ):
                            z.send( errorMessage( 'some actors failed to stop', failed ) )
                        else:
                            z.send( successMessage() )
                elif 'remove_actor' == action:
                    if not self._isPrivileged( data ):
                        z.send( errorMessage( 'unprivileged' ) )
                    elif 'uid' not in data:
                        z.send( errorMessage( 'missing information to remove actor' ) )
                    else:
                        uid = data[ 'uid' ]
                        instance = self.actorInfo.get( uid, {} ).get( 'instance', None )
                        if instance is not None and self._removeUidFromDirectory( uid ):
                            z.send( successMessage() )
                            self.isActorChanged.set()
                            self._removeInstanceIfIsolated( instance )
                        else:
                            z.send( errorMessage( 'actor to stop not found' ) )
                elif 'host_info' == action:
                    if self.lastHostInfo is None or time.time() >= self.lastHostInfoCheck + 10:
                        self.lastHostInfoCheck = time.time()
                        self.lastHostInfo = { 'info' : { 'cpu' : psutil.cpu_percent( percpu = True,
                                                                                     interval = 2 ),
                                                         'mem' : psutil.virtual_memory().percent } }
                    z.send( successMessage( self.lastHostInfo ) )
                elif 'get_full_dir' == action:
                    with self.dirLock.reader():
                        #z.send( successMessage( { 'realms' : { k : dict( v ) for k, v in self.directory.iteritems() }, 'reverse' : self.reverseDir } ), isSkipSanitization = True )
                        z.send( successMessage( { 'realms' : self.nonOptDir, 'reverse' : self.reverseDir, 'is_inited' : self.isInitialSyncDone } ), isSkipSanitization = True )
                elif 'get_dir' == action:
                    realm = data.get( 'realm', 'global' )
                    if 'cat' in data:
                        z.send( successMessage( data = { 'endpoints' : self._getDirectoryEntriesFor( realm, data[ 'cat' ] ) } ) )
                    else:
                        z.send( errorMessage( 'no category specified' ) )
                elif 'get_cats_under' == action:
                    realm = data.get( 'realm', 'global' )
                    if 'cat' in data:
                        with self.dirLock.reader():
                            z.send( successMessage( data = { 'categories' : [ x for x in self.directory.get( realm, PrefixDict() ).startswith( data[ 'cat' ] ) if x != data[ 'cat' ] ] } ) )
                    else:
                        z.send( errorMessage( 'no category specified' ) )
                elif 'get_nodes' == action:
                    nodeList = {}
                    for k in self.nodes.keys():
                        nodeList[ k ] = { 'last_seen' : self.nodes[ k ][ 'last_seen' ] }
                    z.send( successMessage( { 'nodes' : nodeList } ) )
                elif 'flush' == action:
                    if not self._isPrivileged( data ):
                        z.send( errorMessage( 'unprivileged' ) )
                    else:
                        resp = successMessage()
                        actors = self.actorInfo.items()
                        for uid, actor in actors:
                            self._removeUidFromDirectory( uid )

                        results = parallelExec( lambda x: x[ 1 ][ 'instance' ][ 'socket' ].request( { 'req' : 'kill_actor', 'uid' : x[ 0 ] }, timeout = 30 ), 
                                                actors )

                        if all( isMessageSuccess( x ) for x in results ):
                            self._log( "all actors stopped" )
                        else:
                            resp = errorMessage( 'error stopping actor' )

                        for uid, actor in actors:
                            self._removeInstanceIfIsolated( actor[ 'instance' ] )

                        z.send( resp )

                        if isMessageSuccess( resp ):
                            self.isActorChanged.set()
                elif 'get_dir_sync' == action:
                    with self.dirLock.reader():
                        #z.send( successMessage( { 'directory' : { k : dict( v ) for k, v in self.directory.iteritems() }, 'tombstones' : self.tombstones, 'reverse' : self.reverseDir } ), isSkipSanitization = True )
                        z.send( successMessage( { 'directory' : self.nonOptDir, 'tombstones' : self.tombstones, 'reverse' : self.reverseDir } ), isSkipSanitization = True )
                elif 'push_dir_sync' == action:
                    if 'directory' in data and 'tombstones' in data and 'reverse' in data:
                        z.send( successMessage() )
                        for uid, ts in data[ 'tombstones' ].iteritems():
                            self._addTombstone( uid, ts )
                        self._updateDirectoryWith( self.directory, self.nonOptDir, data[ 'directory' ], data[ 'reverse' ] )
                    else:
                        z.send( errorMessage( 'missing information to update directory' ) )
                elif 'get_full_mtd' == action:
                    z.send( successMessage( { 'mtd' : self.actorInfo } ) )
                elif 'get_load_info' == action:
                    info = {}
                    for instance in self.processes:
                        tmp = instance[ 'socket' ].request( { 'req' : 'get_load_info' }, timeout = 5 )
                        if isMessageSuccess( tmp ):
                            info.update( tmp[ 'data' ] )
                    z.send( successMessage( { 'load' : info } ) )
                elif 'associate' == action:
                    if not self._isPrivileged( data ):
                        z.send( errorMessage( 'unprivileged' ) )
                    else:
                        uid = data[ 'uid' ]
                        category = data[ 'category' ]
                        try:
                            info = self.actorInfo[ uid ]
                            with self.dirLock.writer():
                                self.directory.setdefault( info[ 'realm' ],
                                                           PrefixDict() ).setdefault( category,
                                                                                      {} )[ uid ] = 'tcp://%s:%d' % ( self.ifaceIp4,
                                                                                                                      info[ 'port' ] )
                                self.nonOptDir.setdefault( info[ 'realm' ],
                                                           {} ).setdefault( category,
                                                                            {} )[ uid ] = 'tcp://%s:%d' % ( self.ifaceIp4,
                                                                                                            info[ 'port' ] )
                        except:
                            z.send( errorMessage( 'error associating, actor hosted here?' ) )
                        else:
                            self.isActorChanged.set()
                            z.send( successMessage() )
                elif 'disassociate' == action:
                    if not self._isPrivileged( data ):
                        z.send( errorMessage( 'unprivileged' ) )
                    else:
                        uid = data[ 'uid' ]
                        category = data[ 'category' ]
                        try:
                            info = self.actorInfo[ uid ]
                            with self.dirLock.writer():
                                self.directory[ info[ 'realm' ] ][ category ].pop( uid )
                                self.nonOptDir[ info[ 'realm' ] ][ category ].pop( uid )
                                if 0 == len( self.directory[ info[ 'realm' ] ][ category ] ):
                                    del( self.directory[ info[ 'realm' ] ][ category ] )
                                    del( self.nonOptDir[ info[ 'realm' ] ][ category ] )
                        except:
                            z.send( errorMessage( 'error disassociating, actor exists in category?' ) )
                        else:
                            self.isActorChanged.set()
                            z.send( successMessage() )
                else:
                    z.send( errorMessage( 'unknown request', data = { 'req' : action } ) )

                #self._log( "Action %s done after %s seconds." % ( action, time.time() - start ) )
            else:
                z.send( errorMessage( 'invalid request' ) )
                self._logCritical( "Received completely invalid request" )
Example 4
    def __init__( self, configFile, logging_level, logging_dest, iface = None ):
        
        # Setting the signal handler to trigger the stop event
        global timeToStopEvent
        gevent.signal( signal.SIGQUIT, _stop )
        gevent.signal( signal.SIGINT, _stop )
        gevent.signal( signal.SIGTERM, _stop )

        self._logger = None
        self._log_level = logging_level
        self._log_dest = logging_dest
        self._initLogging( logging_level, logging_dest )
        
        self.stopEvent = timeToStopEvent
        self.py_beach_dir = None
        self.configFilePath = os.path.abspath( configFile )
        self.configFile = None
        self.directory = {}
        self.isInitialSyncDone = False
        # This is an unoptimized copy of self.directory that we maintain because converting
        # the optimized version to a straight dict is very expensive.
        self.nonOptDir = {}
        self.reverseDir = {}
        self.tombstones = {}
        self.actorInfo = {}
        self.ports_available = Set()
        self.nProcesses = 0
        self.processes = []
        self.seedNodes = []
        self.directoryPort = None
        self.opsPort = 0
        self.opsSocket = None
        self.port_range = ( 0, 0 )
        self.interface = None
        self.ifaceIp4 = None
        self.nodes = {}
        self.peer_keepalive_seconds = 0
        self.instance_keepalive_seconds = 0
        self.tombstone_culling_seconds = 0
        self.isActorChanged = gevent.event.Event()
        self.isTombstoneChanged = gevent.event.Event()
        self.dirLock = RWLock()
        self.lastHostInfo = None
        self.lastHostInfoCheck = 0

        # Cleanup potentially old sockets
        os.system( 'rm -f /tmp/py_beach*' )

        # Load default configs
        with open( self.configFilePath, 'r' ) as f:
            self.configFile = yaml.load( f )

        self.py_beach_dir = os.path.dirname( os.path.abspath( __file__ ) )

        os.chdir( os.path.dirname( os.path.abspath( self.configFilePath ) ) )

        self.private_key = self.configFile.get( 'private_key', None )
        if self.private_key is not None:
            with open( self.private_key, 'r' ) as f:
                key_path = self.private_key
                self.private_key = f.read()
                self._log( "Using shared key: %s" % key_path )

        self.admin_token = self.configFile.get( 'admin_token', None )

        self.nProcesses = self.configFile.get( 'n_processes', 0 )
        if self.nProcesses == 0:
            self.nProcesses = multiprocessing.cpu_count()
        self._log( "Using %d instances per node" % self.nProcesses )

        if iface is not None:
            self.interface = iface
            self.ifaceIp4 = _getIpv4ForIface( self.interface )
            if self.ifaceIp4 is None:
                self._logCritical( "Could not use iface %s (from cli)." % self.interface )
                sys.exit( -1 )
        else:
            self.interface = self.configFile.get( 'interface', None )
            if self.interface is not None:
                self.ifaceIp4 = _getIpv4ForIface( self.interface )
                if self.ifaceIp4 is None:
                    self._logCritical( "Could not use iface %s (from config)." % self.interface )
                    sys.exit( -1 )

        # Building a list of interfaces to auto-detect
        defaultInterfaces = _getPublicInterfaces()
        while self.ifaceIp4 is None and 0 != len( defaultInterfaces ):
            self.interface = defaultInterfaces.pop()
            self.ifaceIp4 = _getIpv4ForIface( self.interface )
            if self.ifaceIp4 is None:
                self._log( "Failed to use interface %s." % self.interface )

        if self.ifaceIp4 is None:
            self._logCritical( "Could not find an interface to use." )
            sys.exit( -1 )

        self.seedNodes = self.configFile.get( 'seed_nodes', [] )

        if 0 == len( self.seedNodes ):
            self.seedNodes.append( self.ifaceIp4 )

        for s in self.seedNodes:
            self._log( "Using seed node: %s" % s )

        self.directoryPort = _ZMREP( self.configFile.get( 'directory_port',
                                                          'ipc:///tmp/py_beach_directory_port' ),
                                    isBind = True,
                                    private_key = self.private_key )
        
        self.opsPort = self.configFile.get( 'ops_port', 4999 )
        self.opsSocket = _ZMREP( 'tcp://%s:%d' % ( self.ifaceIp4, self.opsPort ),
                                 isBind = True,
                                 private_key = self.private_key )
        self._log( "Listening for ops on %s:%d" % ( self.ifaceIp4, self.opsPort ) )
        
        self.port_range = ( self.configFile.get( 'port_range_start', 5000 ), self.configFile.get( 'port_range_end', 6000 ) )
        self.ports_available.update( xrange( self.port_range[ 0 ], self.port_range[ 1 ] + 1 ) )
        
        self.peer_keepalive_seconds = self.configFile.get( 'peer_keepalive_seconds', 60 )
        self.instance_keepalive_seconds = self.configFile.get( 'instance_keepalive_seconds', 600 )
        self.directory_sync_seconds = self.configFile.get( 'directory_sync_seconds', 600 )
        self.tombstone_culling_seconds = self.configFile.get( 'tombstone_culling_seconds', 3600 )
        
        self.instance_strategy = self.configFile.get( 'instance_strategy', 'random' )

        self.highMemWatermark = self.configFile.get( 'high_mem_watermark', 80 )
        
        # Bootstrap the seeds
        for s in self.seedNodes:
            self._connectToNode( s )
        
        # Start services
        self._log( "Starting services" )
        gevent.spawn_later( random.randint( 0, 3 ), self._svc_directory_requests )
        gevent.spawn_later( random.randint( 0, 3 ), self._svc_instance_keepalive )
        gevent.spawn_later( random.randint( 0, 3 ), self._svc_host_keepalive )
        gevent.spawn_later( random.randint( 0, 3 ), self._svc_directory_sync )
        gevent.spawn_later( random.randint( 0, 3 ), self._svc_cullTombstones )
        gevent.spawn_later( random.randint( 0, 3 ), self._svc_applyTombstones )
        gevent.spawn_later( random.randint( 0, 3 ), self._svc_cleanupCats )
        gevent.spawn_later( random.randint( 0, 60 * 5 ), self._svc_instance_draining )
        for _ in range( 20 ):
            gevent.spawn( self._svc_receiveOpsTasks )
        gevent.spawn( self._svc_pushDirChanges )
        
        # Start the instances
        for n in range( self.nProcesses ):
            self._startInstance( isIsolated = False )
        
        # Wait to be signaled to exit
        self._log( "Up and running" )
        timeToStopEvent.wait()
        
        # Any teardown required
        parallelExec( self._teardownInstance, self.processes[:] )
        
        self._log( "Exiting." )
Example 5
    def __init__(self, configFile, instanceId, logging_level, logging_dest,
                 interface):

        # Setting the signal handler to trigger the stop event which
        # is interpreted by each actor implementation
        global timeToStopEvent
        gevent.signal(signal.SIGQUIT, _stopAllActors)
        gevent.signal(signal.SIGINT, _stopAllActors)
        gevent.signal(signal.SIGTERM, _stopAllActors)

        self.instanceId = instanceId

        self._log_level = logging_level
        self._log_dest = logging_dest
        self._initLogging(logging_level, logging_dest)

        self.log("Initializing")

        self.stopEvent = timeToStopEvent
        self.isOpen = True

        self.actors = {}

        self.py_beach_dir = None

        self.configFilePath = configFile
        self.configFile = None

        self.interface = interface
        self.ifaceIp4 = _getIpv4ForIface(self.interface)

        with open(self.configFilePath, 'r') as f:
            self.configFile = yaml.load(f)

        self.py_beach_dir = os.path.dirname(os.path.abspath(__file__))

        os.chdir(os.path.dirname(os.path.abspath(self.configFilePath)))

        self.private_key = self.configFile.get('private_key', None)
        if self.private_key is not None:
            with open(self.private_key, 'r') as f:
                key_path = self.private_key
                self.private_key = f.read()
                self.log("Using shared key: %s" % key_path)

        self.codeDirectory = self.configFile.get('code_directory', './')
        if '://' not in self.codeDirectory:
            self.codeDirectory = os.path.abspath(self.codeDirectory)

        Actor._code_directory_root = self.codeDirectory

        self.opsSocket = _ZMREP('ipc:///tmp/py_beach_instance_%s' % instanceId,
                                isBind=True)
        #self.log( "Listening for ops on %s" % ( 'ipc:///tmp/py_beach_instance_%s' % instanceId, ) )

        self.hostOpsPort = self.configFile.get('ops_port', 4999)
        self.hostOpsSocket = _ZMREP('tcp://%s:%d' %
                                    (self.ifaceIp4, self.hostOpsPort),
                                    isBind=False,
                                    private_key=self.private_key)

        ActorHandle._setHostDirInfo(
            self.configFile.get('directory_port',
                                'ipc:///tmp/py_beach_directory_port'),
            self.private_key)

        ActorHandleGroup._setHostDirInfo(
            'tcp://%s:%d' % (self.ifaceIp4, self.hostOpsPort),
            self.private_key)

        for _ in range(20):
            gevent.spawn(self.svc_receiveTasks)
        gevent.spawn(self.svc_monitorActors)
        gevent.spawn(self.svc_reportUsage)

        #self.log( "Now open to actors" )

        timeToStopEvent.wait()

        self.log("Exiting, stopping all actors.")

        parallelExec(lambda x: x.stop(), self.actors.values())

        gevent.joinall(self.actors.values())
        self.hostOpsSocket.close()
        self.opsSocket.close()
        self.log("All Actors exited, exiting.")