Example #1
0
 def __init__(self, remaproot):
     """Set up the daemon's initial state and start broker discovery.

     remaproot is stored unchanged on the instance.
     """
     # Plain bookkeeping state.
     self.remaproot = remaproot
     self.cores = {}
     self.tot_m_rcv = 0
     self.coresChanged = False

     # Broker-related state: no address and no sockets until one is reported.
     self.broker_address = "unknown"
     self.brokerChanged = False
     self.bsub = self.bpub = None

     # Collaborators, created in the same relative order as before.
     self.hw = NodeHardware()
     self.nodeid = remap_utils.node_id()

     # The resolver is handed self.cb_broker_changed as its callback and is
     # started right away; presumably it reports broker addresses found for
     # the "_remap._tcp" service -- TODO confirm against BonjourResolver.
     resolver = BonjourResolver("_remap._tcp", self.cb_broker_changed)
     self.bonjour = resolver
     resolver.start()
Example #2
0
 def __init__(self, remaproot):
     """Initialise per-node state, then kick off broker discovery.

     remaproot is kept on the instance as-is.
     """
     # Broker connection state; both sockets stay None until a broker
     # address has been reported.
     self.bsub = self.bpub = None
     self.broker_address = "unknown"
     self.brokerChanged = False

     # Core registry and counters.
     self.remaproot = remaproot
     self.cores = {}
     self.coresChanged = False
     self.tot_m_rcv = 0

     # Helper objects (same relative creation order as before).
     self.hw = NodeHardware()
     self.nodeid = remap_utils.node_id()

     # Started last so every attribute exists before the resolver, which
     # receives self.cb_broker_changed as its callback, begins running.
     self.bonjour = BonjourResolver("_remap._tcp", self.cb_broker_changed)
     self.bonjour.start()
Example #3
0
class NodeDaemon(object):
    """Per-node daemon relaying messages between local cores and a broker.

    Local cores are reached through two nanomsg IPC sockets (``lsub`` and
    ``lpub``, created by ``setup_bus``). The broker is reached through two
    TCP sockets (``bsub`` and ``bpub``, created by ``setup_broker``) once an
    address has been delivered to ``cb_broker_changed``.
    """

    def __init__(self, remaproot):
        """Initialise daemon state and start broker discovery.

        remaproot is stored and later passed to ``hw.start_job``.
        """
        self.remaproot = remaproot
        # coreid -> dict with keys coreid, ts_last_seen, progress, pid,
        # priority (filled in by process_hello).
        self.cores = {}
        self.broker_address = "unknown"
        self.brokerChanged = False
        self.bsub = None
        self.bpub = None
        # Prefixes requested by cores through "_sub" (one entry per request).
        # Kept so they can be applied once a broker socket exists and
        # replayed whenever setup_broker creates a new one.
        self.core_subs = []
        self.tot_m_rcv = 0
        self.coresChanged = False
        self.hw = NodeHardware()
        self.nodeid = remap_utils.node_id()
        # Started last so that all state exists before the resolver can
        # invoke the callback.
        self.bonjour = BonjourResolver("_remap._tcp", self.cb_broker_changed)
        self.bonjour.start()

    # Create a bi-directional communication channel, where the node daemon
    # 'shouts' in the room even to contact a single core, but the core only
    # sends written messages back to the shouter with the megaphone.
    # (embarrassing protocol).
    def setup_bus(self):
        """Create and bind the local IPC sockets used to talk to cores."""
        self.lsub = nn.Socket(nn.SUB)
        self.lsub.bind("ipc:///tmp/node_pub.ipc")
        # Empty prefix: receive everything published on the local bus.
        self.lsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, "")
        self.lpub = nn.Socket(nn.PUB)
        self.lpub.bind("ipc:///tmp/node_sub.ipc")

    def apply_timeouts(self):
        """Set receive timeouts according to whether a broker socket exists.

        Without a broker socket the local bus receive gets a timeout of 100.
        With one, the broker receive gets 100 and the local bus receive
        gets 0. Requires ``setup_bus`` to have been called (uses ``lsub``).
        """
        if self.bsub is None:
            self.lsub.set_int_option(nn.SOL_SOCKET, nn.RCVTIMEO, 100)
        else:
            self.bsub.set_int_option(nn.SOL_SOCKET, nn.RCVTIMEO, 100)
            self.lsub.set_int_option(nn.SOL_SOCKET, nn.RCVTIMEO, 0)

    def cb_broker_changed(self, broker_address):
        """Record a newly reported broker address and flag the change."""
        logger.info("Received new broker address: %s", broker_address)
        self.broker_address = broker_address
        self.brokerChanged = True

    def setup_broker(self):
        """(Re)create the broker sockets for the current broker address.

        Clears ``brokerChanged``, closes any sockets to the previous broker,
        and returns early (leaving ``bsub`` and ``bpub`` as None) while the
        address is still "unknown".
        """
        self.brokerChanged = False
        # Close both sockets of the previous connection; the publisher used
        # to be overwritten below without being closed.
        for sock in (self.bsub, self.bpub):
            if sock is not None:
                sock.close()
        self.bsub = None
        self.bpub = None

        self.apply_timeouts()

        if self.broker_address == "unknown":
            logger.error("Deferring broker setup as address is still unknown.")
            return

        self.bsub = nn.Socket(nn.SUB)
        self.bsub.connect("tcp://%s:8687" % (self.broker_address,))
        for topic in ("global", "local", "notlocal", self.nodeid):
            self.bsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, topic)
        # Replay subscriptions requested by cores so they survive a broker
        # change and cover requests made before any broker was known.
        for prefix in self.core_subs:
            self.bsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, prefix)
        self.apply_timeouts()

        self.bpub = nn.Socket(nn.PUB)
        self.bpub.connect("tcp://%s:8686" % (self.broker_address,))

        logger.info("Broker setup complete")

    def process_bus_messages(self):
        """Receive and handle one message from the local bus.

        Returns True when a message was received (even if it was ignored),
        False when a nanomsg API error occurred, e.g. while receiving.
        """
        try:
            msg = self.lsub.recv()
            msgprefix, data = remap_utils.unpack_msg(msg)

            if not msgprefix:
                return True

            _recipientid, msgtype, senderid = remap_utils.split_prefix(msgprefix)

            # startswith() rather than msgtype[0]: an empty message type
            # must not raise IndexError.
            if msgtype.startswith("_"):
                # node message
                self.process_core_message(msgtype, senderid, data)
            elif msgtype == "corestatus":
                if senderid in self.cores:
                    coredata = self.cores[senderid]
                    coredata["ts_last_seen"] = time.time()
                    coredata["progress"] = data["progress"]
                    logger.info("Core %s progressed %d",
                                senderid, coredata["progress"])
                    self.forward_to_broker(msg)
            elif msgtype == "complete":
                if senderid in self.cores:
                    logger.info("Core %s completed the job", senderid)
                    self.forward_to_broker(msg)
                    del self.cores[senderid]
                    self.coresChanged = True
            else:
                # forward to broker instead
                self.forward_to_broker(msg)
            return True
        except nn.NanoMsgAPIError:
            return False

    def process_core_message(self, msgtype, senderid, data):
        """Dispatch a node-directed message (type starting with '_').

        Unrecognised message types are ignored.
        """
        if msgtype == "_hello":
            self.process_hello(data)
        elif msgtype == "_todo":
            self.process_todo(senderid, data)
        elif msgtype == "_status":
            self.process_status(senderid, data)
        elif msgtype == "_sub":
            self._core_subscribe(data["prefix"])
        elif msgtype == "_unsub":
            self._core_unsubscribe(data["prefix"])

    def _core_subscribe(self, prefix):
        """Remember a core-requested prefix and subscribe if connected."""
        self.core_subs.append(prefix)
        # Without a broker socket the request is only recorded;
        # setup_broker applies it later.
        if self.bsub is not None:
            self.bsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, prefix)

    def _core_unsubscribe(self, prefix):
        """Forget one request for prefix and unsubscribe if connected."""
        if prefix in self.core_subs:
            self.core_subs.remove(prefix)
        if self.bsub is not None:
            self.bsub.set_string_option(nn.SUB, nn.SUB_UNSUBSCRIBE, prefix)

    def forward_to_broker(self, msg):
        """Best-effort publish of msg to the broker; no-op when unconnected."""
        if self.bpub is None:
            return
        try:
            self.bpub.send(msg)
        except nn.NanoMsgAPIError as e:
            # Still best-effort, but no longer silent.
            logger.warning("Could not forward message to broker: %s", e)

    # This processes a message where a core is announcing itself and wants to
    # get a core id to start existing on the network
    def process_hello(self, data):
        """Register the announcing core and reply with its core id."""
        msgid = remap_utils.safe_get(data, "msgid")
        pid = remap_utils.safe_get(data, "pid")
        priority = remap_utils.safe_get(data, "priority")
        coreid = remap_utils.core_id(self.nodeid, pid)
        self.cores[coreid] = {
            "coreid": coreid,
            "ts_last_seen": time.time(),
            "progress": -1,
            "pid": pid,
            "priority": priority,
        }
        msg = remap_utils.pack_msg(
            "%s._hey.%s" % (coreid, self.nodeid),
            {"msgid": msgid, "coreid": coreid})
        logger.info("A core registered %s", coreid)
        self.lpub.send(msg)

    def process_todo(self, senderid, data):
        """Hand the next work item, if any, to a registered core."""
        # Ignore unregistered senders, as the corestatus/complete handlers
        # do; previously this raised KeyError out of the message loop.
        if senderid not in self.cores:
            return
        work = self.hw.grab_work_item()
        if work is not None:
            msg = remap_utils.pack_msg(
                "%s._work.%s" % (senderid, self.nodeid), work)
            logger.info("A core was given some work to do: %s", senderid)
            self.lpub.send(msg)

    def process_status(self, senderid, data):
        """Refresh the last-seen timestamp of a registered core."""
        coredata = self.cores.get(senderid)
        # Unregistered senders are ignored rather than raising KeyError.
        if coredata is not None:
            coredata["ts_last_seen"] = time.time()

    def process_broker_messages(self):
        """Receive and handle one message from the broker.

        Sets up the broker connection first when none exists and a new
        address was reported. Returns True when a message was handled,
        False when there is no connection, nothing usable was received, or
        a nanomsg API error occurred.
        """
        if self.bsub is None:
            # No broker is known yet.
            if not self.brokerChanged:
                return False
            logger.info("The broker configuration changed.")
            self.setup_broker()
            if self.bsub is None:
                logger.info("Failed broker setup.")
                return False

        try:
            # Grab next msg from broker if any
            msg = self.bsub.recv()
            self.tot_m_rcv += 1
            if not msg:
                return False

            msgprefix, data = remap_utils.unpack_msg(msg)
            recipientid, msgtype, senderid = remap_utils.split_prefix(msgprefix)
            if msgtype == "showhands":
                self.handle_showhands(recipientid, senderid, data)
            elif msgtype == "jobstart":
                self.handle_jobstart(recipientid, senderid, data)
            else:
                # Forward to all cores for their processing.
                self.lpub.send(msg)
            return True
        except nn.NanoMsgAPIError:
            return False

    def purge_inactive_cores(self, new_ts):
        """Drop cores whose last status is older than MAX_STATUS_DELAY.

        new_ts is the current timestamp, compared with each core's
        ``ts_last_seen``. Cores past THR_STATUS_DELAY are only logged.
        """
        kill_list = []
        for key, coredata in self.cores.items():
            age = new_ts - coredata["ts_last_seen"]
            if age > remap_constants.THR_STATUS_DELAY:
                logger.info("Core %s missed a status report.", key)
            if age > remap_constants.MAX_STATUS_DELAY:
                logger.info("Core %s is considered dead.", key)
                kill_list.append(key)
                # Add code here to kill core just in case.

        # Deleted after the loop so the dict is not mutated while iterated.
        # NOTE(review): unlike the "complete" handler this does not set
        # coresChanged -- confirm whether that is intended.
        for key in kill_list:
            del self.cores[key]

    def maybe_send_status(self):
        """Re-announce capacity if the set of cores changed since last call."""
        if self.coresChanged:
            self.handle_showhands("tracker", "unknown", {"priority": 0})
            self.coresChanged = False

    # Request re-registration of existing core processes currently on the bus
    # allows failover restart of this node daemon.
    def req_registration(self):
        """Publish a "_plzreg" request on the local bus."""
        msg = remap_utils.pack_msg("node._plzreg.%s" % (self.nodeid,), {})
        self.lpub.send(msg)

    # Some app initiator requests processing capacity
    def handle_showhands(self, recipientid, senderid, data):
        """Send a "raisehand" to the broker if any capacity is available."""
        avail, interruptable = self.hw.available_cpus(
            remap_utils.safe_get(data, "priority"), self.cores)
        if avail > 0 or interruptable > 0:
            logger.info("Volunteering with %d cores, %d interruptable",
                        avail, interruptable)
            msg = remap_utils.pack_msg(
                "tracker.raisehand.%s" % (self.nodeid,),
                {"free": avail, "interruptable": interruptable})
            self.forward_to_broker(msg)

    # Some app initiator wants this node to start work
    def handle_jobstart(self, recipientid, senderid, data):
        """Start the requested job, or send "rejectjob" if capacity is short.

        ``coresChanged`` is set in either case.
        """
        avail, interruptable = self.hw.available_cpus(
            remap_utils.safe_get(data, "priority"), self.cores)
        numcores = len(remap_utils.safe_get(data, "cores"))
        committable = avail + interruptable
        if committable >= numcores:
            logger.info("Starting job with %d cores", numcores)
            if not self.hw.start_job(self.remaproot, senderid, numcores, data):
                logger.error("Error starting job")
        else:
            # Something changed in the meantime. Reject
            logger.info(
                "Initiator requested %d cores, %d can be committed. Rejecting",
                numcores, committable)
            msg = remap_utils.pack_msg(
                "%s.rejectjob.%s" % (senderid, self.nodeid), {})
            self.forward_to_broker(msg)

        self.coresChanged = True
Example #4
0
class NodeDaemon(object):
    """Per-node daemon relaying messages between local cores and a broker.

    Local cores are reached through two nanomsg IPC sockets (``lsub`` and
    ``lpub``, created by ``setup_bus``). The broker is reached through two
    TCP sockets (``bsub`` and ``bpub``, created by ``setup_broker``) once an
    address has been delivered to ``cb_broker_changed``.
    """

    def __init__(self, remaproot):
        """Initialise daemon state and start broker discovery.

        remaproot is stored and later passed to ``hw.start_job``.
        """
        self.remaproot = remaproot
        # coreid -> dict with keys coreid, ts_last_seen, progress, pid,
        # priority (filled in by process_hello).
        self.cores = {}
        self.broker_address = "unknown"
        self.brokerChanged = False
        self.bsub = None
        self.bpub = None
        # Prefixes requested by cores through "_sub" (one entry per request).
        # Kept so they can be applied once a broker socket exists and
        # replayed whenever setup_broker creates a new one.
        self.core_subs = []
        self.tot_m_rcv = 0
        self.coresChanged = False
        self.hw = NodeHardware()
        self.nodeid = remap_utils.node_id()
        # Started last so that all state exists before the resolver can
        # invoke the callback.
        self.bonjour = BonjourResolver("_remap._tcp", self.cb_broker_changed)
        self.bonjour.start()

    # Create a bi-directional communication channel, where the node daemon
    # 'shouts' in the room even to contact a single core, but the core only
    # sends written messages back to the shouter with the megaphone.
    # (embarrassing protocol).
    def setup_bus(self):
        """Create and bind the local IPC sockets used to talk to cores."""
        self.lsub = nn.Socket(nn.SUB)
        self.lsub.bind("ipc:///tmp/node_pub.ipc")
        # Empty prefix: receive everything published on the local bus.
        self.lsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, "")
        self.lpub = nn.Socket(nn.PUB)
        self.lpub.bind("ipc:///tmp/node_sub.ipc")

    def apply_timeouts(self):
        """Set receive timeouts according to whether a broker socket exists.

        Without a broker socket the local bus receive gets a timeout of 100.
        With one, the broker receive gets 100 and the local bus receive
        gets 0. Requires ``setup_bus`` to have been called (uses ``lsub``).
        """
        if self.bsub is None:
            self.lsub.set_int_option(nn.SOL_SOCKET, nn.RCVTIMEO, 100)
        else:
            self.bsub.set_int_option(nn.SOL_SOCKET, nn.RCVTIMEO, 100)
            self.lsub.set_int_option(nn.SOL_SOCKET, nn.RCVTIMEO, 0)

    def cb_broker_changed(self, broker_address):
        """Record a newly reported broker address and flag the change."""
        logger.info("Received new broker address: %s", broker_address)
        self.broker_address = broker_address
        self.brokerChanged = True

    def setup_broker(self):
        """(Re)create the broker sockets for the current broker address.

        Clears ``brokerChanged``, closes any sockets to the previous broker,
        and returns early (leaving ``bsub`` and ``bpub`` as None) while the
        address is still "unknown".
        """
        self.brokerChanged = False
        # Close both sockets of the previous connection; the publisher used
        # to be overwritten below without being closed.
        for sock in (self.bsub, self.bpub):
            if sock is not None:
                sock.close()
        self.bsub = None
        self.bpub = None

        self.apply_timeouts()

        if self.broker_address == "unknown":
            logger.error("Deferring broker setup as address is still unknown.")
            return

        self.bsub = nn.Socket(nn.SUB)
        self.bsub.connect("tcp://%s:8687" % (self.broker_address,))
        for topic in ("global", "local", "notlocal", self.nodeid):
            self.bsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, topic)
        # Replay subscriptions requested by cores so they survive a broker
        # change and cover requests made before any broker was known.
        for prefix in self.core_subs:
            self.bsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, prefix)
        self.apply_timeouts()

        self.bpub = nn.Socket(nn.PUB)
        self.bpub.connect("tcp://%s:8686" % (self.broker_address,))

        logger.info("Broker setup complete")

    def process_bus_messages(self):
        """Receive and handle one message from the local bus.

        Returns True when a message was received (even if it was ignored),
        False when a nanomsg API error occurred, e.g. while receiving.
        """
        try:
            msg = self.lsub.recv()
            msgprefix, data = remap_utils.unpack_msg(msg)

            if not msgprefix:
                return True

            _recipientid, msgtype, senderid = remap_utils.split_prefix(msgprefix)

            # startswith() rather than msgtype[0]: an empty message type
            # must not raise IndexError.
            if msgtype.startswith("_"):
                # node message
                self.process_core_message(msgtype, senderid, data)
            elif msgtype == "corestatus":
                if senderid in self.cores:
                    coredata = self.cores[senderid]
                    coredata["ts_last_seen"] = time.time()
                    coredata["progress"] = data["progress"]
                    logger.info("Core %s progressed %d",
                                senderid, coredata["progress"])
                    self.forward_to_broker(msg)
            elif msgtype == "complete":
                if senderid in self.cores:
                    logger.info("Core %s completed the job", senderid)
                    self.forward_to_broker(msg)
                    del self.cores[senderid]
                    self.coresChanged = True
            else:
                # forward to broker instead
                self.forward_to_broker(msg)
            return True
        except nn.NanoMsgAPIError:
            return False

    def process_core_message(self, msgtype, senderid, data):
        """Dispatch a node-directed message (type starting with '_').

        Unrecognised message types are ignored.
        """
        if msgtype == "_hello":
            self.process_hello(data)
        elif msgtype == "_todo":
            self.process_todo(senderid, data)
        elif msgtype == "_status":
            self.process_status(senderid, data)
        elif msgtype == "_sub":
            self._core_subscribe(data["prefix"])
        elif msgtype == "_unsub":
            self._core_unsubscribe(data["prefix"])

    def _core_subscribe(self, prefix):
        """Remember a core-requested prefix and subscribe if connected."""
        self.core_subs.append(prefix)
        # Without a broker socket the request is only recorded;
        # setup_broker applies it later.
        if self.bsub is not None:
            self.bsub.set_string_option(nn.SUB, nn.SUB_SUBSCRIBE, prefix)

    def _core_unsubscribe(self, prefix):
        """Forget one request for prefix and unsubscribe if connected."""
        if prefix in self.core_subs:
            self.core_subs.remove(prefix)
        if self.bsub is not None:
            self.bsub.set_string_option(nn.SUB, nn.SUB_UNSUBSCRIBE, prefix)

    def forward_to_broker(self, msg):
        """Best-effort publish of msg to the broker; no-op when unconnected."""
        if self.bpub is None:
            return
        try:
            self.bpub.send(msg)
        except nn.NanoMsgAPIError as e:
            # Still best-effort, but no longer silent.
            logger.warning("Could not forward message to broker: %s", e)

    # This processes a message where a core is announcing itself and wants to
    # get a core id to start existing on the network
    def process_hello(self, data):
        """Register the announcing core and reply with its core id."""
        msgid = remap_utils.safe_get(data, "msgid")
        pid = remap_utils.safe_get(data, "pid")
        priority = remap_utils.safe_get(data, "priority")
        coreid = remap_utils.core_id(self.nodeid, pid)
        self.cores[coreid] = {
            "coreid": coreid,
            "ts_last_seen": time.time(),
            "progress": -1,
            "pid": pid,
            "priority": priority,
        }
        msg = remap_utils.pack_msg(
            "%s._hey.%s" % (coreid, self.nodeid),
            {"msgid": msgid, "coreid": coreid})
        logger.info("A core registered %s", coreid)
        self.lpub.send(msg)

    def process_todo(self, senderid, data):
        """Hand the next work item, if any, to a registered core."""
        # Ignore unregistered senders, as the corestatus/complete handlers
        # do; previously this raised KeyError out of the message loop.
        if senderid not in self.cores:
            return
        work = self.hw.grab_work_item()
        if work is not None:
            msg = remap_utils.pack_msg(
                "%s._work.%s" % (senderid, self.nodeid), work)
            logger.info("A core was given some work to do: %s", senderid)
            self.lpub.send(msg)

    def process_status(self, senderid, data):
        """Refresh the last-seen timestamp of a registered core."""
        coredata = self.cores.get(senderid)
        # Unregistered senders are ignored rather than raising KeyError.
        if coredata is not None:
            coredata["ts_last_seen"] = time.time()

    def process_broker_messages(self):
        """Receive and handle one message from the broker.

        Sets up the broker connection first when none exists and a new
        address was reported. Returns True when a message was handled,
        False when there is no connection, nothing usable was received, or
        a nanomsg API error occurred.
        """
        if self.bsub is None:
            # No broker is known yet.
            if not self.brokerChanged:
                return False
            logger.info("The broker configuration changed.")
            self.setup_broker()
            if self.bsub is None:
                logger.info("Failed broker setup.")
                return False

        try:
            # Grab next msg from broker if any
            msg = self.bsub.recv()
            self.tot_m_rcv += 1
            if not msg:
                return False

            msgprefix, data = remap_utils.unpack_msg(msg)
            recipientid, msgtype, senderid = remap_utils.split_prefix(msgprefix)
            if msgtype == "showhands":
                self.handle_showhands(recipientid, senderid, data)
            elif msgtype == "jobstart":
                self.handle_jobstart(recipientid, senderid, data)
            else:
                # Forward to all cores for their processing.
                self.lpub.send(msg)
            return True
        except nn.NanoMsgAPIError:
            return False

    def purge_inactive_cores(self, new_ts):
        """Drop cores whose last status is older than MAX_STATUS_DELAY.

        new_ts is the current timestamp, compared with each core's
        ``ts_last_seen``. Cores past THR_STATUS_DELAY are only logged.
        """
        kill_list = []
        for key, coredata in self.cores.items():
            age = new_ts - coredata["ts_last_seen"]
            if age > remap_constants.THR_STATUS_DELAY:
                logger.info("Core %s missed a status report.", key)
            if age > remap_constants.MAX_STATUS_DELAY:
                logger.info("Core %s is considered dead.", key)
                kill_list.append(key)
                # Add code here to kill core just in case.

        # Deleted after the loop so the dict is not mutated while iterated.
        # NOTE(review): unlike the "complete" handler this does not set
        # coresChanged -- confirm whether that is intended.
        for key in kill_list:
            del self.cores[key]

    def maybe_send_status(self):
        """Re-announce capacity if the set of cores changed since last call."""
        if self.coresChanged:
            self.handle_showhands("tracker", "unknown", {"priority": 0})
            self.coresChanged = False

    # Request re-registration of existing core processes currently on the bus
    # allows failover restart of this node daemon.
    def req_registration(self):
        """Publish a "_plzreg" request on the local bus."""
        msg = remap_utils.pack_msg("node._plzreg.%s" % (self.nodeid,), {})
        self.lpub.send(msg)

    # Some app initiator requests processing capacity
    def handle_showhands(self, recipientid, senderid, data):
        """Send a "raisehand" to the broker if any capacity is available."""
        avail, interruptable = self.hw.available_cpus(
            remap_utils.safe_get(data, "priority"), self.cores)
        if avail > 0 or interruptable > 0:
            logger.info("Volunteering with %d cores, %d interruptable",
                        avail, interruptable)
            msg = remap_utils.pack_msg(
                "tracker.raisehand.%s" % (self.nodeid,),
                {"free": avail, "interruptable": interruptable})
            self.forward_to_broker(msg)

    # Some app initiator wants this node to start work
    def handle_jobstart(self, recipientid, senderid, data):
        """Start the requested job, or send "rejectjob" if capacity is short.

        ``coresChanged`` is set in either case.
        """
        avail, interruptable = self.hw.available_cpus(
            remap_utils.safe_get(data, "priority"), self.cores)
        numcores = len(remap_utils.safe_get(data, "cores"))
        committable = avail + interruptable
        if committable >= numcores:
            logger.info("Starting job with %d cores", numcores)
            if not self.hw.start_job(self.remaproot, senderid, numcores, data):
                logger.error("Error starting job")
        else:
            # Something changed in the meantime. Reject
            logger.info(
                "Initiator requested %d cores, %d can be committed. Rejecting",
                numcores, committable)
            msg = remap_utils.pack_msg(
                "%s.rejectjob.%s" % (senderid, self.nodeid), {})
            self.forward_to_broker(msg)

        self.coresChanged = True