コード例 #1
0
ファイル: heartbeat.py プロジェクト: abhirathb/copernicus
 def _sendPing(self, first, last):
     """Send one heartbeat request and process the server's reply.

     While holding self.runCondVar, the heartbeat items of every running
     workload (and of each workload joined to it) are written into a
     <heartbeat> XML document. The document is sent with
     WorkerMessage.workerHeartbeatRequest. If the reply status is not
     "OK" and the reply data lists faulty workloads, each of them is
     passed to self.worker.killWorkload.

     first -- forwarded to the request; adds " first" to the debug log.
     last  -- forwarded to the request; adds " last" to the debug log.

     Returns the number of seconds to wait before the next ping (int),
     taken from the reply data.
     """
     # NOTE(review): the changed-flag is read and cleared before
     # runCondVar is acquired -- confirm writers cannot race with this.
     changed = self.cmdsChanged
     self.cmdsChanged = False
     with self.runCondVar:
         # first write the items to xml
         cmds = self.worker._getWorkloads()
         co = StringIO()
         co.write('<heartbeat worker_id="%s">' % self.workerID)
         for item in cmds:
             if item.running:
                 item.hbi.writeXML(co)
                 for subwl in item.joinedTo:
                     subwl.hbi.writeXML(co)
         co.write("</heartbeat>")
     clnt = WorkerMessage()
     resp = clnt.workerHeartbeatRequest(self.workerID, self.workerDir,
                                        first, last, changed,
                                        co.getvalue())
     presp = ProcessedResponse(resp)
     status = presp.getStatus()
     if last:
         timestr = " last"
     else:
         timestr = ""
     if first:
         timestr += " first"
     if changed:
         timestr += " update"
     log.debug("Sent%s heartbeat signal. Result was %s" %
               (timestr, status))
     respData = presp.getData()
     if status != "OK":
         # A non-OK reply may name individual faulty workloads; only those
         # are killed. The worker itself keeps running (the former
         # sys.exit(1) for the remaining case was disabled).
         log.info("Error from heartbeat request. Stopping %s" % str(respData))
         if isinstance(respData, dict) and 'faulty' in respData:
             for faultyItem in respData['faulty']:
                 self.worker.killWorkload(faultyItem)
     if isinstance(respData, dict):
         # Dict reply: carries the wait time and the random file name.
         rettime = int(respData['heartbeat-time'])
         self.randomFile = respData['random-file']
         self._createRandomFile()
     else:
         # Plain reply: the data is the wait time itself.
         rettime = int(respData)
     log.debug("Waiting %s seconds for next ping" % (rettime))
     return rettime
コード例 #2
0
 def _sendPing(self, first, last):
     """Send a single heartbeat to the server and act on the response.

     The heartbeat payload is a <heartbeat> XML document containing the
     heartbeat item of each running workload and of every workload joined
     to it; it is assembled while self.runCondVar is held. When the
     response status is not "OK" and its data is a dict with a 'faulty'
     entry, every listed workload is handed to self.worker.killWorkload.

     first -- passed on to the request; logged as " first" when true.
     last  -- passed on to the request; logged as " last" when true.

     Returns the wait time in seconds until the next ping (int), read
     from the response data.
     """
     # NOTE(review): cmdsChanged is sampled and reset outside the
     # runCondVar lock -- verify that this cannot lose an update.
     changed = self.cmdsChanged
     self.cmdsChanged = False
     with self.runCondVar:
         # first write the items to xml
         cmds = self.worker._getWorkloads()
         co = StringIO()
         co.write('<heartbeat worker_id="%s">' % self.workerID)
         for item in cmds:
             if item.running:
                 item.hbi.writeXML(co)
                 for subwl in item.joinedTo:
                     subwl.hbi.writeXML(co)
         co.write("</heartbeat>")
     clnt = WorkerMessage()
     resp = clnt.workerHeartbeatRequest(self.workerID, self.workerDir,
                                        first, last, changed, co.getvalue())
     presp = ProcessedResponse(resp)
     status = presp.getStatus()
     # Same label order as the log message has always used.
     labels = ((last, " last"), (first, " first"), (changed, " update"))
     timestr = "".join(label for flag, label in labels if flag)
     log.debug("Sent%s heartbeat signal. Result was %s" %
               (timestr, status))
     respData = presp.getData()
     isDictReply = isinstance(respData, dict)
     if status != "OK":
         # Only the workloads the server reports as faulty are stopped;
         # the worker process itself is not terminated (the sys.exit(1)
         # that used to cover the other case was disabled).
         log.info("Error from heartbeat request. Stopping %s" % str(respData))
         if isDictReply and 'faulty' in respData:
             for faultyItem in respData['faulty']:
                 self.worker.killWorkload(faultyItem)
     if isDictReply:
         # Dict response: wait time plus the name of the random file.
         rettime = int(respData['heartbeat-time'])
         self.randomFile = respData['random-file']
         self._createRandomFile()
     else:
         # Otherwise the response data is the wait time itself.
         rettime = int(respData)
     log.debug("Waiting %s seconds for next ping" % (rettime))
     return rettime
コード例 #3
0
ファイル: worker.py プロジェクト: abhirathb/copernicus
    def run(self, serverState, request, response):
        """Handle a heartbeat request sent by a worker.

        Reads 'worker_id', 'worker_dir', 'iteration', 'heartbeat_items'
        and the optional 'version' parameter from the request, parses the
        heartbeat items and groups them by destination server. Items
        destined for this server are pinged on the local running-command
        list; the others are forwarded to their server in a
        heartbeatForwardedRequest.

        serverState -- provides the worker data list, the running command
                       list, the configuration and setWorkerState().
        request     -- the incoming request (parameters and headers).
        response    -- receives the reply: the heartbeat time (or, for
                       version > 1, a dict with 'heartbeat-time' and
                       'random-file'); status "ERROR" when any item was
                       reported faulty.
        """
        workerID = request.getParam('worker_id')
        workerDir = request.getParam('worker_dir')
        iteration = request.getParam('iteration')
        itemsXML = request.getParam('heartbeat_items')
        # Requests without a 'version' parameter are treated as version 0.
        version = 0
        if request.hasParam('version'):
            version = int(request.getParam('version'))
        hwr = cpc.command.heartbeat.HeartbeatItemReader()
        hwr.readString(itemsXML, "worker heartbeat items")
        heartbeatItems = hwr.getItems()
        # The worker data list
        workerDataList = serverState.getWorkerDataList()
        haveADir = False
        # Order the heartbeat items by destination server
        destList = {}
        Nhandled = 0
        for item in heartbeatItems:
            dest = item.getServerName()
            item.checkRunDir()
            if item.getHaveRunDir():
                haveADir = True
            destList.setdefault(dest, []).append(item)
            Nhandled += 1
        # The final iteration always unregisters the worker directory;
        # any other iteration registers it once an item has a run dir.
        if iteration == "final":
            workerDataList.remove(workerDir)
        elif haveADir:
            workerDataList.add(workerDir)
        # get my own name to compare
        selfNode = Node.getSelfNode(serverState.conf)
        selfName = selfNode.getId()

        # Update the status at every heartbeat. This is how we know that
        # the worker is still talking to the server.
        serverState.setWorkerState(WorkerStatus.WORKER_STATUS_CONNECTED,
                                   workerID,
                                   request.headers['originating-client'])
        # now iterate over the destinations, and send them their heartbeat
        # items.
        # Once we have many workers, this would be a place to pool heartbeat
        # items and send them as one big request.
        faultyItems = []
        for dest, items in destList.items():
            if dest == selfName:
                # faultyItems is handed to ping(); presumably ping() adds
                # the items it considers faulty -- confirm against ping().
                serverState.getRunningCmdList().ping(workerID, workerDir,
                                                     iteration, items, True,
                                                     faultyItems)
            else:
                msg = ServerMessage(dest)
                co = StringIO()
                co.write('<heartbeat worker_id="%s" worker_server_id="%s">' %
                         (workerID, selfName))
                for item in items:
                    item.writeXML(co)
                co.write('</heartbeat>')
                resp = msg.heartbeatForwardedRequest(workerID, workerDir,
                                                     selfName, iteration,
                                                     co.getvalue())
                presp = ProcessedResponse(resp)
                if presp.getStatus() != "OK":
                    log.info("Heartbeat response from %s not OK" % dest)
                    # The data of a non-OK forwarded response is collected
                    # as the faulty items of that destination.
                    faultyItems.extend(presp.getData())
        heartbeatTime = serverState.conf.getHeartbeatTime()
        if version > 1:
            # NOTE(review): on the "final" iteration workerDir was removed
            # from workerDataList above -- confirm getRnd() copes with that.
            retData = {'heartbeat-time': heartbeatTime,
                       'random-file': workerDataList.getRnd(workerDir)}
        else:
            retData = heartbeatTime
        if not faultyItems:
            response.add('', data=retData)
        else:
            # Only version > 1 replies are dicts that can carry the list.
            if version > 1:
                retData['faulty'] = faultyItems
            # TODO: per-workload error reporting
            # NOTE(review): 'Heatbeat' is misspelled; left unchanged
            # because the message is part of the reply sent to clients.
            response.add('Heatbeat NOT OK', status="ERROR", data=retData)
        log.info("Handled %d heartbeat signal items." % (Nhandled))