def _sendPing(self, first, last):
    """Send one heartbeat request upstream and return the next ping delay.

    Writes the heartbeat items of the worker's workloads into a
    ``<heartbeat>`` XML document, sends it with
    ``WorkerMessage.workerHeartbeatRequest`` and processes the reply.

    Args:
        first: forwarded to the heartbeat request; when true the debug
            log line is tagged " first".
        last: forwarded to the heartbeat request; when true the debug
            log line is tagged " last".

    Returns:
        int: the number of seconds to wait before the next ping, as
        given by the response data.

    Side effects:
        Resets ``self.cmdsChanged`` to False. When the response data is
        a dict, stores its ``'random-file'`` entry in ``self.randomFile``
        and calls ``self._createRandomFile()``. When the response status
        is not "OK" and the data lists ``'faulty'`` items, each of them
        is passed to ``self.worker.killWorkload``.
    """
    changed = self.cmdsChanged
    self.cmdsChanged = False

    # NOTE(review): the nesting below was reconstructed from a source whose
    # indentation had been lost. It assumes that only the XML generation
    # runs under runCondVar, and that joined workloads are written whether
    # or not their parent is running -- confirm against the original file.
    with self.runCondVar:
        # First write the items to XML.
        cmds = self.worker._getWorkloads()
        co = StringIO()
        co.write('<heartbeat worker_id="%s">' % self.workerID)
        for item in cmds:
            if item.running:
                item.hbi.writeXML(co)
            for subwl in item.joinedTo:
                subwl.hbi.writeXML(co)
        co.write("</heartbeat>")

    clnt = WorkerMessage()
    resp = clnt.workerHeartbeatRequest(self.workerID, self.workerDir,
                                       first, last, changed,
                                       co.getvalue())
    presp = ProcessedResponse(resp)
    status = presp.getStatus()
    respData = presp.getData()

    timestr = " last" if last else ""
    if first:
        timestr += " first"
    if changed:
        timestr += " update"
    log.debug("Sent%s heartbeat signal. Result was %s" % (timestr, status))

    if status != "OK":
        # A non-OK response means the upstream server reported a problem.
        # The worker itself keeps running; only the workloads the server
        # lists as faulty are killed.
        log.info("Error from heartbeat request. Stopping %s" % str(respData))
        if isinstance(respData, dict) and 'faulty' in respData:
            for faultyItem in respData['faulty']:
                self.worker.killWorkload(faultyItem)

    # The data is either a dict carrying the wait time and the random file,
    # or a bare value holding only the wait time.
    if isinstance(respData, dict):
        rettime = int(respData['heartbeat-time'])
        self.randomFile = respData['random-file']
        self._createRandomFile()
    else:
        rettime = int(respData)
    log.debug("Waiting %s seconds for next ping" % (rettime))
    return rettime
def _sendPing(self, first, last):
    """Send one heartbeat request upstream and return the next ping delay.

    Writes the heartbeat items of the worker's workloads into a
    ``<heartbeat>`` XML document, sends it with
    ``WorkerMessage.workerHeartbeatRequest`` and processes the reply.

    Args:
        first: forwarded to the heartbeat request; when true the debug
            log line is tagged " first".
        last: forwarded to the heartbeat request; when true the debug
            log line is tagged " last".

    Returns:
        int: the number of seconds to wait before the next ping, as
        given by the response data.

    Side effects:
        Resets ``self.cmdsChanged`` to False. When the response data is
        a dict, stores its ``'random-file'`` entry in ``self.randomFile``
        and calls ``self._createRandomFile()``. When the response status
        is not "OK" and the data lists ``'faulty'`` items, each of them
        is passed to ``self.worker.killWorkload``.
    """
    changed = self.cmdsChanged
    self.cmdsChanged = False

    # NOTE(review): the nesting below was reconstructed from a source whose
    # indentation had been lost. It assumes that only the XML generation
    # runs under runCondVar, and that joined workloads are written whether
    # or not their parent is running -- confirm against the original file.
    with self.runCondVar:
        # First write the items to XML.
        cmds = self.worker._getWorkloads()
        co = StringIO()
        co.write('<heartbeat worker_id="%s">' % self.workerID)
        for item in cmds:
            if item.running:
                item.hbi.writeXML(co)
            for subwl in item.joinedTo:
                subwl.hbi.writeXML(co)
        co.write("</heartbeat>")

    clnt = WorkerMessage()
    resp = clnt.workerHeartbeatRequest(self.workerID, self.workerDir,
                                       first, last, changed,
                                       co.getvalue())
    presp = ProcessedResponse(resp)
    status = presp.getStatus()
    respData = presp.getData()

    timestr = " last" if last else ""
    if first:
        timestr += " first"
    if changed:
        timestr += " update"
    log.debug("Sent%s heartbeat signal. Result was %s" % (timestr, status))

    if status != "OK":
        # A non-OK response means the upstream server reported a problem.
        # The worker itself keeps running; only the workloads the server
        # lists as faulty are killed.
        log.info("Error from heartbeat request. Stopping %s" % str(respData))
        if isinstance(respData, dict) and 'faulty' in respData:
            for faultyItem in respData['faulty']:
                self.worker.killWorkload(faultyItem)

    # The data is either a dict carrying the wait time and the random file,
    # or a bare value holding only the wait time.
    if isinstance(respData, dict):
        rettime = int(respData['heartbeat-time'])
        self.randomFile = respData['random-file']
        self._createRandomFile()
    else:
        rettime = int(respData)
    log.debug("Waiting %s seconds for next ping" % (rettime))
    return rettime
def run(self, serverState, request, response):
    """Handle a worker heartbeat and relay its items to their servers.

    Parses the heartbeat items sent by a worker, groups them by
    destination server, pings the local running-command list for items
    that belong to this server, and forwards the rest with
    ``ServerMessage.heartbeatForwardedRequest``. Items reported back as
    faulty are collected and returned to the worker.

    Args:
        serverState: provides the worker data list, the configuration,
            the running-command list and ``setWorkerState``.
        request: the incoming request. The parameters ``worker_id``,
            ``worker_dir``, ``iteration`` and ``heartbeat_items`` are
            read, plus the optional ``version`` (defaults to 0), and the
            ``originating-client`` header.
        response: receives the result through ``response.add``.

    Response data:
        For ``version > 1`` a dict with ``'heartbeat-time'`` and
        ``'random-file'`` (plus ``'faulty'`` on error); otherwise just
        the configured heartbeat time.
    """
    workerID = request.getParam('worker_id')
    workerDir = request.getParam('worker_dir')
    iteration = request.getParam('iteration')
    itemsXML = request.getParam('heartbeat_items')
    version = 0
    if request.hasParam('version'):
        version = int(request.getParam('version'))

    hwr = cpc.command.heartbeat.HeartbeatItemReader()
    hwr.readString(itemsXML, "worker heartbeat items")
    heartbeatItems = hwr.getItems()

    # The worker data list.
    workerDataList = serverState.getWorkerDataList()
    haveADir = False
    # Order the heartbeat items by destination server.
    destList = {}
    Nhandled = 0
    for item in heartbeatItems:
        dest = item.getServerName()
        item.checkRunDir()
        if item.getHaveRunDir():
            haveADir = True
        destList.setdefault(dest, []).append(item)
        Nhandled += 1

    # NOTE(review): nesting reconstructed from a source whose indentation
    # had been lost. It assumes the final iteration always removes the
    # worker directory, whether or not any item has a run directory --
    # confirm against the original file.
    if iteration == "final":
        workerDataList.remove(workerDir)
    elif haveADir:
        workerDataList.add(workerDir)

    # Get my own name to compare.
    selfNode = Node.getSelfNode(serverState.conf)
    selfName = selfNode.getId()

    # Update the status at every heartbeat. This is how we know that the
    # worker is still talking to the server.
    serverState.setWorkerState(WorkerStatus.WORKER_STATUS_CONNECTED,
                               workerID,
                               request.headers['originating-client'])

    # Now iterate over the destinations, and send them their heartbeat
    # items.
    # Once we have many workers, this would be a place to pool heartbeat
    # items and send them as one big request.
    faultyItems = []
    # .items() rather than .iteritems(): works on both Python 2 and 3.
    for dest, items in destList.items():
        if dest == selfName:
            # Local items: ping() receives faultyItems as an argument.
            serverState.getRunningCmdList().ping(workerID, workerDir,
                                                 iteration, items, True,
                                                 faultyItems)
        else:
            msg = ServerMessage(dest)
            co = StringIO()
            co.write('<heartbeat worker_id="%s" worker_server_id="%s">' %
                     (workerID, selfName))
            for item in items:
                item.writeXML(co)
            co.write('</heartbeat>')
            resp = msg.heartbeatForwardedRequest(workerID, workerDir,
                                                 selfName, iteration,
                                                 co.getvalue())
            presp = ProcessedResponse(resp)
            if presp.getStatus() != "OK":
                log.info("Heartbeat response from %s not OK" % dest)
                # The data of a non-OK response is the list of faulty items.
                faultyItems.extend(presp.getData())

    if version > 1:
        retData = {
            'heartbeat-time': serverState.conf.getHeartbeatTime(),
            'random-file': workerDataList.getRnd(workerDir),
        }
    else:
        retData = serverState.conf.getHeartbeatTime()

    if not faultyItems:
        response.add('', data=retData)
    else:
        if version > 1:
            retData['faulty'] = faultyItems
        # TODO: per-workload error reporting
        response.add('Heatbeat NOT OK', status="ERROR", data=retData)
    log.info("Handled %d heartbeat signal items." % (Nhandled))