def getCommandOutputData(cmdID, workerServer):
    log.log(cpc.util.log.TRACE,
            "Trying to pull command output from %s"%workerServer)
    s2smsg=ServerMessage(workerServer)
    rundata_response = s2smsg.pullAssetRequest(cmdID, Asset.cmdOutput())
    if rundata_response.getType() != "application/x-tar":
        log.error("Incorrect response type: %s, should be %s"%
                  (rundata_response.getType(), 'application/x-tar'))
        if rundata_response.getType() == "text/json":
            errormsg=rundata_response.message.read(
                                            len(rundata_response.message))
            presp=ProcessedResponse(rundata_response)
            if not presp.isOK():
                log.error('Response from worker server not OK: %s'%errormsg)
    else:
        s2smsg.clearAssetRequest(cmdID)
        log.log(cpc.util.log.TRACE,
                "Successfully pulled command output data from %s."%
                workerServer)
        return rundata_response
        #runfile = rundata_response.getRawData()
        # this doesn't work because the mmap closes as it is returned
    return None
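# Why the function above returns the whole response object instead of
# rundata_response.getRawData(): the raw data is an mmap that is only valid
# while the response owns it, so callers that need the bytes to outlive the
# response must copy them out first. A minimal, stdlib-only sketch of that
# copy step; the function name and arguments are illustrative, not part of
# the cpc API.
import tempfile

def copyMappedData(mapped, length):
    """Copy 'length' bytes from an mmap-like object into a temporary file
       that remains valid after the mapping is closed."""
    tmp = tempfile.TemporaryFile('w+b')
    tmp.write(mapped.read(length))
    tmp.seek(0)
    return tmp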
def _fetchRemoteRunFiles(self, rc):
    """Get the result files from a remote run directory to a local command
       directory. Return True if successful. May throw an exception in case
       of failure."""
    if rc.haveData:
        log.debug("Fetching remote results directory %s to %s" %
                  (rc.runDir, rc.cmd.getDir()))
        # the data is remote: we must fetch it through a
        # server-to-server command.
        msg = ServerMessage(rc.workerServer)
        resp = msg.deadWorkerFetchRequest(rc.workerDir, rc.runDir)
        if resp.getType() == "application/x-tar":
            # untar the returned data and use it.
            runfile = resp.getRawData()
            log.debug("extracting file for %s to dir %s" %
                      (rc.cmd.id, rc.cmd.getDir()))
            cpc.util.file.extractSafely(rc.cmd.getDir(), fileobj=runfile)
            return True
    return False
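# The extraction above delegates to cpc.util.file.extractSafely. As an
# illustration of what "safe" extraction has to guard against (tar members
# whose paths would escape the destination directory), here is a stdlib-only
# sketch; it is an assumption about the intent, not the cpc implementation.
import os
import tarfile

def extractSafelySketch(destdir, fileobj):
    """Extract a tar stream into destdir, rejecting path-escaping members."""
    root = os.path.realpath(destdir)
    tf = tarfile.open(fileobj=fileobj, mode="r")
    try:
        for member in tf.getmembers():
            target = os.path.realpath(os.path.join(destdir, member.name))
            if target != root and not target.startswith(root + os.sep):
                raise ValueError("tar member escapes destination: %s"%
                                 member.name)
        tf.extractall(path=destdir)
    finally:
        tf.close()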
def run(self, serverState, request, response):
    # first read platform capabilities and executables
    rdr=cpc.command.platform_exec_reader.PlatformExecutableReader()
    workerData=request.getParam('worker')
    if request.hasParam('worker-id'):
        workerID=request.getParam('worker-id')
    else:
        workerID='(none)'
    log.debug("Worker platform + executables: %s"%workerData)
    rdr.readString(workerData, "Worker-reported platform + executables")
    # match queued commands to executables.
    cwm=CommandWorkerMatcher(rdr.getPlatforms(),
                             rdr.getExecutableList(),
                             rdr.getWorkerRequirements())
    cmds=cwm.getWork(serverState.getCmdQueue())
    if not cwm.isDepleted():
        # now sleep for 5 seconds to give the dataflow time to react to any
        # new state.
        time.sleep(5)
        cmds.extend(cwm.getWork(serverState.getCmdQueue()))
    # now check the forwarded variables
    conf=serverState.conf
    originatingServer=None
    heartbeatInterval=None
    try:
        # check whether there is an originating server. If not, we're it
        if self.forwarded:
            if 'originating-server-id' in request.headers:
                originatingServer = request.headers['originating-server-id']
            # check the expected heartbeat time.
            log.debug("Forwarded message")
            if request.hasParam('heartbeat-interval'):
                heartbeatInterval = int(request.getParam('heartbeat-interval'))
                log.debug("Forwarded heartbeat interval is %d"%
                          heartbeatInterval)
    except NameError:
        # self.forwarded does not exist. Treat it as if
        # self.forwarded == False
        pass
    if originatingServer is None:
        # If the originating server property has not been set, the
        # request hasn't been forwarded, therefore we are the originating
        # server
        selfNode=Node.getSelfNode(conf)
        originatingServer = selfNode.getId()
        # we only store worker state in the server the worker connects to
        serverState.setWorkerState(WorkerStatus.WORKER_STATUS_CONNECTED,
                                   workerID,
                                   request.headers['originating-client'])
    if heartbeatInterval is None:
        heartbeatInterval = conf.getHeartbeatTime()
    log.debug("worker identified %s"%request.headers['originating-client'])
    if len(cmds) > 0:
        # first add them to the running list so they never get lost
        runningCmdList=serverState.getRunningCmdList()
        runningCmdList.add(cmds, originatingServer, heartbeatInterval)
        # construct the tar file with the workloads.
        tff=tempfile.TemporaryFile()
        tf=tarfile.open(fileobj=tff, mode="w:gz")
        # make the commands ready
        for cmd in cmds:
            log.debug("Adding command id %s to tar file."%cmd.id)
            # write the command description to the command's directory
            task=cmd.getTask()
            #log.debug(cmd)
            project=task.getProject()
            taskDir = "task_%s"%task.getID()
            cmddir=cmd.getDir()
            if not os.path.exists(cmddir):
                log.debug("cmddir %s did not exist. Created directory."%cmd.id)
                os.mkdir(cmddir)
            arcdir="%s"%(cmd.id)
            log.debug("cmddir=%s"%cmddir)
            outf=open(os.path.join(cmddir, "command.xml"), "w")
            cmd.writeWorkerXML(outf)
            outf.close()
            tf.add(cmddir, arcname=arcdir, recursive=True)
            # set the state of the command.
        tf.close()
        del(tf)
        tff.seek(0)
        # now send it back
        response.setFile(tff, 'application/x-tar')
        #project.writeTasks()
        # the file is closed after the response is sent.
log.info("Did direct worker-ready") else: nodes = conf.getNodes().getNodesByPriority() topology = Nodes() if request.hasParam('topology'): topology = json.loads(request.getParam('topology') ,object_hook = json_serializer.fromJson) thisNode = Node.getSelfNode(conf) thisNode.nodes = conf.getNodes() topology.addNode(thisNode) hasJob =False # temporary flag that should be removed for node in nodes: if topology.exists(node.getId()) == False: clnt=ServerMessage(node.getId()) clientResponse=clnt.workerReadyForwardedRequest(workerID, workerData, topology, originatingServer, heartbeatInterval, request.headers['originating-client']) if clientResponse.getType() == 'application/x-tar': log.log(cpc.util.log.TRACE, 'got work from %s'% (clientResponse.headers[ 'originating-server-id'])) hasJob=True # we need to rewrap the message #TODO stupid intermediary step because the mmap form # clientresponse is prematurely closed tmp = tempfile.TemporaryFile('w+b') message = clientResponse.getRawData() tmp.write(message.read(len(message))) tmp.seek(0) #for key in clientResponse.headers: # print "%s:%s"%(key,clientResponse.headers[key]) response.setFile(tmp,'application/x-tar') response.headers['originating-server-id']=\ clientResponse.headers[ 'originating-server-id'] #OPTIMIZE leads to a lot of folding and unfolding of #packages if not hasJob: response.add("No command") log.info("Did delegated worker-ready")
def run(self, serverState, request, response):
    workerID=request.getParam('worker_id')
    workerDir=request.getParam('worker_dir')
    iteration=request.getParam('iteration')
    itemsXML=request.getParam('heartbeat_items')
    version=0
    if request.hasParam('version'):
        version=int(request.getParam('version'))
    hwr=cpc.command.heartbeat.HeartbeatItemReader()
    hwr.readString(itemsXML, "worker heartbeat items")
    heartbeatItems=hwr.getItems()
    # The worker data list
    workerDataList=serverState.getWorkerDataList()
    haveADir=False
    # Order the heartbeat items by destination server
    destList={}
    Nhandled=0
    for item in heartbeatItems:
        dest=item.getServerName()
        item.checkRunDir()
        if item.getHaveRunDir():
            haveADir=True
        if dest in destList:
            destList[dest].append(item)
        else:
            destList[dest]=[item]
        Nhandled+=1
    if haveADir:
        if iteration!="final":
            workerDataList.add(workerDir)
    if iteration=="final":
        workerDataList.remove(workerDir)
    # get my own name to compare
    selfNode=Node.getSelfNode(serverState.conf)
    selfName=selfNode.getId()
    # Update the status at every heartbeat. This is how we know that the
    # worker is still talking to the server.
    serverState.setWorkerState(WorkerStatus.WORKER_STATUS_CONNECTED,
                               workerID,
                               request.headers['originating-client'])
    # now iterate over the destinations, and send them their heartbeat
    # items.
    # Once we have many workers, this would be a place to pool heartbeat
    # items and send them as one big request.
    faultyItems=[]
    for dest, items in destList.iteritems():
        if dest == selfName:
            ret=serverState.getRunningCmdList().ping(workerID, workerDir,
                                                     iteration, items, True,
                                                     faultyItems)
        else:
            msg=ServerMessage(dest)
            co=StringIO()
            co.write('<heartbeat worker_id="%s" worker_server_id="%s">'%
                     (workerID, selfName))
            for item in items:
                item.writeXML(co)
            co.write('</heartbeat>')
            resp = msg.heartbeatForwardedRequest(workerID, workerDir,
                                                 selfName, iteration,
                                                 co.getvalue())
            presp=ProcessedResponse(resp)
            if presp.getStatus() != "OK":
                log.info("Heartbeat response from %s not OK"%dest)
                retitems=presp.getData()
                for item in retitems:
                    faultyItems.append(item)
    if version > 1:
        retData = { 'heartbeat-time': serverState.conf.getHeartbeatTime(),
                    'random-file': workerDataList.getRnd(workerDir) }
    else:
        retData=serverState.conf.getHeartbeatTime()
    if len(faultyItems)==0:
        response.add('', data=retData)
    else:
        if version > 1:
            retData['faulty']=faultyItems
        # TODO: per-workload error reporting
        response.add('Heartbeat NOT OK', status="ERROR", data=retData)
    log.info("Handled %d heartbeat signal items."%(Nhandled))
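# The forwarded branch above wraps heartbeat items in a small XML envelope
# before relaying them to their destination server. A condensed sketch of
# just that envelope; the items' writeXML(fileobj) method mirrors the code
# above, everything else here is an illustrative assumption.
from StringIO import StringIO

def buildHeartbeatXML(workerID, serverID, items):
    co = StringIO()
    co.write('<heartbeat worker_id="%s" worker_server_id="%s">'%
             (workerID, serverID))
    for item in items:
        item.writeXML(co)  # each item serializes its own element
    co.write('</heartbeat>')
    return co.getvalue()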
def runLocal(self, serverState, request, response):
    #self.lock = threading.Lock()
    cmdID=request.getParam('cmd_id')
    selfNode=Node.getSelfNode(serverState.conf)
    selfName = selfNode.getId()
    # get the source server if set. If not set, it means that this server
    # is the worker server.
    if request.hasParam('worker_server'):
        workerServer=request.getParam('worker_server')
    else:
        workerServer=selfName
    # get the destination server if set
    if request.hasParam('project_server'):
        projServer=request.getParam('project_server')
    else:
        # for backward compatibility, we assume that we are the project
        # server if it's forwarded. If not, there's something wrong.
        projServer=selfName
        if not self.forwarded:
            raise CommandFinishError(
                "no project server set in command finished request.")
    returncode=None
    if request.hasParam('return_code'):
        returncode=int(request.getParam('return_code'))
    cputime=0
    if request.hasParam('used_cpu_time'):
        cputime=float(request.getParam('used_cpu_time'))
    runfile=None
    if request.haveFile('run_data'):
        runfile=request.getFile('run_data')
    elif request.haveFile('rundata'):
        # backward compatibility
        runfile=request.getFile('rundata')
    if projServer != selfName:
        # forward the request using remote assets. Note that the workers
        # usually don't take this path anyway and forward directly to the
        # project server. This might change in the future.
        # TODO: some sort of verification to check whether this was in fact
        # the client that we sent the command to
        serverState.getLocalAssets().addCmdOutputAsset(cmdID, projServer,
                                                       runfile)
        # forward the CommandFinished signal to the project server
        msg=ServerMessage(projServer)
        ret = msg.commandFinishedForwardedRequest(cmdID, workerServer,
                                                  projServer, returncode,
                                                  cputime,
                                                  runfile is not None)
    else:
        # handle the input locally.
        # get the remote asset if it exists
        if (workerServer is not None and runfile is None and
            (request.hasParam('run_data') and
             int(request.getParam('run_data'))!=0)):
            # remote asset tracking
            log.info("Pulling asset from %s"%workerServer)
            serverState.getRemoteAssets().addAsset(cmdID, workerServer)
            # for now, get the command data output immediately
            rundata = Tracker.getCommandOutputData(cmdID, workerServer)
            if rundata is not None:
                runfile = rundata.getRawData()
        # now handle the finished command.
        runningCmdList=serverState.getRunningCmdList()
        runningCmdList.handleFinished(cmdID, returncode, cputime, runfile)
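# runLocal accepts the command output either attached to the request or as
# a 'run_data' flag meaning the data must be pulled from the worker server.
# A condensed sketch of that decision; attachedFile, flagSet and pullFn are
# hypothetical stand-ins for the request plumbing above, not cpc API.
def resolveRunFile(attachedFile, flagSet, pullFn):
    """Return a file object with the command output, or None."""
    if attachedFile is not None:
        return attachedFile  # the data travelled with the request
    if flagSet:
        # the data exists remotely; pull it now (see getCommandOutputData)
        rundata = pullFn()
        if rundata is not None:
            return rundata.getRawData()
    return None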