def recordInCouch(self, jobs, newstate, oldstate, updatesummary=False):
    """
    _recordInCouch_

    Record relevant job information in couch. If the job does not yet
    exist in couch it will be saved as a separate document. If the job
    has a FWJR attached that will be saved as a separate document.
    """
    if not self._connectDatabases():
        logging.error('Databases not connected properly')
        return

    timestamp = int(time.time())
    couchRecordsToUpdate = []

    for job in jobs:
        couchDocID = job.get("couch_record", None)

        if newstate == "new":
            oldstate = "none"

        if job.get("site_cms_name", None):
            if newstate == "executing":
                jobLocation = job["site_cms_name"]
            else:
                jobLocation = "Agent"
        else:
            jobLocation = "Agent"

        if couchDocID is None:
            jobDocument = {}
            jobDocument["_id"] = str(job["id"])
            job["couch_record"] = jobDocument["_id"]
            jobDocument["jobid"] = job["id"]
            jobDocument["workflow"] = job["workflow"]
            jobDocument["task"] = job["task"]
            jobDocument["owner"] = job["owner"]

            jobDocument["inputfiles"] = []
            for inputFile in job["input_files"]:
                docInputFile = inputFile.json()

                docInputFile["parents"] = []
                for parent in inputFile["parents"]:
                    docInputFile["parents"].append({"lfn": parent["lfn"]})

                jobDocument["inputfiles"].append(docInputFile)

            jobDocument["states"] = {"0": {"oldstate": oldstate,
                                           "newstate": newstate,
                                           "location": jobLocation,
                                           "timestamp": timestamp}}

            jobDocument["jobgroup"] = job["jobgroup"]
            jobDocument["mask"] = {"FirstEvent": job["mask"]["FirstEvent"],
                                   "LastEvent": job["mask"]["LastEvent"],
                                   "FirstLumi": job["mask"]["FirstLumi"],
                                   "LastLumi": job["mask"]["LastLumi"],
                                   "FirstRun": job["mask"]["FirstRun"],
                                   "LastRun": job["mask"]["LastRun"]}

            if job['mask']['runAndLumis'] != {}:
                # Then we have to save the mask runAndLumis
                jobDocument['mask']['runAndLumis'] = {}
                for key in job['mask']['runAndLumis'].keys():
                    jobDocument['mask']['runAndLumis'][str(key)] = job['mask']['runAndLumis'][key]

            jobDocument["name"] = job["name"]
            jobDocument["type"] = "job"
            jobDocument["user"] = job.get("user", None)
            jobDocument["group"] = job.get("group", None)
            jobDocument["taskType"] = job.get("taskType", "Unknown")
            jobDocument["jobType"] = job.get("jobType", "Unknown")

            couchRecordsToUpdate.append({"jobid": job["id"],
                                         "couchid": jobDocument["_id"]})
            self.jobsdatabase.queue(jobDocument, callback=discardConflictingDocument)
        else:
            # We send a PUT request to the stateTransition update handler.
            # Couch expects the parameters to be passed as arguments in
            # the URI while the Requests class will only encode arguments
            # this way for GET requests. Changing the Requests class to
            # encode PUT arguments as couch expects broke a bunch of code,
            # so we just do our own encoding here.
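            # The resulting URI has the form (illustrative placeholders,
            # not literal values):
            #   /<dbname>/_design/JobDump/_update/stateTransition/<docid>?oldstate=...&newstate=...&location=...&timestamp=...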
updateUri = "/" + self.jobsdatabase.name + "/_design/JobDump/_update/stateTransition/" + couchDocID updateUri += "?oldstate=%s&newstate=%s&location=%s×tamp=%s" % (oldstate, newstate, jobLocation, timestamp) self.jobsdatabase.makeRequest(uri=updateUri, type="PUT", decode=False) # updating the status of the summary doc only when it is explicitely requested # doc is already in couch if updatesummary: jobSummaryId = job["name"] updateUri = "/" + self.jsumdatabase.name + "/_design/WMStatsAgent/_update/jobSummaryState/" + jobSummaryId # map retrydone state to jobfailed state for monitoring if newstate == "retrydone": monitorState = "jobfailed" else: monitorState = newstate updateUri += "?newstate=%s×tamp=%s" % (monitorState, timestamp) self.jsumdatabase.makeRequest(uri=updateUri, type="PUT", decode=False) logging.debug("Updated job summary status for job %s", jobSummaryId) updateUri = "/" + self.jsumdatabase.name + "/_design/WMStatsAgent/_update/jobStateTransition/" + jobSummaryId updateUri += "?oldstate=%s&newstate=%s&location=%s×tamp=%s" % (oldstate, monitorState, job["location"], timestamp) self.jsumdatabase.makeRequest(uri=updateUri, type="PUT", decode=False) logging.debug("Updated job summary state history for job %s", jobSummaryId) if job.get("fwjr", None): cachedByWorkflow = self.workloadCache.setdefault(job['workflow'], getDataFromSpecFile( self.getWorkflowSpecDAO.execute(job['task'])[ job['task']]['spec'])) job['fwjr'].setCampaign(cachedByWorkflow.get('Campaign', '')) job['fwjr'].setPrepID(cachedByWorkflow.get(job['task'], '')) # If there are too many input files, strip them out # of the FWJR, as they should already # be in the database # This is not critical try: if len(job['fwjr'].getAllInputFiles()) > self.maxUploadedInputFiles: job['fwjr'].stripInputFiles() except Exception as ex: logging.error("Error while trying to strip input files from FWJR. Ignoring. : %s", str(ex)) if newstate == "retrydone": jobState = "jobfailed" else: jobState = newstate # there is race condition updating couch record location and job is completed. 
            # There is a race condition between updating the couch record
            # location and the job completing: a fast-failing job could miss
            # the location update, so take it from the FWJR when available.
            job["location"] = job["fwjr"].getSiteName() or job.get("location", "Unknown")

            # complete fwjr document
            job["fwjr"].setTaskName(job["task"])
            jsonFWJR = job["fwjr"].__to_json__(None)

            # Don't archive cleanup job reports
            if job["jobType"] == "Cleanup":
                archStatus = "skip"
            else:
                archStatus = "ready"

            fwjrDocument = {"_id": "%s-%s" % (job["id"], job["retry_count"]),
                            "jobid": job["id"],
                            "jobtype": job["jobType"],
                            "jobstate": jobState,
                            "retrycount": job["retry_count"],
                            "archivestatus": archStatus,
                            "fwjr": jsonFWJR,
                            "type": "fwjr"}
            self.fwjrdatabase.queue(fwjrDocument, timestamp=True, callback=discardConflictingDocument)

            updateSummaryDB(self.statsumdatabase, job)

            # TODO: could add a config switch to turn this on and off
            # if self.config.JobStateMachine.propagateSuccessJobs or (job["retry_count"] > 0) or (newstate != 'success'):
            if (job["retry_count"] > 0) or (newstate != 'success'):
                jobSummaryId = job["name"]
                # build a summary of the fwjr
                logging.debug("Pushing job summary for job %s", jobSummaryId)
                errmsgs = {}
                inputs = []
                if "steps" in fwjrDocument["fwjr"]:
                    for step in fwjrDocument["fwjr"]["steps"]:
                        if "errors" in fwjrDocument["fwjr"]["steps"][step]:
                            errmsgs[step] = [error for error in fwjrDocument["fwjr"]["steps"][step]["errors"]]
                        if "input" in fwjrDocument["fwjr"]["steps"][step] and \
                                "source" in fwjrDocument["fwjr"]["steps"][step]["input"]:
                            inputs.extend(
                                [source["runs"] for source in fwjrDocument["fwjr"]["steps"][step]["input"]["source"]
                                 if "runs" in source])

                outputs = []
                outputDataset = None
                for singlestep in job["fwjr"].listSteps():
                    for singlefile in job["fwjr"].getAllFilesFromStep(step=singlestep):
                        if singlefile:
                            if len(singlefile.get('locations', set())) > 1:
                                locations = list(singlefile.get('locations'))
                            elif singlefile.get('locations'):
                                locations = singlefile['locations'].pop()
                            else:
                                locations = set()
                            if CMSSTEP.match(singlestep):
                                outType = 'output'
                            else:
                                outType = singlefile.get('module_label', None)
                            outputs.append({'type': outType,
                                            'lfn': singlefile.get('lfn', None),
                                            'location': locations,
                                            'checksums': singlefile.get('checksums', {}),
                                            'size': singlefile.get('size', None)})
                            # all the files should share a single output dataset
                            outputDataset = singlefile.get('dataset', None) if not outputDataset else outputDataset

                inputFiles = []
                for inputFileStruct in job["fwjr"].getAllInputFiles():
                    # check if inputFileSummary needs to be extended
                    inputFileSummary = {}
                    inputFileSummary["lfn"] = inputFileStruct["lfn"]
                    inputFileSummary["input_type"] = inputFileStruct["input_type"]
                    inputFiles.append(inputFileSummary)

                # Don't record an intermediate jobfailed status in the jobsummary;
                # change it to jobcooloff, which the error handler will overwrite anyway
                if (job["retry_count"] > 0) and (newstate == 'jobfailed'):
                    summarystate = 'jobcooloff'
                else:
                    summarystate = newstate

                jobSummary = {"_id": jobSummaryId,
                              "wmbsid": job["id"],
                              "type": "jobsummary",
                              "retrycount": job["retry_count"],
                              "workflow": job["workflow"],
                              "task": job["task"],
                              "jobtype": job["jobType"],
                              "state": summarystate,
                              "site": job.get("location", None),
                              "cms_location": job["fwjr"].getSiteName(),
                              "exitcode": job["fwjr"].getExitCode(),
                              "eos_log_url": job["fwjr"].getLogURL(),
                              "worker_node_info": job["fwjr"].getWorkerNodeInfo(),
                              "errors": errmsgs,
                              "lumis": inputs,
                              "outputdataset": outputDataset,
                              "inputfiles": inputFiles,
                              "acdc_url": "%s/%s" % (sanitizeURL(self.config.ACDC.couchurl)['url'],
                                                     self.config.ACDC.database),
                              "agent_name": self.config.Agent.hostName,
                              "output": outputs}

                if couchDocID is not None:
                    try:
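                        # Reuse the existing summary document's revision so this
                        # write updates it in place instead of raising a conflict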
                        currentJobDoc = self.jsumdatabase.document(id=jobSummaryId)
                        jobSummary['_rev'] = currentJobDoc['_rev']
                        jobSummary['state_history'] = currentJobDoc.get('state_history', [])
                        # record the final status transition
                        if newstate == 'success':
                            finalStateDict = {'oldstate': oldstate,
                                              'newstate': newstate,
                                              'location': job["location"],
                                              'timestamp': timestamp}
                            jobSummary['state_history'].append(finalStateDict)

                        noEmptyList = ["inputfiles", "lumis"]
                        for prop in noEmptyList:
                            jobSummary[prop] = jobSummary[prop] if jobSummary[prop] else currentJobDoc.get(prop, [])
                    except CouchNotFoundError:
                        pass

                self.jsumdatabase.queue(jobSummary, timestamp=True)

    if len(couchRecordsToUpdate) > 0:
        self.setCouchDAO.execute(bulkList=couchRecordsToUpdate,
                                 conn=self.getDBConn(),
                                 transaction=self.existingTransaction())

    self.jobsdatabase.commit(callback=discardConflictingDocument)
    self.fwjrdatabase.commit(callback=discardConflictingDocument)
    self.jsumdatabase.commit()
    return
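
# ---------------------------------------------------------------------------
# Illustrative sketch only, not part of the method above. The stateTransition
# and jobSummaryState update handlers are driven by hand-built query strings;
# this shows how that construction could be centralized with proper URL
# escaping. The helper name (buildUpdateUri) is hypothetical; the method above
# deliberately keeps raw string interpolation for compatibility (see the
# comment in the couchDocID branch).
from urllib.parse import quote, urlencode


def buildUpdateUri(dbName, designDoc, handler, docId, **params):
    """Build a CouchDB _update handler URI with URL-encoded parameters."""
    uri = "/%s/_design/%s/_update/%s/%s" % (dbName, designDoc, handler,
                                            quote(docId, safe=""))
    if params:
        # urlencode escapes values such as site names or timestamps safely
        uri += "?" + urlencode(params)
    return uri


# Example usage (hypothetical values):
#   buildUpdateUri("jobs", "JobDump", "stateTransition", "42",
#                  oldstate="created", newstate="executing",
#                  location="Agent", timestamp=1600000000)
# returns:
#   '/jobs/_design/JobDump/_update/stateTransition/42?oldstate=created&newstate=executing&location=Agent&timestamp=1600000000'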