def execute(self):
  """Upload and register the production output files at the last workflow step.

  User jobs can have any number of steps and the finalization must run exactly
  once, so earlier steps return immediately.  For every produced file (generator,
  simulation, Marlin REC/DST, LCSIM DST) an LFN is built from the production
  attributes and the file is stored via FailoverTransfer; files that could not
  be stored on the destination SE are retried on the failover SEs.

  :return: S_OK() on success, S_ERROR if an expected file is missing locally or
           the upload (including failover) failed
  """
  #Have to work out if the module is part of the last step i.e.
  #user jobs can have any number of steps and we only want
  #to run the finalization once.
  currentStep = int(self.step_commons['STEP_NUMBER'])
  totalSteps = int(self.workflow_commons['TotalSteps'])
  if currentStep == totalSteps:
    self.lastStep = True
  else:
    self.log.verbose('Current step = %s, total steps of workflow = %s, HandleProdOutputData will enable itself only at the last workflow step.' % (currentStep, totalSteps))

  if not self.lastStep:
    return S_OK()

  self.result = self.resolveInputVariables()
  if not self.result['OK']:
    self.log.error(self.result['Message'])
    return self.result

  ###Instantiate object that will ensure that the files are registered properly
  failoverTransfer = FailoverTransfer(self.request)

  #Map local file name -> {'lfn', 'type', 'workflowSE'} for every file to store.
  datatohandle = {}
  if self.generatorfile:
    if not os.path.exists(self.generatorfile):
      return S_ERROR("File %s does not exist, something went wrong before !" % (self.generatorfile))
    self.attributesdict['DataType'] = 'gen'
    lfnpath = string.join([self.basepath, self.attributesdict['Machine'], self.attributesdict['Energy'],
                           self.attributesdict['DataType'], self.attributesdict['EvtType'],
                           self.attributesdict['ProdID'], self.generatorfile], "/")
    datatohandle[self.generatorfile] = {'lfn': lfnpath, 'type': 'gen', 'workflowSE': self.destination}

  if self.mokkafile or self.slicfile:
    recofile = ''
    #BUGFIX: only consider a simulation file that is actually defined.  The
    #previous if/else structure assigned recofile from the attribute even when
    #it was empty, so an empty slicfile silently wiped out the mokka file name.
    if self.mokkafile:
      if not os.path.exists(self.mokkafile):
        return S_ERROR("File %s does not exist, something went wrong before !" % (self.mokkafile))
      recofile = self.mokkafile
    if self.slicfile:
      if not os.path.exists(self.slicfile):
        return S_ERROR("File %s does not exist, something went wrong before !" % (self.slicfile))
      recofile = self.slicfile
    self.attributesdict['DataType'] = 'SIM'
    lfnpath = string.join([self.basepath, self.attributesdict['Machine'], self.attributesdict['Energy'],
                           self.attributesdict['DetectorModel'], self.attributesdict['DataType'],
                           self.attributesdict['EvtType'], self.attributesdict['ProdID'], recofile], "/")
    #NOTE(review): 'type' stays 'gen' here (and below) as in the original code,
    #even though DataType is SIM/REC/DST — confirm downstream expectations.
    datatohandle[recofile] = {'lfn': lfnpath, 'type': 'gen', 'workflowSE': self.destination}

  ##Below, look in file name if it contain REC or DST, to determine the data type.
  if self.marlinfiles:
    for outputfile in self.marlinfiles:
      if outputfile.find("REC") > -1:
        self.attributesdict['DataType'] = 'REC'
      if outputfile.find("DST") > -1:
        self.attributesdict['DataType'] = 'DST'
      lfnpath = string.join([self.basepath, self.attributesdict['Machine'], self.attributesdict['Energy'],
                             self.attributesdict['DetectorModel'], self.attributesdict['DataType'],
                             self.attributesdict['EvtType'], self.attributesdict['ProdID'], outputfile], "/")
      datatohandle[outputfile] = {'lfn': lfnpath, 'type': 'gen', 'workflowSE': self.destination}

  if self.lcsimfiles:
    for outputfile in self.lcsimfiles:
      if outputfile.find("DST") > -1:
        self.attributesdict['DataType'] = 'DST'
      lfnpath = string.join([self.basepath, self.attributesdict['Machine'], self.attributesdict['Energy'],
                             self.attributesdict['DetectorModel'], self.attributesdict['DataType'],
                             self.attributesdict['EvtType'], self.attributesdict['ProdID'], outputfile], "/")
      datatohandle[outputfile] = {'lfn': lfnpath, 'type': 'gen', 'workflowSE': self.destination}

  result = self.getFileMetadata(datatohandle)
  if not result['OK']:
    self.setApplicationStatus(result['Message'])
    return S_OK()
  fileMetadata = result['Value']

  #Attach the destination SE to each file's metadata.
  final = {}
  for fileName, metadata in fileMetadata.items():
    final[fileName] = metadata
    final[fileName]['resolvedSE'] = self.destination

  #One by one upload the files with failover if necessary
  replication = {}
  failover = {}
  uploaded = []
  if not self.failoverTest:
    for fileName, metadata in final.items():
      self.log.info("Attempting to store file %s to the following SE(s):\n%s" % (fileName,
                    string.join(metadata['resolvedSE'], ', ')))
      result = failoverTransfer.transferAndRegisterFile(fileName, metadata['localpath'], metadata['lfn'],
                                                        metadata['resolvedSE'], fileGUID=metadata['guid'],
                                                        fileCatalog=self.userFileCatalog)
      if not result['OK']:
        self.log.error('Could not transfer and register %s with metadata:\n %s' % (fileName, metadata))
        failover[fileName] = metadata
      else:
        #Only attempt replication after successful upload
        lfn = metadata['lfn']
        uploaded.append(lfn)
        seList = metadata['resolvedSE']
        replicateSE = ''
        if result['Value'].has_key('uploadedSE'):
          uploadedSE = result['Value']['uploadedSE']
          for se in seList:
            if not se == uploadedSE:
              replicateSE = se
              break
        if replicateSE and lfn:
          self.log.info('Will attempt to replicate %s to %s' % (lfn, replicateSE))
          #NOTE(review): replication targets are collected but never acted upon
          #in this method — confirm whether replication was meant to happen here.
          replication[lfn] = replicateSE
  else:
    failover = final

  #Retry every failed upload on the (shuffled) failover SEs.
  cleanUp = False
  for fileName, metadata in failover.items():
    random.shuffle(self.failoverSEs)
    targetSE = metadata['resolvedSE'][0]
    metadata['resolvedSE'] = self.failoverSEs
    result = failoverTransfer.transferAndRegisterFileFailover(fileName, metadata['localpath'], metadata['lfn'],
                                                              targetSE, metadata['resolvedSE'],
                                                              fileGUID=metadata['guid'],
                                                              fileCatalog=self.userFileCatalog)
    if not result['OK']:
      self.log.error('Could not transfer and register %s with metadata:\n %s' % (fileName, metadata))
      cleanUp = True
      continue #for users can continue even if one completely fails
    else:
      lfn = metadata['lfn']
      uploaded.append(lfn)

  #For files correctly uploaded must report LFNs to job parameters
  if uploaded:
    report = string.join(uploaded, ', ')
    self.jobReport.setJobParameter('UploadedOutputData', report)

  #Now after all operations, retrieve potentially modified request object
  result = failoverTransfer.getRequestObject()
  if not result['OK']:
    self.log.error(result)
    return S_ERROR('Could Not Retrieve Modified Request')
  self.request = result['Value']

  #If some or all of the files failed to be saved to failover
  if cleanUp:
    self.workflow_commons['Request'] = self.request
    #Leave any uploaded files just in case it is useful for the user
    #do not try to replicate any files.
    return S_ERROR('Failed To Upload Output Data')

  return S_OK()
def finalize(self):
    """finalize method performs final operations after all the job steps were executed.

    Only production jobs are treated.  Selects the relevant log files, copies
    them to a temporary directory, and uploads that directory to the LogSE.
    If the upload fails, the logs are tarred and shipped to a failover SE and
    a deferred upload request is created.  Log loss is never fatal for the
    job, so most failure paths deliberately return S_OK().
    """
    self.log.verbose("Starting UploadLogFile finalize")
    ##########################################
    # First determine the files which should be saved
    self.log.info("Determining the files to be saved in the logs.")
    res = self.determineRelevantFiles()
    if not res["OK"]:
        self.log.error("Completely failed to select relevant log files.", res["Message"])
        return S_OK()  # because if the logs are lost, it's not the end of the world.
    selectedFiles = res["Value"]
    self.log.info(
        "The following %s files were selected to be saved:\n%s"
        % (len(selectedFiles), string.join(selectedFiles, "\n"))
    )
    #########################################
    # Create a temporary directory containing these files
    self.log.info("Populating a temporary directory for selected files.")
    res = self.populateLogDirectory(selectedFiles)
    if not res["OK"]:
        self.log.error("Completely failed to populate temporary log file directory.", res["Message"])
        self.setApplicationStatus("Failed To Populate Log Dir")
        return S_OK()  # because if the logs are lost, it's not the end of the world.
    self.log.info("%s populated with log files." % self.logdir)

    #########################################
    # Create a tailored index page
    # self.log.info('Creating an index page for the logs')
    # result = self.__createLogIndex(selectedFiles)
    # if not result['OK']:
    #   self.log.error('Failed to create index page for logs', res['Message'])

    if not self.enable:
        self.log.info("Module is disabled by control flag")
        return S_OK("Module is disabled by control flag")

    #########################################
    # Make sure all the files in the log directory have the correct permissions
    result = self.__setLogFilePermissions(self.logdir)
    if not result["OK"]:
        self.log.error("Could not set permissions of log files to 0755 with message:\n%s" % (result["Message"]))

    #########################################
    # Attempt to uplaod logs to the LogSE
    self.log.info("Transferring log files to the %s" % self.logSE)
    res = S_ERROR()
    if not self.failoverTest:
        self.log.info("PutDirectory %s %s %s" % (self.logFilePath, os.path.realpath(self.logdir), self.logSE))
        res = self.rm.putStorageDirectory(
            {self.logFilePath: os.path.realpath(self.logdir)}, self.logSE, singleDirectory=True
        )
        self.log.verbose(res)
        if res["OK"]:
            self.log.info("Successfully upload log directory to %s" % self.logSE)
            # TODO: The logURL should be constructed using the LogSE and StorageElement()
            # storageElement = StorageElement(self.logSE)
            # pfn = storageElement.getPfnForLfn(self.logFilePath)['Value']
            # logURL = getPfnForProtocol(res['Value'],'http')['Value']
            logURL = "%s" % self.logFilePath
            self.setJobParameter("Log LFN", logURL)
            self.log.info("Logs for this job may be retrieved with dirac-ilc-get-prod-log -F %s" % logURL)
            return S_OK()

    #########################################
    # Recover the logs to a failover storage element
    self.log.error(
        "Completely failed to upload log files to %s, will attempt upload to failover SE" % self.logSE,
        res["Message"],
    )

    # Tar the log directory so a single file can be shipped to the failover SE.
    tarFileDir = os.path.dirname(self.logdir)
    self.logLFNPath = "%s.gz" % self.logLFNPath
    tarFileName = os.path.basename(self.logLFNPath)
    start = os.getcwd()
    os.chdir(self.logdir)
    logTarFiles = os.listdir(self.logdir)
    tfile = tarfile.open(tarFileName, "w:gz")
    for item in logTarFiles:
        tfile.add(item)
    tfile.close()
    # BUGFIX: at this point ``res`` still holds the *failed* LogSE upload
    # result (this code is only reached when that upload failed or during a
    # failover test), so the status must be reset from the tar creation
    # outcome — otherwise the ``if not res["OK"]`` check below always fired
    # and the failover upload was never attempted.
    if os.path.exists(tarFileName):
        res = S_OK()
    else:
        res = S_ERROR("File was not created")
    os.chdir(start)
    if not res["OK"]:
        self.log.error("Failed to create tar file from directory", "%s %s" % (self.logdir, res["Message"]))
        self.setApplicationStatus("Failed To Create Log Tar Dir")
        return S_OK()  # because if the logs are lost, it's not the end of the world.

    ############################################################
    # Instantiate the failover transfer client with the global request object
    failoverTransfer = FailoverTransfer(self.request)
    ##determine the experiment
    self.failoverSEs = self.ops.getValue("Production/%s/FailOverSE" % self.experiment, self.failoverSEs)
    random.shuffle(self.failoverSEs)
    self.log.info(
        "Attempting to store file %s to the following SE(s):\n%s"
        % (tarFileName, string.join(self.failoverSEs, ", "))
    )
    result = failoverTransfer.transferAndRegisterFile(
        tarFileName,
        "%s/%s" % (tarFileDir, tarFileName),
        self.logLFNPath,
        self.failoverSEs,
        fileGUID=None,
        fileCatalog=["FileCatalog", "LcgFileCatalog"],
    )
    if not result["OK"]:
        self.log.error("Failed to upload logs to all destinations")
        self.setApplicationStatus("Failed To Upload Logs")
        return S_OK()  # because if the logs are lost, it's not the end of the world.

    # Now after all operations, retrieve potentially modified request object
    result = failoverTransfer.getRequestObject()
    if not result["OK"]:
        self.log.error(result)
        return S_ERROR("Could not retrieve modified request")
    self.request = result["Value"]

    # Create a deferred request to move the logs from the failover SE to the
    # LogSE once it becomes available again.
    res = self.createLogUploadRequest(self.logSE, self.logLFNPath)
    if not res["OK"]:
        self.log.error("Failed to create failover request", res["Message"])
        self.setApplicationStatus("Failed To Upload Logs To Failover")
    else:
        self.log.info("Successfully created failover request")

    # Propagate the (possibly updated) request to the job wrapper.
    self.workflow_commons["Request"] = self.request
    return S_OK()
def execute(self):
  """Main execution function: upload the user's output data at the last step.

  Constructs user LFNs from the job ID, VO and owner, resolves an ordered list
  of candidate SEs (local SEs first, then user-requested, then the shuffled
  defaults), uploads each candidate file with FailoverTransfer, replicates
  successfully uploaded files to a second SE, and finally propagates any
  pending request operations to the job wrapper.

  :return: S_OK on success or when the module is disabled / has nothing to do,
           S_ERROR if the upload (including failover) failed
  """
  #Have to work out if the module is part of the last step i.e.
  #user jobs can have any number of steps and we only want
  #to run the finalization once.
  currentStep = int(self.step_commons['STEP_NUMBER'])
  totalSteps = int(self.workflow_commons['TotalSteps'])
  if currentStep == totalSteps:
    self.lastStep = True
  else:
    self.log.verbose('Current step = %s, total steps of workflow = %s, UserJobFinalization will enable itself only at the last workflow step.' % (currentStep, totalSteps))

  if not self.lastStep:
    return S_OK()

  result = self.resolveInputVariables()
  if not result['OK']:
    self.log.error(result['Message'])
    return result

  self.log.info('Initializing %s' % self.version)
  if not self.workflowStatus['OK'] or not self.stepStatus['OK']:
    self.log.verbose('Workflow status = %s, step status = %s' % (self.workflowStatus['OK'], self.stepStatus['OK']))
    return S_OK('No output data upload attempted')

  if not self.userOutputData:
    self.log.info('No user output data is specified for this job, nothing to do')
    return S_OK('No output data to upload')

  #Determine the final list of possible output files for the
  #workflow and all the parameters needed to upload them.
  outputList = []
  for i in self.userOutputData:
    outputList.append({'outputPath': string.upper(string.split(i, '.')[-1]),
                       'outputDataSE': self.userOutputSE,
                       'outputFile': os.path.basename(i)})

  userOutputLFNs = []
  if self.userOutputData:
    self.log.info('Constructing user output LFN(s) for %s' % (string.join(self.userOutputData, ', ')))
    if not self.jobID:
      #Local test runs have no job ID; use a dummy one so LFNs can be built.
      self.jobID = 12345

    owner = ''
    if self.workflow_commons.has_key('Owner'):
      owner = self.workflow_commons['Owner']
    else:
      res = self.getCurrentOwner()
      if not res['OK']:
        return S_ERROR('Could not obtain owner from proxy')
      owner = res['Value']

    vo = ''
    if self.workflow_commons.has_key('VO'):
      vo = self.workflow_commons['VO']
    else:
      res = self.getCurrentVO()
      if not res['OK']:
        return S_ERROR('Could not obtain VO from proxy')
      vo = res['Value']

    result = constructUserLFNs(int(self.jobID), vo, owner, self.userOutputData, self.userOutputPath)
    if not result['OK']:
      self.log.error('Could not create user LFNs', result['Message'])
      return result
    userOutputLFNs = result['Value']

  self.log.verbose('Calling getCandidateFiles( %s, %s, %s)' % (outputList, userOutputLFNs, self.outputDataFileMask))
  result = self.getCandidateFiles(outputList, userOutputLFNs, self.outputDataFileMask)
  if not result['OK']:
    if not self.ignoreapperrors:
      self.setApplicationStatus(result['Message'])
      return S_OK()

  fileDict = result['Value']
  result = self.getFileMetadata(fileDict)
  if not result['OK']:
    if not self.ignoreapperrors:
      self.setApplicationStatus(result['Message'])
      return S_OK()

  if not result['Value']:
    if not self.ignoreapperrors:
      self.log.info('No output data files were determined to be uploaded for this workflow')
      self.setApplicationStatus('No Output Data Files To Upload')
      return S_OK()
  fileMetadata = result['Value']

  #First get the local (or assigned) SE to try first for upload and others in random fashion
  result = getDestinationSEList('Tier1-USER', DIRAC.siteName(), outputmode='local')
  if not result['OK']:
    self.log.error('Could not resolve output data SE', result['Message'])
    self.setApplicationStatus('Failed To Resolve OutputSE')
    return result
  localSE = result['Value']
  self.log.verbose('Site Local SE for user outputs is: %s' % (localSE))

  #BUGFIX: copy the default SE list.  The original aliased self.defaultOutputSE,
  #so the remove() calls below permanently mutated the instance attribute.
  orderedSEs = list(self.defaultOutputSE)
  for se in localSE:
    if se in orderedSEs:
      orderedSEs.remove(se)
  for se in self.userOutputSE:
    if se in orderedSEs:
      orderedSEs.remove(se)
  #Local SEs first, then the remaining defaults in random order.
  orderedSEs = localSE + List.randomize(orderedSEs)
  if self.userOutputSE:
    #User-requested SEs take absolute priority.
    prependSEs = []
    for userSE in self.userOutputSE:
      if not userSE in orderedSEs:
        prependSEs.append(userSE)
    orderedSEs = prependSEs + orderedSEs
  self.log.info('Ordered list of output SEs is: %s' % (string.join(orderedSEs, ', ')))

  final = {}
  for fileName, metadata in fileMetadata.items():
    final[fileName] = metadata
    final[fileName]['resolvedSE'] = orderedSEs

  #At this point can exit and see exactly what the module will upload
  if not self.enable:
    self.log.info('Module is disabled by control flag, would have attempted to upload the following files %s' % string.join(final.keys(), ', '))
    for fileName, metadata in final.items():
      self.log.info('--------%s--------' % fileName)
      for n, v in metadata.items():
        self.log.info('%s = %s' % (n, v))
    return S_OK('Module is disabled by control flag')

  #Instantiate the failover transfer client with the global request object
  failoverTransfer = FailoverTransfer(self.request)

  #One by one upload the files with failover if necessary
  replication = {}
  failover = {}
  uploaded = []
  if not self.failoverTest:
    for fileName, metadata in final.items():
      self.log.info("Attempting to store file %s to the following SE(s):\n%s" % (fileName,
                    string.join(metadata['resolvedSE'], ', ')))
      result = failoverTransfer.transferAndRegisterFile(fileName, metadata['localpath'], metadata['lfn'],
                                                        metadata['resolvedSE'], fileGUID=metadata['guid'],
                                                        fileCatalog=self.userFileCatalog)
      if not result['OK']:
        self.log.error('Could not transfer and register %s with metadata:\n %s' % (fileName, metadata))
        failover[fileName] = metadata
      else:
        #Only attempt replication after successful upload
        lfn = metadata['lfn']
        uploaded.append(lfn)
        seList = metadata['resolvedSE']
        #Pick the first SE that is not the one the file actually landed on.
        replicateSE = ''
        if result['Value'].has_key('uploadedSE'):
          uploadedSE = result['Value']['uploadedSE']
          for se in seList:
            if not se == uploadedSE:
              replicateSE = se
              break
        if replicateSE and lfn:
          self.log.info('Will attempt to replicate %s to %s' % (lfn, replicateSE))
          replication[lfn] = replicateSE
  else:
    failover = final

  #Retry every failed upload on the (shuffled) failover SEs.
  cleanUp = False
  for fileName, metadata in failover.items():
    random.shuffle(self.failoverSEs)
    targetSE = metadata['resolvedSE'][0]
    metadata['resolvedSE'] = self.failoverSEs
    result = failoverTransfer.transferAndRegisterFileFailover(fileName, metadata['localpath'], metadata['lfn'],
                                                              targetSE, metadata['resolvedSE'],
                                                              fileGUID=metadata['guid'],
                                                              fileCatalog=self.userFileCatalog)
    if not result['OK']:
      self.log.error('Could not transfer and register %s with metadata:\n %s' % (fileName, metadata))
      cleanUp = True
      continue #for users can continue even if one completely fails
    else:
      lfn = metadata['lfn']
      uploaded.append(lfn)

  #For files correctly uploaded must report LFNs to job parameters
  if uploaded:
    report = string.join( uploaded, ', ' )
    self.jobReport.setJobParameter( 'UploadedOutputData', report )

  #Now after all operations, retrieve potentially modified request object
  result = failoverTransfer.getRequestObject()
  if not result['OK']:
    self.log.error(result)
    return S_ERROR('Could Not Retrieve Modified Request')
  self.request = result['Value']

  #If some or all of the files failed to be saved to failover
  if cleanUp:
    self.workflow_commons['Request'] = self.request
    #Leave any uploaded files just in case it is useful for the user
    #do not try to replicate any files.
    return S_ERROR('Failed To Upload Output Data')

  #If there is now at least one replica for uploaded files can trigger replication
  rm = ReplicaManager()
  self.log.info('Sleeping for 10 seconds before attempting replication of recently uploaded files')
  time.sleep(10)
  for lfn, repSE in replication.items():
    result = rm.replicateAndRegister(lfn, repSE, catalog=self.userFileCatalog)
    if not result['OK']:
      self.log.info('Replication failed with below error but file already exists in Grid storage with at least one replica:\n%s' % (result))

  self.workflow_commons['Request'] = self.request

  #Now must ensure if any pending requests are generated that these are propagated to the job wrapper
  reportRequest = None
  if self.jobReport:
    result = self.jobReport.generateRequest()
    if not result['OK']:
      self.log.warn('Could not generate request for job report with result:\n%s' % (result))
    else:
      reportRequest = result['Value']
  if reportRequest:
    self.log.info('Populating request with job report information')
    self.request.update(reportRequest)

  if not self.request.isEmpty()['Value']:
    request_string = self.request.toXML()['Value']
    # Write out the request string
    fname = 'user_job_%s_request.xml' % (self.jobID)
    xmlfile = open(fname, 'w')
    xmlfile.write(request_string)
    xmlfile.close()
    self.log.info('Creating failover request for deferred operations for job %s:' % self.jobID)
    result = self.request.getDigest()
    if result['OK']:
      digest = result['Value']
      self.log.info(digest)

  self.setApplicationStatus('Job Finished Successfully')
  return S_OK('Output data uploaded')
def execute(self):
  """Main execution function: upload the production output data.

  Determines the experiment from the output LFN namespace, resolves the
  destination SEs for every candidate file, disables the watchdog for the
  duration of the transfers, uploads each file with FailoverTransfer (with a
  retry on the failover SEs), and propagates the possibly-modified request
  to the job wrapper.  On an unrecoverable failover failure, already-uploaded
  LFNs are cleaned up.

  :return: S_OK on success or when nothing is to be uploaded,
           S_ERROR if the upload (including failover) failed
  """
  self.log.info('Initializing %s' % self.version)
  result = self.resolveInputVariables()
  if not result['OK']:
    self.log.error(result['Message'])
    return result

  if not self.workflowStatus['OK'] or not self.stepStatus['OK']:
    self.log.verbose('Workflow status = %s, step status = %s' % (self.workflowStatus['OK'], self.stepStatus['OK']))
    return S_OK('No output data upload attempted')

  ##determine the experiment from the namespace of the first output LFN
  if self.prodOutputLFNs:
    example_file = self.prodOutputLFNs[0]
    if "/ilc/prod/clic" in example_file:
      self.experiment = "CLIC"
    elif "/ilc/prod/ilc/sid" in example_file:
      self.experiment = 'ILC_SID'
    elif "/ilc/prod/ilc/mc-dbd" in example_file:
      self.experiment = 'ILC_ILD'
    else:
      self.log.warn("Failed to determine experiment, reverting to default")
  else:
    #Guard: indexing an empty LFN list would raise IndexError; keep the
    #default experiment instead.
    self.log.warn("Failed to determine experiment, reverting to default")

  #Determine the final list of possible output files for the
  #workflow and all the parameters needed to upload them.
  result = self.getCandidateFiles(self.outputList, self.prodOutputLFNs, self.outputDataFileMask)
  if not result['OK']:
    self.setApplicationStatus(result['Message'])
    return result

  fileDict = result['Value']
  result = self.getFileMetadata(fileDict)
  if not result['OK']:
    self.setApplicationStatus(result['Message'])
    return result

  if not result['Value']:
    self.log.info('No output data files were determined to be uploaded for this workflow')
    return S_OK()
  fileMetadata = result['Value']

  #Get final, resolved SE list for files
  final = {}
  for fileName, metadata in fileMetadata.items():
    result = getDestinationSEList(metadata['workflowSE'], DIRAC.siteName(), self.outputMode)
    if not result['OK']:
      self.log.error('Could not resolve output data SE', result['Message'])
      self.setApplicationStatus('Failed To Resolve OutputSE')
      return result
    resolvedSE = result['Value']
    final[fileName] = metadata
    final[fileName]['resolvedSE'] = resolvedSE

  self.log.info('The following files will be uploaded: %s' % (string.join(final.keys(), ', ')))
  for fileName, metadata in final.items():
    self.log.info('--------%s--------' % fileName)
    for n, v in metadata.items():
      self.log.info('%s = %s' % (n, v))

  #At this point can exit and see exactly what the module would have uploaded
  if not self.enable:
    self.log.info('Module is disabled by control flag, would have attempted to upload the following files %s' % string.join(final.keys(), ', '))
    return S_OK('Module is disabled by control flag')

  #Disable the watchdog check in case the file uploading takes a long time
  self.log.info('Creating DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK in order to disable the Watchdog prior to upload')
  fopen = open('DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK', 'w')
  fopen.write('%s' % time.asctime())
  fopen.close()

  #Instantiate the failover transfer client with the global request object
  failoverTransfer = FailoverTransfer(self.request)
  catalogs = ['FileCatalog', 'LcgFileCatalog']

  #One by one upload the files with failover if necessary
  failover = {}
  if not self.failoverTest:
    for fileName, metadata in final.items():
      self.log.info("Attempting to store file %s to the following SE(s):\n%s" % (fileName,
                    string.join(metadata['resolvedSE'], ', ')))
      result = failoverTransfer.transferAndRegisterFile(fileName, metadata['localpath'], metadata['lfn'],
                                                        metadata['resolvedSE'], fileGUID=metadata['guid'],
                                                        fileCatalog=catalogs)
      if not result['OK']:
        self.log.error('Could not transfer and register %s with metadata:\n %s' % (fileName, metadata))
        failover[fileName] = metadata
  else:
    failover = final

  #Failover SE list is experiment dependent.
  self.failoverSEs = self.ops.getValue("Production/%s/FailOverSE" % self.experiment, self.failoverSEs)
  cleanUp = False
  for fileName, metadata in failover.items():
    self.log.info('Setting default catalog for failover transfer to FileCatalog')
    random.shuffle(self.failoverSEs)
    targetSE = metadata['resolvedSE'][0]
    metadata['resolvedSE'] = self.failoverSEs
    result = failoverTransfer.transferAndRegisterFileFailover(fileName, metadata['localpath'], metadata['lfn'],
                                                              targetSE, metadata['resolvedSE'],
                                                              fileGUID=metadata['guid'], fileCatalog=catalogs)
    if not result['OK']:
      self.log.error('Could not transfer and register %s with metadata:\n %s' % (fileName, metadata))
      cleanUp = True
      break #no point continuing if one completely fails

  os.remove("DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK") #cleanup the mess

  #Now after all operations, retrieve potentially modified request object
  result = failoverTransfer.getRequestObject()
  if not result['OK']:
    self.log.error(result)
    return S_ERROR('Could not retrieve modified request')
  self.request = result['Value']

  #If some or all of the files failed to be saved to failover,
  #remove everything that did get uploaded for this production job.
  if cleanUp:
    lfns = []
    for fileName, metadata in final.items():
      lfns.append(metadata['lfn'])
    result = self.__cleanUp(lfns)
    self.workflow_commons['Request'] = self.request
    return S_ERROR('Failed to upload output data')

  self.workflow_commons['Request'] = self.request
  return S_OK('Output data uploaded')