def _tryFailoverTransfer(self, tarFileName, tarFileDir):
    """Try to upload the log tarball to one of the failover SEs.

    Returns S_OK() even when every destination fails (lost logs are not
    fatal to the job); on success the returned value carries the possibly
    modified request object and the SE that actually accepted the file.
    """
    transferClient = FailoverTransfer(self._getRequestContainer())
    # Experiment-specific failover SEs and catalogs from the Operations config
    self.failoverSEs = self.ops.getValue("Production/%s/FailOverSE" % self.experiment, self.failoverSEs)
    catalogList = self.ops.getValue('Production/%s/Catalogs' % self.experiment,
                                    ['FileCatalog', 'LcgFileCatalog'])
    # Randomise the order so no single failover SE is systematically preferred
    random.shuffle(self.failoverSEs)
    self.log.info("Attempting to store file %s to the following SE(s):\n%s"
                  % (tarFileName, ', '.join(self.failoverSEs)))
    localPath = '%s/%s' % (tarFileDir, tarFileName)
    transferResult = transferClient.transferAndRegisterFile(tarFileName,
                                                            localPath,
                                                            self.logLFNPath,
                                                            self.failoverSEs,
                                                            fileMetaDict={"GUID": None},
                                                            fileCatalog=catalogList)
    if not transferResult['OK']:
        self.log.error('Failed to upload logs to all destinations')
        self.setApplicationStatus('Failed To Upload Logs')
        # If the logs are lost, it's not the end of the world: do not fail the job
        return S_OK()
    # Now after all operations, return potentially modified request object
    return S_OK({'Request': transferClient.request,
                 'uploadedSE': transferResult['Value']['uploadedSE']})
def _tryFailoverTransfer(self, tarFileName, tarFileDir):
    """Upload the log tarball to a failover SE and create the moving request.

    Losing the logs is not considered fatal, so S_OK() comes back even when
    all destinations fail; a successful upload returns the (potentially
    modified) request object together with the SE that took the file.
    """
    failoverTransfer = FailoverTransfer(self._getRequestContainer())
    # Determine the experiment-specific failover SEs and file catalogs
    self.failoverSEs = self.ops.getValue("Production/%s/FailOverSE" % self.experiment,
                                         self.failoverSEs)
    catalogs = self.ops.getValue('Production/%s/Catalogs' % self.experiment,
                                 ['FileCatalog', 'LcgFileCatalog'])
    random.shuffle(self.failoverSEs)
    self.log.info("Attempting to store file %s to the following SE(s):\n%s"
                  % (tarFileName, ', '.join(self.failoverSEs)))
    uploadResult = failoverTransfer.transferAndRegisterFile(tarFileName,
                                                            '%s/%s' % (tarFileDir, tarFileName),
                                                            self.logLFNPath,
                                                            self.failoverSEs,
                                                            fileMetaDict={"GUID": None},
                                                            fileCatalog=catalogs)
    if uploadResult['OK']:
        # Hand back the modified request and the SE that accepted the upload
        return S_OK({'Request': failoverTransfer.request,
                     'uploadedSE': uploadResult['Value']['uploadedSE']})
    self.log.error('Failed to upload logs to all destinations')
    self.setApplicationStatus('Failed To Upload Logs')
    return S_OK()  # because if the logs are lost, it's not the end of the world
def execute(self, production_id=None, prod_job_id=None, wms_job_id=None,
            workflowStatus=None, stepStatus=None,
            wf_commons=None, step_commons=None,
            step_number=None, step_id=None, orderedSEs=None):
    """Main execution function.

    Uploads the user-requested output files to an ordered list of storage
    elements, falling back to the failover SEs (plus a recovery request)
    for any file that could not be uploaded directly.

    :param orderedSEs: optional pre-computed ordered list of destination SEs;
        when None it is derived from the local/user/default SE groups.
    :returns: S_OK on success (or when there is deliberately nothing to do),
        S_ERROR when some file could not even be saved to failover storage.
    """
    try:
        super(UserJobFinalization, self).execute(self.version, production_id, prod_job_id, wms_job_id,
                                                 workflowStatus, stepStatus,
                                                 wf_commons, step_commons, step_number, step_id)
        self._resolveInputVariables()

        # Earlier modules may have populated the report objects
        self.request.RequestName = 'job_%d_request.xml' % self.jobID
        self.request.JobID = self.jobID
        self.request.SourceComponent = "Job_%d" % self.jobID

        if not self._checkWFAndStepStatus():
            return S_OK()
        if not self.userOutputData:
            self.log.info("No user output data is specified for this job, nothing to do")
            return S_OK("No output data to upload")

        self.log.info("User specified output file list is: %s" % (', '.join(self.userOutputData)))

        # Check whether list of userOutputData contains globbable patterns;
        # "'*' in i" is exactly equivalent to the former re.search('\*', i)
        # and avoids the invalid '\*' escape sequence.
        globList = [i for i in self.userOutputData if '*' in i]
        if globList:
            for i in globList:
                self.userOutputData.remove(i)
            globbedOutputList = list(set(getGlobbedFiles(globList)))
            if globbedOutputList:
                self.log.info('Found a pattern in the output data file list, extra files to upload are: %s'
                              % (', '.join(globbedOutputList)))
                self.userOutputData += globbedOutputList
            else:
                self.log.info("No files were found on the local disk for the following patterns: %s"
                              % (', '.join(globList)))

        self.log.info("Final list of files to upload are: %s" % (', '.join(self.userOutputData)))

        # Determine the final list of possible output files for the workflow
        # and all the parameters needed to upload them.
        outputList = []
        for i in self.userOutputData:
            # BUG FIX: was ('.'.split(i)[-1]).upper(), which splits the literal
            # string '.' by the filename and therefore never yields the
            # extension; the intent is the (uppercased) file extension.
            outputList.append({'outputDataType': i.split('.')[-1].upper(),
                               'outputDataName': os.path.basename(i)})

        userOutputLFNs = []
        if self.userOutputData:
            self.log.info("Constructing user output LFN(s) for %s" % (', '.join(self.userOutputData)))
            userOutputLFNs = constructUserLFNs(self.jobID, self._getCurrentOwner(),
                                              self.userOutputData, self.userOutputPath,
                                              self.userPrependString)

        self.log.verbose("Calling getCandidateFiles( %s, %s, %s)"
                         % (outputList, userOutputLFNs, self.outputDataFileMask))
        try:
            fileDict = self.getCandidateFiles(outputList, userOutputLFNs, self.outputDataFileMask)
        except os.error as e:
            self.setApplicationStatus(e)
            return S_OK()
        try:
            fileMetadata = self.getFileMetadata(fileDict)
        except RuntimeError as e:
            self.setApplicationStatus(e)
            return S_OK()

        if not fileMetadata:
            self.log.info("No output data files were determined to be uploaded for this workflow")
            self.setApplicationStatus('No Output Data Files To Upload')
            return S_OK()

        if not orderedSEs:
            orderedSEs = self._getOrderedSEsList()
        self.log.info("Ordered list of output SEs is: %s" % (', '.join(orderedSEs)))

        final = {}
        for fileName, metadata in fileMetadata.items():
            final[fileName] = metadata
            final[fileName]['resolvedSE'] = orderedSEs

        # At this point can exit and see exactly what the module would upload
        if not self._enableModule():
            self.log.info("Module disabled would have attempted to upload the files %s"
                          % ', '.join(final.keys()))
            for fileName, metadata in final.items():
                self.log.info('--------%s--------' % fileName)
                for n, v in metadata.items():
                    self.log.info('%s = %s' % (n, v))
            return S_OK("Module is disabled by control flag")

        # Disable the watchdog check in case the file uploading takes a long time
        self._disableWatchdogCPUCheck()

        # Instantiate the failover transfer client with the global request object
        if not self.failoverTransfer:
            self.failoverTransfer = FailoverTransfer(self.request)

        # One by one upload the files with failover if necessary
        replication = {}
        failover = {}
        uploaded = []
        for fileName, metadata in final.items():
            self.log.info("Attempting to store %s to the following SE(s): %s"
                          % (fileName, ', '.join(metadata['resolvedSE'])))
            fileMetaDict = {'Size': metadata['filedict']['Size'],
                            'LFN': metadata['filedict']['LFN'],
                            'GUID': metadata['filedict']['GUID'],
                            'Checksum': metadata['filedict']['Checksum'],
                            'ChecksumType': metadata['filedict']['ChecksumType']}
            result = self.failoverTransfer.transferAndRegisterFile(fileName=fileName,
                                                                   localPath=metadata['localpath'],
                                                                   lfn=metadata['filedict']['LFN'],
                                                                   destinationSEList=metadata['resolvedSE'],
                                                                   fileMetaDict=fileMetaDict,
                                                                   masterCatalogOnly=True)
            if not result['OK']:
                self.log.error("Could not transfer and register %s with metadata:\n %s"
                               % (fileName, metadata))
                failover[fileName] = metadata
                continue
            # Only attempt replication after successful upload
            lfn = metadata['lfn']
            uploaded.append(lfn)
            replicateSE = ''
            uploadedSE = result['Value'].get('uploadedSE', '')
            if uploadedSE:
                # First SE in the resolved list that is not the one already used
                for se in metadata['resolvedSE']:
                    if se != uploadedSE:
                        replicateSE = se
                        break
            if replicateSE and lfn and self.replicateUserOutputData:
                self.log.info("Will attempt to replicate %s to %s" % (lfn, replicateSE))
                replication[lfn] = (uploadedSE, replicateSE, fileMetaDict)

        # Retry everything that failed above against the failover SEs
        cleanUp = False
        for fileName, metadata in failover.items():
            random.shuffle(self.failoverSEs)
            targetSE = metadata['resolvedSE'][0]
            replicateSE = metadata['resolvedSE'][1] if len(metadata['resolvedSE']) > 1 else ''
            metadata['resolvedSE'] = self.failoverSEs
            fileMetaDict = {'Size': metadata['filedict']['Size'],
                            'LFN': metadata['filedict']['LFN'],
                            'GUID': metadata['filedict']['GUID']}
            result = self.failoverTransfer.transferAndRegisterFileFailover(fileName,
                                                                           metadata['localpath'],
                                                                           metadata['lfn'],
                                                                           targetSE,
                                                                           metadata['resolvedSE'],
                                                                           fileMetaDict=fileMetaDict,
                                                                           masterCatalogOnly=True)
            if not result['OK']:
                self.log.error("Could not transfer and register %s with metadata:\n %s"
                               % (fileName, metadata))
                cleanUp = True
                continue  # for users can continue even if one completely fails
            lfn = metadata['lfn']
            uploaded.append(lfn)
            # Even when using Failover, one needs to replicate to a second SE
            if replicateSE and self.replicateUserOutputData:
                replication[lfn] = (targetSE, replicateSE, fileMetaDict)

        # For files correctly uploaded must report LFNs to job parameters
        if uploaded:
            self.setJobParameter('UploadedOutputData', ', '.join(uploaded))

        # Now after all operations, retrieve potentially modified request object
        self.request = self.failoverTransfer.request

        # If some or all of the files failed to be saved to failover
        if cleanUp:
            self.workflow_commons['Request'] = self.request
            # Leave any uploaded files just in case it is useful for the user;
            # do not try to replicate any files.
            return S_ERROR("Failed To Upload Output Data")

        for lfn, (uploadedSE, repSE, fileMetaDictItem) in replication.items():
            self.failoverTransfer._setFileReplicationRequest(lfn, repSE, fileMetaDictItem, uploadedSE)

        self.workflow_commons['Request'] = self.failoverTransfer.request
        self.generateFailoverFile()
        self.setApplicationStatus("Job Finished Successfully")
        return S_OK('Output data uploaded')

    except Exception as e:  # pylint:disable=broad-except
        self.log.exception("Failure in UserJobFinalization execute module", lException=e)
        self.setApplicationStatus(repr(e))
        return S_ERROR(str(e))
    finally:
        super(UserJobFinalization, self).finalize(self.version)
class UserJobFinalization(ModuleBase):
    """Finalization of user jobs.

    Uploads the user-specified output data to an ordered list of storage
    elements, with failover storage and replication/recovery requests for
    anything that cannot be uploaded directly.
    """

    #############################################################################

    def __init__(self, bkClient=None, dm=None):
        """Module initialization. """
        self.log = gLogger.getSubLogger("UserJobFinalization")
        super(UserJobFinalization, self).__init__(self.log, bkClientIn=bkClient, dm=dm)
        self.version = __RCSID__
        self.enable = True
        # Default and failover SE groups resolved from the configuration
        self.defaultOutputSE = resolveSEGroup(
            gConfig.getValue('/Resources/StorageElementGroups/Tier1-USER', []))
        self.failoverSEs = resolveSEGroup(
            gConfig.getValue('/Resources/StorageElementGroups/Tier1-Failover', []))
        # List all parameters here
        self.request = None
        # Always allow any files specified by users
        self.outputDataFileMask = ''
        self.userOutputData = []
        self.userOutputSE = ''
        self.userOutputPath = ''
        self.failoverTransfer = None
        self.replicateUserOutputData = False
        self.userPrependString = ''

    #############################################################################

    def _resolveInputVariables(self):
        """ By convention the module parameters are resolved here. """
        super(UserJobFinalization, self)._resolveInputVariables()

        # Use LHCb utility for local running via dirac-jobexec
        if 'UserOutputData' in self.workflow_commons:
            userOutputData = self.workflow_commons['UserOutputData']
            if not isinstance(userOutputData, list):
                userOutputData = [i.strip() for i in userOutputData.split(';')]
            self.userOutputData = userOutputData

        if 'UserOutputSE' in self.workflow_commons:
            specifiedSE = self.workflow_commons['UserOutputSE']
            if isinstance(specifiedSE, list):
                # BUG FIX: a list-valued UserOutputSE used to be silently
                # ignored (only the string branch assigned the attribute)
                self.userOutputSE = specifiedSE
            else:
                self.userOutputSE = [i.strip() for i in specifiedSE.split(';')]
        else:
            self.log.verbose('No UserOutputSE specified, using default value: %s'
                             % (', '.join(self.defaultOutputSE)))
            self.userOutputSE = []

        if 'UserOutputPath' in self.workflow_commons:
            self.userOutputPath = self.workflow_commons['UserOutputPath']
        if self.workflow_commons.get('ReplicateUserOutputData'):
            self.replicateUserOutputData = True
        if 'UserOutputLFNPrepend' in self.workflow_commons:
            self.userPrependString = self.workflow_commons['UserOutputLFNPrepend']

    #############################################################################

    def execute(self, production_id=None, prod_job_id=None, wms_job_id=None,
                workflowStatus=None, stepStatus=None,
                wf_commons=None, step_commons=None,
                step_number=None, step_id=None, orderedSEs=None):
        """Main execution function.

        Uploads the user-requested output files to an ordered list of storage
        elements, falling back to the failover SEs (plus a recovery request)
        for any file that could not be uploaded directly.

        :param orderedSEs: optional pre-computed ordered list of destination
            SEs; when None it is derived from the local/user/default groups.
        :returns: S_OK on success (or when there is nothing to do), S_ERROR
            when some file could not even be saved to failover storage.
        """
        try:
            super(UserJobFinalization, self).execute(self.version, production_id, prod_job_id, wms_job_id,
                                                     workflowStatus, stepStatus,
                                                     wf_commons, step_commons, step_number, step_id)
            self._resolveInputVariables()

            # Earlier modules may have populated the report objects
            self.request.RequestName = 'job_%d_request.xml' % self.jobID
            self.request.JobID = self.jobID
            self.request.SourceComponent = "Job_%d" % self.jobID

            if not self._checkWFAndStepStatus():
                return S_OK()
            if not self.userOutputData:
                self.log.info("No user output data is specified for this job, nothing to do")
                return S_OK("No output data to upload")

            self.log.info("User specified output file list is: %s" % (', '.join(self.userOutputData)))

            # Check whether list of userOutputData contains globbable patterns;
            # "'*' in i" is exactly equivalent to the former re.search('\*', i)
            # and avoids the invalid '\*' escape sequence.
            globList = [i for i in self.userOutputData if '*' in i]
            if globList:
                for i in globList:
                    self.userOutputData.remove(i)
                globbedOutputList = list(set(getGlobbedFiles(globList)))
                if globbedOutputList:
                    self.log.info('Found a pattern in the output data file list, extra files to upload are: %s'
                                  % (', '.join(globbedOutputList)))
                    self.userOutputData += globbedOutputList
                else:
                    self.log.info("No files were found on the local disk for the following patterns: %s"
                                  % (', '.join(globList)))

            self.log.info("Final list of files to upload are: %s" % (', '.join(self.userOutputData)))

            # Determine the final list of possible output files for the
            # workflow and all the parameters needed to upload them.
            outputList = []
            for i in self.userOutputData:
                # BUG FIX: was ('.'.split(i)[-1]).upper(), which splits the
                # literal string '.' by the filename and never yields the
                # extension; the intent is the (uppercased) file extension.
                outputList.append({'outputDataType': i.split('.')[-1].upper(),
                                   'outputDataName': os.path.basename(i)})

            userOutputLFNs = []
            if self.userOutputData:
                self.log.info("Constructing user output LFN(s) for %s" % (', '.join(self.userOutputData)))
                userOutputLFNs = constructUserLFNs(self.jobID, self._getCurrentOwner(),
                                                   self.userOutputData, self.userOutputPath,
                                                   self.userPrependString)

            self.log.verbose("Calling getCandidateFiles( %s, %s, %s)"
                             % (outputList, userOutputLFNs, self.outputDataFileMask))
            try:
                fileDict = self.getCandidateFiles(outputList, userOutputLFNs, self.outputDataFileMask)
            except os.error as e:
                self.setApplicationStatus(e)
                return S_OK()
            try:
                fileMetadata = self.getFileMetadata(fileDict)
            except RuntimeError as e:
                self.setApplicationStatus(e)
                return S_OK()

            if not fileMetadata:
                self.log.info("No output data files were determined to be uploaded for this workflow")
                self.setApplicationStatus('No Output Data Files To Upload')
                return S_OK()

            if not orderedSEs:
                orderedSEs = self._getOrderedSEsList()
            self.log.info("Ordered list of output SEs is: %s" % (', '.join(orderedSEs)))

            final = {}
            for fileName, metadata in fileMetadata.items():
                final[fileName] = metadata
                final[fileName]['resolvedSE'] = orderedSEs

            # At this point can exit and see exactly what the module would upload
            if not self._enableModule():
                self.log.info("Module disabled would have attempted to upload the files %s"
                              % ', '.join(final.keys()))
                for fileName, metadata in final.items():
                    self.log.info('--------%s--------' % fileName)
                    for n, v in metadata.items():
                        self.log.info('%s = %s' % (n, v))
                return S_OK("Module is disabled by control flag")

            # Disable the watchdog check in case the file uploading takes a long time
            self._disableWatchdogCPUCheck()

            # Instantiate the failover transfer client with the global request object
            if not self.failoverTransfer:
                self.failoverTransfer = FailoverTransfer(self.request)

            # One by one upload the files with failover if necessary
            replication = {}
            failover = {}
            uploaded = []
            for fileName, metadata in final.items():
                self.log.info("Attempting to store %s to the following SE(s): %s"
                              % (fileName, ', '.join(metadata['resolvedSE'])))
                fileMetaDict = {'Size': metadata['filedict']['Size'],
                                'LFN': metadata['filedict']['LFN'],
                                'GUID': metadata['filedict']['GUID'],
                                'Checksum': metadata['filedict']['Checksum'],
                                'ChecksumType': metadata['filedict']['ChecksumType']}
                result = self.failoverTransfer.transferAndRegisterFile(fileName=fileName,
                                                                       localPath=metadata['localpath'],
                                                                       lfn=metadata['filedict']['LFN'],
                                                                       destinationSEList=metadata['resolvedSE'],
                                                                       fileMetaDict=fileMetaDict,
                                                                       masterCatalogOnly=True)
                if not result['OK']:
                    self.log.error("Could not transfer and register %s with metadata:\n %s"
                                   % (fileName, metadata))
                    failover[fileName] = metadata
                    continue
                # Only attempt replication after successful upload
                lfn = metadata['lfn']
                uploaded.append(lfn)
                replicateSE = ''
                uploadedSE = result['Value'].get('uploadedSE', '')
                if uploadedSE:
                    # First SE in the resolved list that was not already used
                    for se in metadata['resolvedSE']:
                        if se != uploadedSE:
                            replicateSE = se
                            break
                if replicateSE and lfn and self.replicateUserOutputData:
                    self.log.info("Will attempt to replicate %s to %s" % (lfn, replicateSE))
                    replication[lfn] = (uploadedSE, replicateSE, fileMetaDict)

            # Retry everything that failed above against the failover SEs
            cleanUp = False
            for fileName, metadata in failover.items():
                random.shuffle(self.failoverSEs)
                targetSE = metadata['resolvedSE'][0]
                replicateSE = metadata['resolvedSE'][1] if len(metadata['resolvedSE']) > 1 else ''
                metadata['resolvedSE'] = self.failoverSEs
                fileMetaDict = {'Size': metadata['filedict']['Size'],
                                'LFN': metadata['filedict']['LFN'],
                                'GUID': metadata['filedict']['GUID']}
                result = self.failoverTransfer.transferAndRegisterFileFailover(fileName,
                                                                               metadata['localpath'],
                                                                               metadata['lfn'],
                                                                               targetSE,
                                                                               metadata['resolvedSE'],
                                                                               fileMetaDict=fileMetaDict,
                                                                               masterCatalogOnly=True)
                if not result['OK']:
                    self.log.error("Could not transfer and register %s with metadata:\n %s"
                                   % (fileName, metadata))
                    cleanUp = True
                    continue  # for users can continue even if one completely fails
                lfn = metadata['lfn']
                uploaded.append(lfn)
                # Even when using Failover, one needs to replicate to a second SE
                if replicateSE and self.replicateUserOutputData:
                    replication[lfn] = (targetSE, replicateSE, fileMetaDict)

            # For files correctly uploaded must report LFNs to job parameters
            if uploaded:
                self.setJobParameter('UploadedOutputData', ', '.join(uploaded))

            # Now after all operations, retrieve potentially modified request object
            self.request = self.failoverTransfer.request

            # If some or all of the files failed to be saved to failover
            if cleanUp:
                self.workflow_commons['Request'] = self.request
                # Leave any uploaded files just in case it is useful for the
                # user; do not try to replicate any files.
                return S_ERROR("Failed To Upload Output Data")

            for lfn, (uploadedSE, repSE, fileMetaDictItem) in replication.items():
                self.failoverTransfer._setFileReplicationRequest(lfn, repSE, fileMetaDictItem, uploadedSE)

            self.workflow_commons['Request'] = self.failoverTransfer.request
            self.generateFailoverFile()
            self.setApplicationStatus("Job Finished Successfully")
            return S_OK('Output data uploaded')

        except Exception as e:  # pylint:disable=broad-except
            self.log.exception("Failure in UserJobFinalization execute module", lException=e)
            self.setApplicationStatus(repr(e))
            return S_ERROR(str(e))
        finally:
            super(UserJobFinalization, self).finalize(self.version)

    #############################################################################

    def _getOrderedSEsList(self):
        """ Returns list of ordered SEs to which trying to upload """
        # FIXME: remove all banned SEs (not the force ones)
        # First get the local (or assigned) SE to try first for upload,
        # and the others in random fashion
        localSEs = set(getDestinationSEList('Tier1-USER', self.siteName, outputmode='local'))
        self.log.verbose("Site Local SE for user outputs is: %s" % (list(localSEs)))
        userSEs = set(self.userOutputSE)
        otherSEs = set(self.defaultOutputSE) - localSEs - userSEs
        # If a user SE is local set it first
        topSEs = userSEs & localSEs
        # reordered user SEs, setting local first
        userSEs = list(topSEs) + list(userSEs - topSEs)
        localSEs = list(localSEs - topSEs)
        if len(userSEs) < 2 and localSEs:
            # Set a local SE first
            orderedSEs = localSEs[0:1] + userSEs + localSEs[1:]
        else:
            orderedSEs = userSEs + localSEs
        # BUG FIX: random.shuffle(list(otherSEs)) shuffled a throwaway
        # temporary and had no effect; materialize once and shuffle in place.
        otherSEs = list(otherSEs)
        random.shuffle(otherSEs)
        orderedSEs += otherSEs
        return orderedSEs

    def _getCurrentOwner(self):
        """Simple function to return current DIRAC username.

        :raises RuntimeError: when proxy information cannot be obtained while
            the module is enabled.
        """
        if 'OwnerName' in self.workflow_commons:
            return self.workflow_commons['OwnerName']
        result = getProxyInfo()
        if not result['OK']:
            if not self._enableModule():
                # Local/disabled test runs have no proxy: use a dummy user
                return 'testUser'
            raise RuntimeError('Could not obtain proxy information')
        if 'username' not in result['Value']:
            raise RuntimeError('Could not get username from proxy')
        return result['Value']['username']
def execute(self, production_id=None, prod_job_id=None, wms_job_id=None,
            workflowStatus=None, stepStatus=None,
            wf_commons=None, step_commons=None,
            step_number=None, step_id=None):
    """Main execution method.

    Collects the relevant log files of the job, uploads them to the LogSE
    and, if that fails, tars them up and ships the tarball to a failover SE.
    Log-handling problems never fail the job: most error paths return S_OK.

    :returns: S_OK in (almost) all cases; S_ERROR only on unexpected exceptions.
    """
    try:
        super(UploadLogFile, self).execute(self.version, production_id, prod_job_id, wms_job_id,
                                           workflowStatus, stepStatus,
                                           wf_commons, step_commons, step_number, step_id)
        self._resolveInputVariables()

        self.request.RequestName = 'job_%d_request.xml' % self.jobID
        self.request.JobID = self.jobID
        self.request.SourceComponent = "Job_%d" % self.jobID

        res = systemCall(0, shlex.split('ls -al'))
        if res['OK'] and res['Value'][0] == 0:
            self.log.info('The contents of the working directory...')
            self.log.info(str(res['Value'][1]))
        else:
            self.log.error('Failed to list the log directory', str(res['Value'][2]))

        self.log.info('PRODUCTION_ID = %s, JOB_ID = %s ' % (self.production_id, self.prod_job_id))
        self.logdir = os.path.realpath('./job/log/%s/%s' % (self.production_id, self.prod_job_id))
        self.log.info('Selected log files will be temporarily stored in %s' % self.logdir)

        ##########################################
        # First determine the files which should be saved
        self.log.info('Determining the files to be saved in the logs.')
        res = self._determineRelevantFiles()
        if not res['OK']:
            self.log.error('Completely failed to select relevant log files.', res['Message'])
            return S_OK()
        selectedFiles = res['Value']
        self.log.info('The following %s files were selected to be saved:\n%s'
                      % (len(selectedFiles), '\n'.join(selectedFiles)))

        #########################################
        # Create a temporary directory containing these files
        self.log.info('Populating a temporary directory for selected files.')
        res = self.__populateLogDirectory(selectedFiles)
        if not res['OK']:
            self.log.error('Completely failed to populate temporary log file directory.', res['Message'])
            self.setApplicationStatus('Failed To Populate Log Dir')
            return S_OK()
        self.log.info('%s populated with log files.' % self.logdir)

        #########################################
        # Make sure all the files in the log directory have the correct permissions
        result = self.__setLogFilePermissions(self.logdir)
        if not result['OK']:
            self.log.error('Could not set permissions of log files to 0755 with message:\n%s'
                           % (result['Message']))

        # Instantiate the failover transfer client with the global request object
        if not self.failoverTransfer:
            self.failoverTransfer = FailoverTransfer(self.request)

        #########################################
        if not self._enableModule():
            self.log.info("Would have attempted to upload log files, but there's not JobID")
            return S_OK()

        # Attempt to upload logs to the LogSE
        self.log.info('Transferring log files to the %s' % self.logSE)
        res = returnSingleResult(StorageElement(self.logSE).getURL(self.logFilePath, protocol='https'))
        if not res['OK']:
            self.log.warn("Could not get dynamic URL for log", res)
            logHttpsURL = "http://lhcb-logs.cern.ch/storage%s" % self.logFilePath
        else:
            logHttpsURL = res['Value']
        logURL = '<a href="%s">Log file directory</a>' % logHttpsURL
        self.log.info('Logs for this job may be retrieved from %s' % logURL)

        self.log.info('putDirectory %s %s %s'
                      % (self.logFilePath, os.path.realpath(self.logdir), self.logSE))
        res = returnSingleResult(StorageElement(self.logSE).putDirectory(
            {self.logFilePath: os.path.realpath(self.logdir)}))
        self.log.verbose(res)
        self.setJobParameter('Log URL', logURL)

        if res['OK']:
            self.log.info('Successfully upload log directory to %s' % self.logSE)
        else:
            self.log.error("Failed to upload log files with message '%s', uploading to failover SE"
                           % res['Message'])
            # make a tar file
            tarFileName = os.path.basename(self.logLFNPath)
            try:
                res = tarFiles(tarFileName, selectedFiles, compression='gz')
                if not res['OK']:
                    self.log.error('Failed to create tar of log files: %s' % res['Message'])
                    self.setApplicationStatus('Failed to create tar of log files')
                    # We do not fail the job for this case
                    return S_OK()
            except IOError as e:
                # BUG FIX: this handler used to log res['Message'], i.e. the
                # message of the unrelated putDirectory result; log the actual
                # exception raised by tarFiles instead.
                self.log.error('Failed to create tar of log files: %s' % repr(e))
                self.setApplicationStatus('Failed to create tar of log files')
                # We do not fail the job for this case
                return S_OK()
            self._uploadLogToFailoverSE(tarFileName)

        self.workflow_commons['Request'] = self.request
        return S_OK("Log Files uploaded")

    except Exception as e:  # pylint:disable=broad-except
        self.log.exception("Failure in UploadLogFile execute module", lException=e)
        return S_ERROR(str(e))
    finally:
        super(UploadLogFile, self).finalize(self.version)
class UploadLogFile(ModuleBase): """ Upload to LogSE """ ############################################################################# def __init__(self, bkClient=None, dm=None): """Module initialization. """ self.log = gLogger.getSubLogger("UploadLogFile") super(UploadLogFile, self).__init__(self.log, bkClientIn=bkClient, dm=dm) self.version = __RCSID__ self.logSE = self.opsH.getValue('LogStorage/LogSE', 'LogSE') self.logSizeLimit = self.opsH.getValue('LogFiles/SizeLimit', 1 * 1024 * 1024) self.logExtensions = self.opsH.getValue('LogFiles/Extensions', []) self.logFilePath = '' self.logLFNPath = '' self.logdir = '' self.failoverTransfer = None self.failoverSEs = [] ###################################################################### def _resolveInputVariables(self): super(UploadLogFile, self)._resolveInputVariables() if 'LogTargetPath' in self.workflow_commons: self.logLFNPath = self.workflow_commons['LogTargetPath'] else: self.log.info('LogFilePath parameter not found, creating on the fly') result = getLogPath(self.workflow_commons, self.bkClient) if not result['OK']: self.log.error('Could not create LogFilePath', result['Message']) return result self.logLFNPath = result['Value']['LogTargetPath'][0] if not isinstance(self.logLFNPath, str): self.logLFNPath = self.logLFNPath[0] ###################################################################### def execute(self, production_id=None, prod_job_id=None, wms_job_id=None, workflowStatus=None, stepStatus=None, wf_commons=None, step_commons=None, step_number=None, step_id=None): """ Main executon method """ try: super(UploadLogFile, self).execute(self.version, production_id, prod_job_id, wms_job_id, workflowStatus, stepStatus, wf_commons, step_commons, step_number, step_id) self._resolveInputVariables() self.request.RequestName = 'job_%d_request.xml' % self.jobID self.request.JobID = self.jobID self.request.SourceComponent = "Job_%d" % self.jobID res = systemCall(0, shlex.split('ls -al')) if res['OK'] and 
res['Value'][0] == 0: self.log.info('The contents of the working directory...') self.log.info(str(res['Value'][1])) else: self.log.error('Failed to list the log directory', str(res['Value'][2])) self.log.info('PRODUCTION_ID = %s, JOB_ID = %s ' % (self.production_id, self.prod_job_id)) self.logdir = os.path.realpath('./job/log/%s/%s' % (self.production_id, self.prod_job_id)) self.log.info('Selected log files will be temporarily stored in %s' % self.logdir) ########################################## # First determine the files which should be saved self.log.info('Determining the files to be saved in the logs.') res = self._determineRelevantFiles() if not res['OK']: self.log.error('Completely failed to select relevant log files.', res['Message']) return S_OK() selectedFiles = res['Value'] self.log.info('The following %s files were selected to be saved:\n%s' % (len(selectedFiles), '\n'.join(selectedFiles))) ######################################### # Create a temporary directory containing these files self.log.info('Populating a temporary directory for selected files.') res = self.__populateLogDirectory(selectedFiles) if not res['OK']: self.log.error('Completely failed to populate temporary log file directory.', res['Message']) self.setApplicationStatus('Failed To Populate Log Dir') return S_OK() self.log.info('%s populated with log files.' 
% self.logdir)

    #########################################
    # Make sure all the files in the log directory have the correct permissions
    result = self.__setLogFilePermissions(self.logdir)
    if not result['OK']:
      self.log.error('Could not set permissions of log files to 0755 with message:\n%s' % (result['Message']))

    # Instantiate the failover transfer client with the global request object
    if not self.failoverTransfer:
      self.failoverTransfer = FailoverTransfer(self.request)

    #########################################
    if not self._enableModule():
      self.log.info("Would have attempted to upload log files, but there's not JobID")
      return S_OK()

    # Attempt to uplaod logs to the LogSE
    self.log.info('Transferring log files to the %s' % self.logSE)
    res = returnSingleResult(StorageElement(self.logSE).getURL(self.logFilePath, protocol='https'))
    if not res['OK']:
      # Fall back to a statically constructed URL when the SE cannot give a dynamic one
      self.log.warn("Could not get dynamic URL for log", res)
      logHttpsURL = "http://lhcb-logs.cern.ch/storage%s" % self.logFilePath
    else:
      logHttpsURL = res['Value']
    logURL = '<a href="%s">Log file directory</a>' % logHttpsURL
    self.log.info('Logs for this job may be retrieved from %s' % logURL)
    self.log.info('putDirectory %s %s %s' % (self.logFilePath, os.path.realpath(self.logdir), self.logSE))
    res = returnSingleResult(StorageElement(self.logSE).putDirectory(
        {self.logFilePath: os.path.realpath(self.logdir)}))
    self.log.verbose(res)
    self.setJobParameter('Log URL', logURL)
    if res['OK']:
      self.log.info('Successfully upload log directory to %s' % self.logSE)
    else:
      # LogSE upload failed: tar the logs and ship them to a failover SE instead
      self.log.error("Failed to upload log files with message '%s', uploading to failover SE" % res['Message'])
      # make a tar file
      tarFileName = os.path.basename(self.logLFNPath)
      try:
        res = tarFiles(tarFileName, selectedFiles, compression='gz')
        if not res['OK']:
          self.log.error('Failed to create tar of log files: %s' % res['Message'])
          self.setApplicationStatus('Failed to create tar of log files')
          # We do not fail the job for this case
          return S_OK()
      except IOError:
        self.log.error('Failed to create tar of log files: %s' % res['Message'])
        self.setApplicationStatus('Failed to create tar of log files')
        # We do not fail the job for this case
        return S_OK()
      self._uploadLogToFailoverSE(tarFileName)

    self.workflow_commons['Request'] = self.request
    return S_OK("Log Files uploaded")

    except Exception as e:  # pylint:disable=broad-except
      self.log.exception("Failure in UploadLogFile execute module", lException=e)
      return S_ERROR(str(e))
    finally:
      super(UploadLogFile, self).finalize(self.version)

  #############################################################################

  def _uploadLogToFailoverSE(self, tarFileName):
    """ Recover the logs to a failover storage element.

    Uploads *tarFileName* (the gzipped log tarball) to one of the
    Tier1-Failover SEs and, on success, creates the LogUpload/RemoveFile
    request that will later move the tarball to the real LogSE.
    Never raises; failures only set the application status.
    """
    # here because self.siteName is not known until execute() is invoked
    self.failoverSEs = getDestinationSEList('Tier1-Failover', self.siteName, outputmode='Any')
    random.shuffle(self.failoverSEs)
    self.log.info("Attempting to store file %s to the following SE(s):\n%s" % (tarFileName,
                                                                               ', '.join(self.failoverSEs)))
    fileDict = {tarFileName: {'lfn': self.logLFNPath, 'workflowSE': self.failoverSEs}}
    metadata = self.getFileMetadata(fileDict)
    # NOTE(review): getFileMetadata() is assumed here to return the metadata dict
    # directly (not an S_OK/S_ERROR structure) — confirm against the base class.
    fileMetaDict = {'Size': metadata[tarFileName]['filedict']['Size'],
                    'LFN': metadata[tarFileName]['filedict']['LFN'],
                    'GUID': metadata[tarFileName]['filedict']['GUID'],
                    'Checksum': metadata[tarFileName]['filedict']['Checksum'],
                    'ChecksumType': metadata[tarFileName]['filedict']['ChecksumType']}

    result = self.failoverTransfer.transferAndRegisterFile(fileName=tarFileName,
                                                           localPath='%s/%s' % (os.getcwd(), tarFileName),
                                                           lfn=self.logLFNPath,
                                                           destinationSEList=self.failoverSEs,
                                                           fileMetaDict=fileMetaDict,
                                                           masterCatalogOnly=True)
    if not result['OK']:
      self.log.error("Failed to upload logs to all failover destinations (the job will not fail for this reason")
      self.setApplicationStatus('Failed To Upload Logs')
    else:
      uploadedSE = result['Value']['uploadedSE']
      self.log.info("Uploaded logs to failover SE %s" % uploadedSE)
      # Pick up the (possibly extended) request from the failover transfer client
      self.request = self.failoverTransfer.request
      self.__createLogUploadRequest(self.logSE, self.logLFNPath, uploadedSE)
      self.log.info("Successfully created failover request")

  #############################################################################

  def _determineRelevantFiles(self):
    """ The files which are below a configurable size will be stored in the logs.
        This will typically pick up everything in the working directory
        minus the output data files.

    :returns: S_OK(list of selected file names) or S_ERROR
    """
    logFileExtensions = ['*.txt', '*.log', '*.out', '*.output', '*.xml', '*.sh', '*.info', '*.err',
                         'prodConf*.py']  # '*.root',
    if self.logExtensions:
      self.log.info('Using list of log extensions from CS:\n%s' % (', '.join(self.logExtensions)))
      logFileExtensions = self.logExtensions
    else:
      self.log.info('Using default list of log extensions:\n%s' % (', '.join(logFileExtensions)))

    # Collect every locally existing file matching one of the wildcards
    candidateFiles = []
    for ext in logFileExtensions:
      self.log.debug('Looking at log file wildcard: %s' % ext)
      globList = glob.glob(ext)
      for check in globList:
        if os.path.isfile(check):
          self.log.debug('Found locally existing log file: %s' % check)
          candidateFiles.append(check)

    selectedFiles = []
    try:
      for candidate in candidateFiles:
        fileSize = os.stat(candidate)[6]  # index 6 == st_size
        if fileSize < self.logSizeLimit:
          selectedFiles.append(candidate)
        else:
          # Oversized files are gzipped individually and the archive is kept instead
          self.log.info('Log file found to be greater than maximum of %s bytes, compressing' % self.logSizeLimit)
          tarFileName = os.path.basename(candidate) + '.gz'
          tarFiles(tarFileName, [candidate], compression='gz')
          selectedFiles.append(tarFileName)
      return S_OK(selectedFiles)
    except OSError as x:
      self.log.exception('Exception while determining files to save.', '', str(x))
      return S_ERROR('Could not determine log files')

  #############################################################################

  def __populateLogDirectory(self, selectedFiles):
    """ A temporary directory is created for all the selected files.
        These files are then copied into this directory before being uploaded

    :param selectedFiles: local file names to copy into ``self.logdir``
    :returns: S_OK() if at least one file ended up in the directory, else S_ERROR()
    """
    # Create the temporary directory
    mkDir(self.logdir)
    # Set proper permissions
    self.log.info('Changing log directory permissions to 0755')
    try:
      # NOTE(review): '+' instead of '|' for the last two flags works only because
      # the bits are distinct; '|' would be the conventional spelling.
      os.chmod(self.logdir, stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH + stat.S_IXOTH)
    except OSError as x:
      self.log.error('Could not set logdir permissions to 0755:', '%s (%s)' % (self.logdir, str(x)))
    # Populate the temporary directory
    try:
      for fileS in selectedFiles:
        destinationFile = '%s/%s' % (self.logdir, os.path.basename(fileS))
        shutil.copy(fileS, destinationFile)
    except shutil.Error:
      # shutil.copy raises this when source and destination are the same file
      self.log.warn('scr and dst are the same')
    except IOError as x:
      self.log.exception('Exception while trying to copy file.', fileS, str(x))
      self.log.info('File %s will be skipped and can be considered lost.' % fileS)
    # Now verify the contents of our target log dir
    successfulFiles = os.listdir(self.logdir)
    if not successfulFiles:
      self.log.info('Failed to copy any files to the target directory.')
      return S_ERROR()

    self.log.info('Prepared %s files in the temporary directory.' % self.logdir)
    return S_OK()

  #############################################################################

  def __createLogUploadRequest(self, targetSE, logFileLFN, uploadedSE):
    """ Set a request to upload job log files from the output sandbox.

    Adds two operations to ``self.request``: a LogUpload of *logFileLFN* to
    *targetSE*, followed by a RemoveFile of the temporary copy on *uploadedSE*.
    """
    self.log.info('Setting log upload request for %s at %s' % (logFileLFN, targetSE))
    logUpload = Operation()
    logUpload.Type = 'LogUpload'
    logUpload.TargetSE = targetSE

    logFile = File()
    logFile.LFN = logFileLFN

    logUpload.addFile(logFile)
    self.request.addOperation(logUpload)

    logRemoval = Operation()
    logRemoval.Type = 'RemoveFile'
    logRemoval.TargetSE = uploadedSE
    # The same File object is shared by both operations
    logRemoval.addFile(logFile)
    self.request.addOperation(logRemoval)

  #############################################################################

  def __setLogFilePermissions(self, logDir):
    """ Sets the permissions of all the files in the log directory to ensure
        they are readable.

    :param logDir: directory whose (non-symlink) entries are chmod'ed to 0755
    :returns: S_OK() or S_ERROR(exception)
    """
    try:
      for toChange in os.listdir(logDir):
        if not os.path.islink('%s/%s' % (logDir, toChange)):
          self.log.debug('Changing permissions of %s/%s to 0755' % (logDir, toChange))
          os.chmod('%s/%s' % (logDir, toChange),
                   stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH + stat.S_IXOTH)
    except OSError as x:
      self.log.error('Problem changing shared area permissions', str(x))
      return S_ERROR(x)

    return S_OK()
def execute(self):
    """ Main execution function.

    Runs only at the last workflow step: resolves the user output file list,
    constructs user LFNs, uploads each file to the ordered SE list (with
    failover), optionally triggers replication, and stores the accumulated
    failover request.

    :returns: S_OK/S_ERROR following the DIRAC return-value convention
    """
    #Have to work out if the module is part of the last step i.e.
    #user jobs can have any number of steps and we only want
    #to run the finalization once.
    currentStep = int(self.step_commons['STEP_NUMBER'])
    totalSteps = int(self.workflow_commons['TotalSteps'])
    if currentStep == totalSteps:
      self.lastStep = True
    else:
      self.log.verbose('Current step = %s, total steps of workflow = %s, UserJobFinalization will enable itself only \
at the last workflow step.' % (currentStep, totalSteps))

    if not self.lastStep:
      #Not last step, do nothing, proceed happily.
      return S_OK()

    result = self.resolveInputVariables()
    if not result['OK']:
      self.log.error("Failed to resolve input parameters:", result['Message'])
      return result

    self.log.info('Initializing %s' % self.version)
    if not self.workflowStatus['OK'] or not self.stepStatus['OK']:
      ##Something went wrong in the step or the workflow, do nothing.
      self.log.verbose('Workflow status = %s, step status = %s' % (self.workflowStatus['OK'],
                                                                   self.stepStatus['OK']))
      return S_OK('No output data upload attempted')

    # Tag the global request with this job's identity
    self.request.RequestName = 'job_%d_request.xml' % int(self.jobID)
    self.request.JobID = self.jobID
    self.request.SourceComponent = "Job_%d" % int(self.jobID)

    if not self.userOutputData:
      self.log.info('No user output data is specified for this job, nothing to do')
      return S_OK('No output data to upload')

    #Determine the final list of possible output files for the
    #workflow and all the parameters needed to upload them.
    outputList = []
    possible_files= []
    for i in self.userOutputData:
      files = getGlobbedFiles(i)
      for possible_file in files:
        # NOTE(review): membership is tested on the (possibly path-qualified)
        # name but only the basename is appended below — the duplicate check
        # may not fire for path-qualified matches; confirm intended.
        if possible_file in possible_files:
          #Don't have twice the same file
          continue
        outputList.append({'outputDataType' : i.split('.')[-1].upper(),#this would be used to sort the files in different dirs
                           'outputDataSE' : self.userOutputSE,
                           'outputFile' : os.path.basename(possible_file)})
        possible_files.append(os.path.basename(possible_file))

    self.log.info('Constructing user output LFN(s) for %s' % (', '.join(self.userOutputData)))
    if not self.jobID:
      # Local/test runs have no JobID; use a placeholder so LFNs can be built
      self.jobID = 12345
    owner = ''
    if 'Owner' in self.workflow_commons:
      owner = self.workflow_commons['Owner']
    else:
      res = getCurrentOwner()
      if not res['OK']:
        self.log.error('Could not find proxy')
        return S_ERROR('Could not obtain owner from proxy')
      owner = res['Value']
    vo = ''
    if self.workflow_commons.has_key('VO'):
      vo = self.workflow_commons['VO']
    else:
      res = getVOfromProxyGroup()
      if not res['OK']:
        self.log.error('Failed finding the VO')
        return S_ERROR('Could not obtain VO from proxy')
      vo = res['Value']
    result = constructUserLFNs(int(self.jobID), vo, owner, possible_files, self.userOutputPath)
    if not result['OK']:
      self.log.error('Could not create user LFNs', result['Message'])
      return result
    userOutputLFNs = result['Value']

    self.log.verbose('Calling getCandidateFiles( %s, %s)' % (outputList, userOutputLFNs))
    result = self.getCandidateFiles(outputList, userOutputLFNs)
    if not result['OK']:
      # Application errors are only fatal when ignoreapperrors is unset
      if not self.ignoreapperrors:
        self.log.error(result['Message'])
        self.setApplicationStatus(result['Message'])
        return S_OK()
    fileDict = result['Value']

    result = self.getFileMetadata(fileDict)
    if not result['OK']:
      if not self.ignoreapperrors:
        self.log.error(result['Message'])
        self.setApplicationStatus(result['Message'])
        return S_OK()

    if not result['Value']:
      if not self.ignoreapperrors:
        self.log.info('No output data files were determined to be uploaded for this workflow')
        self.setApplicationStatus('No Output Data Files To Upload')
      return S_OK()
    fileMetadata = result['Value']

    orderedSEs = self.userOutputSE

    self.log.info('Ordered list of output SEs is: %s' % (', '.join(orderedSEs)))
    final = {}
    for fileName, metadata in fileMetadata.items():
      final[fileName] = metadata
      final[fileName]['resolvedSE'] = orderedSEs

    #At this point can exit and see exactly what the module will upload
    if not self.enable:
      self.log.info('Module is disabled by control flag, would have attempted \
to upload the following files %s' % ', '.join(final.keys()))
      for fileName, metadata in final.items():
        self.log.info('--------%s--------' % fileName)
        for n, v in metadata.items():
          self.log.info('%s = %s' %(n, v))
      return S_OK('Module is disabled by control flag')

    #Instantiate the failover transfer client with the global request object
    failoverTransfer = FailoverTransfer(self.request)

    #One by one upload the files with failover if necessary
    replication = {}
    failover = {}
    uploaded = []
    if not self.failoverTest:
      for fileName, metadata in final.items():
        self.log.info("Attempting to store file %s to the following SE(s):\n%s" % (fileName,
                                                                                   ', '.join(metadata['resolvedSE'])))
        replicateSE = ''
        result = failoverTransfer.transferAndRegisterFile(fileName, metadata['localpath'], metadata['lfn'],
                                                          metadata['resolvedSE'], fileMetaDict = metadata,
                                                          fileCatalog = self.userFileCatalog)
        if not result['OK']:
          self.log.error('Could not transfer and register %s with metadata:\n %s' % (fileName, metadata))
          failover[fileName] = metadata
        else:
          #Only attempt replication after successful upload
          lfn = metadata['lfn']
          uploaded.append(lfn)
          seList = metadata['resolvedSE']
          if result['Value'].has_key('uploadedSE'):
            uploadedSE = result['Value']['uploadedSE']
            # Pick the first SE in the list that is not the one already used
            for se in seList:
              if not se == uploadedSE:
                replicateSE = se
                break
        # NOTE(review): if the very first transfer fails, `lfn` is unbound
        # here — potential NameError; confirm whether this check belongs in
        # the else-branch above.
        if replicateSE and lfn:
          self.log.info('Will attempt to replicate %s to %s' % (lfn, replicateSE))
          replication[lfn] = replicateSE
    else:
      # Failover-test mode: send everything straight to the failover path
      failover = final

    cleanUp = False
    for fileName, metadata in failover.items():
      random.shuffle(self.failoverSEs)
      targetSE = metadata['resolvedSE'][0]
      metadata['resolvedSE'] = self.failoverSEs
      result = failoverTransfer.transferAndRegisterFileFailover(fileName, metadata['localpath'], metadata['lfn'],
                                                                targetSE, self.failoverSEs,
                                                                fileMetaDict = metadata,
                                                                fileCatalog = self.userFileCatalog)
      if not result['OK']:
        self.log.error('Could not transfer and register %s with metadata:\n %s' % (fileName, metadata))
        cleanUp = True
        continue #for users can continue even if one completely fails
      else:
        lfn = metadata['lfn']
        uploaded.append(lfn)

    #For files correctly uploaded must report LFNs to job parameters
    if uploaded:
      report = ', '.join( uploaded )
      self.jobReport.setJobParameter( 'UploadedOutputData', report )

    # Pick up the (possibly extended) request object from the transfer client
    self.request = failoverTransfer.request

    #If some or all of the files failed to be saved to failover
    if cleanUp:
      self.workflow_commons['Request'] = self.request
      #Leave any uploaded files just in case it is useful for the user
      #do not try to replicate any files.
      return S_ERROR('Failed To Upload Output Data')

    #If there is now at least one replica for uploaded files can trigger replication
    rm = ReplicaManager()
    self.log.info('Sleeping for 10 seconds before attempting replication of recently uploaded files')
    time.sleep(10)
    for lfn, repSE in replication.items():
      result = rm.replicateAndRegister(lfn, repSE, catalog = self.userFileCatalog)
      if not result['OK']:
        # Replication is best-effort: the file already has one good replica
        self.log.info('Replication failed with below error but file already exists in Grid storage with \
at least one replica:\n%s' % (result))

    self.workflow_commons['Request'] = self.request
    self.generateFailoverFile()

    self.setApplicationStatus('Job Finished Successfully')
    return S_OK('Output data uploaded')
def execute(self):
    """ Main execution function for HandleProdOutputData.

    Runs only at the last workflow step. Builds the LFN for each produced
    file (generator / Mokka / SLIC / Marlin / LCSim outputs), uploads them
    to ``self.destination`` with failover, and records uploaded LFNs as a
    job parameter.

    :returns: S_OK/S_ERROR following the DIRAC return-value convention
    """
    #Have to work out if the module is part of the last step i.e.
    #user jobs can have any number of steps and we only want
    #to run the finalization once.
    currentStep = int(self.step_commons['STEP_NUMBER'])
    totalSteps = int(self.workflow_commons['TotalSteps'])
    if currentStep==totalSteps:
      self.lastStep=True
    else:
      self.log.verbose('Current step = %s, total steps of workflow = %s, HandleProdOutputData will enable itself only at the last workflow step.' %(currentStep,totalSteps))

    if not self.lastStep:
      return S_OK()

    self.result =self.resolveInputVariables()
    if not self.result['OK']:
      self.log.error(self.result['Message'])
      return self.result

    ###Instantiate object that will ensure that the files are registered properly
    failoverTransfer = FailoverTransfer(self.request)

    # Map each local file name to its LFN, type and destination SE
    datatohandle = {}
    if self.generatorfile:
      if not os.path.exists(self.generatorfile):
        return S_ERROR("File %s does not exist, something went wrong before !"%(self.generatorfile))
      self.attributesdict['DataType'] = 'gen'
      lfnpath = string.join([self.basepath,self.attributesdict['Machine'],self.attributesdict['Energy'],
                             self.attributesdict['DataType'],self.attributesdict['EvtType'],self.attributesdict['ProdID'],
                             self.generatorfile],"/")
      datatohandle[self.generatorfile]={'lfn':lfnpath,'type':'gen','workflowSE':self.destination}
    if self.mokkafile or self.slicfile:
      # Exactly one of the two simulation outputs is expected to be present
      recofile = ''
      if self.mokkafile and not os.path.exists(self.mokkafile):
        return S_ERROR("File %s does not exist, something went wrong before !"%(self.mokkafile))
      else:
        recofile = self.mokkafile
      if self.slicfile and not os.path.exists(self.slicfile):
        return S_ERROR("File %s does not exist, something went wrong before !"%(self.slicfile))
      else:
        recofile = self.slicfile
      self.attributesdict['DataType'] = 'SIM'
      lfnpath = string.join([self.basepath,self.attributesdict['Machine'],self.attributesdict['Energy'],
                             self.attributesdict['DetectorModel'],self.attributesdict['DataType'],self.attributesdict['EvtType'],
                             self.attributesdict['ProdID'],recofile],"/")
      datatohandle[recofile]={'lfn':lfnpath,'type':'gen','workflowSE':self.destination}
    ##Below, look in file name if it contain REC or DST, to determine the data type.
    if self.marlinfiles:
      for file in self.marlinfiles:
        if file.find("REC")>-1:
          self.attributesdict['DataType'] = 'REC'
        if file.find("DST")>-1:
          self.attributesdict['DataType'] = 'DST'
        lfnpath = string.join([self.basepath,self.attributesdict['Machine'],self.attributesdict['Energy'],
                               self.attributesdict['DetectorModel'],self.attributesdict['DataType'],self.attributesdict['EvtType'],
                               self.attributesdict['ProdID'],file],"/")
        datatohandle[file]={'lfn':lfnpath,'type':'gen','workflowSE':self.destination}
    if self.lcsimfiles:
      for file in self.lcsimfiles:
        if file.find("DST")>-1:
          self.attributesdict['DataType'] = 'DST'
          lfnpath = string.join([self.basepath,self.attributesdict['Machine'],self.attributesdict['Energy'],
                                 self.attributesdict['DetectorModel'],self.attributesdict['DataType'],self.attributesdict['EvtType'],
                                 self.attributesdict['ProdID'],file],"/")
          datatohandle[file]={'lfn':lfnpath,'type':'gen','workflowSE':self.destination}

    result = self.getFileMetadata(datatohandle)
    if not result['OK']:
      self.setApplicationStatus(result['Message'])
      return S_OK()
    fileMetadata = result['Value']

    final = {}
    for fileName,metadata in fileMetadata.items():
      final[fileName]=metadata
      final[fileName]['resolvedSE']=self.destination
    #One by one upload the files with failover if necessary
    replication = {}
    failover = {}
    uploaded = []
    if not self.failoverTest:
      for fileName,metadata in final.items():
        self.log.info("Attempting to store file %s to the following SE(s):\n%s" % (fileName,
                                                                                   string.join(metadata['resolvedSE'],', ')))
        result = failoverTransfer.transferAndRegisterFile(fileName,metadata['localpath'],metadata['lfn'],metadata['resolvedSE'],fileGUID=metadata['guid'],fileCatalog=self.userFileCatalog)
        if not result['OK']:
          self.log.error('Could not transfer and register %s with metadata:\n %s' %(fileName,metadata))
          failover[fileName]=metadata
        else:
          #Only attempt replication after successful upload
          lfn = metadata['lfn']
          uploaded.append(lfn)
          seList = metadata['resolvedSE']
          replicateSE = ''
          if result['Value'].has_key('uploadedSE'):
            uploadedSE = result['Value']['uploadedSE']
            # Pick the first SE that is not the one already holding the file
            for se in seList:
              if not se == uploadedSE:
                replicateSE = se
                break
          if replicateSE and lfn:
            self.log.info('Will attempt to replicate %s to %s' %(lfn,replicateSE))
            replication[lfn]=replicateSE
    else:
      # Failover-test mode: send everything straight to the failover path
      failover = final

    cleanUp = False
    for fileName,metadata in failover.items():
      random.shuffle(self.failoverSEs)
      targetSE = metadata['resolvedSE'][0]
      metadata['resolvedSE']=self.failoverSEs
      result = failoverTransfer.transferAndRegisterFileFailover(fileName,metadata['localpath'],metadata['lfn'],targetSE,metadata['resolvedSE'],fileGUID=metadata['guid'],fileCatalog=self.userFileCatalog)
      if not result['OK']:
        self.log.error('Could not transfer and register %s with metadata:\n %s' %(fileName,metadata))
        cleanUp = True
        continue #for users can continue even if one completely fails
      else:
        lfn = metadata['lfn']
        uploaded.append(lfn)

    #For files correctly uploaded must report LFNs to job parameters
    if uploaded:
      report = string.join( uploaded, ', ' )
      self.jobReport.setJobParameter( 'UploadedOutputData', report )

    #Now after all operations, retrieve potentially modified request object
    result = failoverTransfer.getRequestObject()
    if not result['OK']:
      self.log.error(result)
      return S_ERROR('Could Not Retrieve Modified Request')
    self.request = result['Value']

    #If some or all of the files failed to be saved to failover
    if cleanUp:
      self.workflow_commons['Request']=self.request
      #Leave any uploaded files just in case it is useful for the user
      #do not try to replicate any files.
      return S_ERROR('Failed To Upload Output Data')

    return S_OK()
def execute(self):
    """ Main execution function.

    Runs only at the last workflow step: constructs user output LFNs,
    resolves an ordered SE list (local SE first, then randomized defaults,
    user SEs prepended), uploads with failover, optionally replicates, and
    writes any pending failover request to an XML file.

    :returns: S_OK/S_ERROR following the DIRAC return-value convention
    """
    #Have to work out if the module is part of the last step i.e.
    #user jobs can have any number of steps and we only want
    #to run the finalization once.
    currentStep = int(self.step_commons['STEP_NUMBER'])
    totalSteps = int(self.workflow_commons['TotalSteps'])
    if currentStep == totalSteps:
      self.lastStep = True
    else:
      self.log.verbose('Current step = %s, total steps of workflow = %s, UserJobFinalization will enable itself only \
at the last workflow step.' % (currentStep, totalSteps))

    if not self.lastStep:
      return S_OK()

    result = self.resolveInputVariables()
    if not result['OK']:
      self.log.error(result['Message'])
      return result

    self.log.info('Initializing %s' % self.version)
    if not self.workflowStatus['OK'] or not self.stepStatus['OK']:
      # Something went wrong in the step or the workflow, do nothing.
      self.log.verbose('Workflow status = %s, step status = %s' % (self.workflowStatus['OK'],
                                                                   self.stepStatus['OK']))
      return S_OK('No output data upload attempted')

    if not self.userOutputData:
      self.log.info('No user output data is specified for this job, nothing to do')
      return S_OK('No output data to upload')

    #Determine the final list of possible output files for the
    #workflow and all the parameters needed to upload them.
    outputList = []
    for i in self.userOutputData:
      outputList.append({'outputPath' : string.upper(string.split(i, '.')[-1]),
                         'outputDataSE' : self.userOutputSE,
                         'outputFile' : os.path.basename(i)})

    userOutputLFNs = []
    if self.userOutputData:
      self.log.info('Constructing user output LFN(s) for %s' % (string.join(self.userOutputData, ', ')))
      if not self.jobID:
        # Local/test runs have no JobID; use a placeholder so LFNs can be built
        self.jobID = 12345
      owner = ''
      if self.workflow_commons.has_key('Owner'):
        owner = self.workflow_commons['Owner']
      else:
        res = self.getCurrentOwner()
        if not res['OK']:
          return S_ERROR('Could not obtain owner from proxy')
        owner = res['Value']
      vo = ''
      if self.workflow_commons.has_key('VO'):
        vo = self.workflow_commons['VO']
      else:
        res = self.getCurrentVO()
        if not res['OK']:
          return S_ERROR('Could not obtain VO from proxy')
        vo = res['Value']
      result = constructUserLFNs(int(self.jobID), vo, owner, self.userOutputData, self.userOutputPath)
      if not result['OK']:
        self.log.error('Could not create user LFNs', result['Message'])
        return result
      userOutputLFNs = result['Value']

    self.log.verbose('Calling getCandidateFiles( %s, %s, %s)' % (outputList, userOutputLFNs, self.outputDataFileMask))
    result = self.getCandidateFiles(outputList, userOutputLFNs, self.outputDataFileMask)
    if not result['OK']:
      # Application errors are only fatal when ignoreapperrors is unset
      if not self.ignoreapperrors:
        self.setApplicationStatus(result['Message'])
        return S_OK()
    fileDict = result['Value']

    result = self.getFileMetadata(fileDict)
    if not result['OK']:
      if not self.ignoreapperrors:
        self.setApplicationStatus(result['Message'])
        return S_OK()

    if not result['Value']:
      if not self.ignoreapperrors:
        self.log.info('No output data files were determined to be uploaded for this workflow')
        self.setApplicationStatus('No Output Data Files To Upload')
        return S_OK()
    fileMetadata = result['Value']

    #First get the local (or assigned) SE to try first for upload and others in random fashion
    result = getDestinationSEList('Tier1-USER', DIRAC.siteName(), outputmode='local')
    if not result['OK']:
      self.log.error('Could not resolve output data SE', result['Message'])
      self.setApplicationStatus('Failed To Resolve OutputSE')
      return result
    localSE = result['Value']
    self.log.verbose('Site Local SE for user outputs is: %s' % (localSE))

    # Remove local and user SEs from the default list before randomizing,
    # so they are not tried twice
    orderedSEs = self.defaultOutputSE
    for se in localSE:
      if se in orderedSEs:
        orderedSEs.remove(se)
    for se in self.userOutputSE:
      if se in orderedSEs:
        orderedSEs.remove(se)

    orderedSEs = localSE + List.randomize(orderedSEs)
    if self.userOutputSE:
      # User-requested SEs take precedence over everything else
      prependSEs = []
      for userSE in self.userOutputSE:
        if not userSE in orderedSEs:
          prependSEs.append(userSE)
      orderedSEs = prependSEs + orderedSEs

    self.log.info('Ordered list of output SEs is: %s' % (string.join(orderedSEs, ', ')))
    final = {}
    for fileName, metadata in fileMetadata.items():
      final[fileName] = metadata
      final[fileName]['resolvedSE'] = orderedSEs

    #At this point can exit and see exactly what the module will upload
    if not self.enable:
      self.log.info('Module is disabled by control flag, would have attempted \
to upload the following files %s' % string.join(final.keys(), ', '))
      for fileName, metadata in final.items():
        self.log.info('--------%s--------' % fileName)
        for n, v in metadata.items():
          self.log.info('%s = %s' %(n, v))
      return S_OK('Module is disabled by control flag')

    #Instantiate the failover transfer client with the global request object
    failoverTransfer = FailoverTransfer(self.request)

    #One by one upload the files with failover if necessary
    replication = {}
    failover = {}
    uploaded = []
    if not self.failoverTest:
      for fileName, metadata in final.items():
        self.log.info("Attempting to store file %s to the following SE(s):\n%s" % (fileName,
                                                                                   string.join(metadata['resolvedSE'], ', ')))
        result = failoverTransfer.transferAndRegisterFile(fileName, metadata['localpath'], metadata['lfn'],
                                                          metadata['resolvedSE'], fileGUID = metadata['guid'],
                                                          fileCatalog = self.userFileCatalog)
        if not result['OK']:
          self.log.error('Could not transfer and register %s with metadata:\n %s' % (fileName, metadata))
          failover[fileName] = metadata
        else:
          #Only attempt replication after successful upload
          lfn = metadata['lfn']
          uploaded.append(lfn)
          seList = metadata['resolvedSE']
          replicateSE = ''
          if result['Value'].has_key('uploadedSE'):
            uploadedSE = result['Value']['uploadedSE']
            # Pick the first SE that is not the one already holding the file
            for se in seList:
              if not se == uploadedSE:
                replicateSE = se
                break
          if replicateSE and lfn:
            self.log.info('Will attempt to replicate %s to %s' % (lfn, replicateSE))
            replication[lfn] = replicateSE
    else:
      # Failover-test mode: send everything straight to the failover path
      failover = final

    cleanUp = False
    for fileName, metadata in failover.items():
      random.shuffle(self.failoverSEs)
      targetSE = metadata['resolvedSE'][0]
      metadata['resolvedSE'] = self.failoverSEs
      result = failoverTransfer.transferAndRegisterFileFailover(fileName, metadata['localpath'], metadata['lfn'],
                                                                targetSE, metadata['resolvedSE'],
                                                                fileGUID = metadata['guid'],
                                                                fileCatalog = self.userFileCatalog)
      if not result['OK']:
        self.log.error('Could not transfer and register %s with metadata:\n %s' % (fileName, metadata))
        cleanUp = True
        continue #for users can continue even if one completely fails
      else:
        lfn = metadata['lfn']
        uploaded.append(lfn)

    #For files correctly uploaded must report LFNs to job parameters
    if uploaded:
      report = string.join( uploaded, ', ' )
      self.jobReport.setJobParameter( 'UploadedOutputData', report )

    #Now after all operations, retrieve potentially modified request object
    result = failoverTransfer.getRequestObject()
    if not result['OK']:
      self.log.error(result)
      return S_ERROR('Could Not Retrieve Modified Request')
    self.request = result['Value']

    #If some or all of the files failed to be saved to failover
    if cleanUp:
      self.workflow_commons['Request'] = self.request
      #Leave any uploaded files just in case it is useful for the user
      #do not try to replicate any files.
      return S_ERROR('Failed To Upload Output Data')

    #If there is now at least one replica for uploaded files can trigger replication
    rm = ReplicaManager()
    self.log.info('Sleeping for 10 seconds before attempting replication of recently uploaded files')
    time.sleep(10)
    for lfn, repSE in replication.items():
      result = rm.replicateAndRegister(lfn, repSE, catalog = self.userFileCatalog)
      if not result['OK']:
        # Replication is best-effort: the file already has one good replica
        self.log.info('Replication failed with below error but file already exists in Grid storage with \
at least one replica:\n%s' % (result))

    self.workflow_commons['Request'] = self.request

    #Now must ensure if any pending requests are generated that these are propagated to the job wrapper
    reportRequest = None
    if self.jobReport:
      result = self.jobReport.generateRequest()
      if not result['OK']:
        self.log.warn('Could not generate request for job report with result:\n%s' % (result))
      else:
        reportRequest = result['Value']
    if reportRequest:
      self.log.info('Populating request with job report information')
      self.request.update(reportRequest)

    if not self.request.isEmpty()['Value']:
      request_string = self.request.toXML()['Value']
      # Write out the request string
      fname = 'user_job_%s_request.xml' % (self.jobID)
      xmlfile = open(fname, 'w')
      xmlfile.write(request_string)
      xmlfile.close()
      self.log.info('Creating failover request for deferred operations for job %s:' % self.jobID)
      result = self.request.getDigest()
      if result['OK']:
        digest = result['Value']
        self.log.info(digest)

    self.setApplicationStatus('Job Finished Successfully')
    return S_OK('Output data uploaded')
def execute(self, production_id=None, prod_job_id=None, wms_job_id=None,
            workflowStatus=None, stepStatus=None,
            wf_commons=None, step_commons=None,
            step_number=None, step_id=None,
            SEs=None, fileDescendants=None):
    """ Main execution function.

    1. Determine the final list of possible output files for the workflow
       and all the parameters needed to upload them.
    2. Verifying that the input files have no descendants (and exiting with error, otherwise)
    3. Sending the BK records for the steps of the job
    4. Transfer output files in their destination, register in the FC (with failover)
    5. Registering the output files in the Bookkeeping

    :param SEs: optional explicit list of destination SEs overriding the
                workflow-resolved ones (mainly for testing)
    :param fileDescendants: optional pre-computed list of input LFNs with
                descendants, bypassing the BK lookup (mainly for testing)
    :returns: S_OK/S_ERROR following the DIRAC return-value convention
    """
    try:
      super(UploadOutputData, self).execute(self.version, production_id, prod_job_id, wms_job_id,
                                            workflowStatus, stepStatus,
                                            wf_commons, step_commons, step_number, step_id)

      # This returns all Tier1-Failover unless a specific one is defined for the site
      self.failoverSEs = getDestinationSEList('Tier1-Failover', self.siteName, outputmode='Any')
      random.shuffle(self.failoverSEs)

      self._resolveInputVariables()

      if not self._checkWFAndStepStatus():
        return S_OK("Failures detected in previous steps: no output data upload attempted")

      # ## 1. Determine the final list of possible output files
      # ##    for the workflow and all the parameters needed to upload them.
      self.log.verbose("Getting the list of candidate files")
      fileDict = self.getCandidateFiles(self.outputList, self.prodOutputLFNs,
                                        self.outputDataFileMask, self.outputDataStep)

      fileMetadata = self.getFileMetadata(fileDict)

      if not fileMetadata:
        self.log.info("No output data files were determined to be uploaded for this workflow")
        return S_OK()

      # Get final, resolved SE list for files
      final = {}
      for fileName, metadata in fileMetadata.iteritems():
        if not SEs:
          resolvedSE = getDestinationSEList(metadata['workflowSE'], self.siteName, self.outputMode,
                                            self.workflow_commons.get('runNumber'))
        else:
          resolvedSE = SEs
        final[fileName] = metadata
        final[fileName]['resolvedSE'] = resolvedSE

      self.log.info("The following files will be uploaded", ": %s" % (', '.join(final.keys())))
      for fileName, metadata in final.items():
        self.log.info('--------%s--------' % fileName)
        for name, val in metadata.iteritems():
          self.log.info('%s = %s' % (name, val))

      if not self._enableModule():
        # At this point can exit and see exactly what the module would have uploaded
        self.log.info("Module disabled",
                      "would have attempted to upload the files %s" % ', '.join(final.keys()))

      # ## 2. Prior to uploading any files must check (for productions with input data)
      # ##    that no descendant files already exist with replica flag in the BK.
      if self.inputDataList:
        if fileDescendants is not None:
          lfnsWithDescendants = fileDescendants
        else:
          if not self._enableModule():
            self.log.info("Module disabled",
                          "would have attempted to check the files %s" % ', '.join(self.inputDataList))
            lfnsWithDescendants = []
          else:
            lfnsWithDescendants = getFileDescendants(self.production_id, self.inputDataList,
                                                     dm=self.dataManager, bkClient=self.bkClient)
        if not lfnsWithDescendants:
          self.log.info("No descendants found, outputs can be uploaded")
        else:
          self.log.error("Found descendants!!! Outputs won't be uploaded")
          # FIX: the LFN list must be %-formatted into the message; previously the
          # adjacent string literals made this (": %s % ").join(lfnsWithDescendants),
          # which garbled the log line instead of listing the files.
          self.log.info("Files with descendants", ": %s" % ', '.join(lfnsWithDescendants))
          self.log.info("The files above will be set as 'Processed', other lfns in input will be later reset as Unused")
          self.fileReport.setFileStatus(int(self.production_id), lfnsWithDescendants, 'Processed')
          return S_ERROR("Input Data Already Processed")

      # ## 3. Sending the BK records for the steps of the job
      bkFileExtensions = ['bookkeeping*.xml']
      bkFiles = []
      for ext in bkFileExtensions:
        self.log.debug("Looking at BK record wildcard: %s" % ext)
        globList = glob.glob(ext)
        for check in globList:
          if os.path.isfile(check):
            self.log.verbose("Found locally existing BK file record", ": %s" % check)
            bkFiles.append(check)

      # Unfortunately we depend on the file names to order the BK records
      bkFilesListTuples = []
      for bk in bkFiles:
        bkFilesListTuples.append((bk, int(bk.split('_')[-1].split('.')[0])))
      bkFiles = [bk[0] for bk in sorted(bkFilesListTuples, key=itemgetter(1))]

      self.log.info("The following BK records will be sent", ": %s" % (', '.join(bkFiles)))
      if self._enableModule():
        for bkFile in bkFiles:
          with open(bkFile, 'r') as fd:
            bkXML = fd.read()
          self.log.info("Sending BK record", ":\n%s" % (bkXML))
          result = self.bkClient.sendXMLBookkeepingReport(bkXML)
          self.log.verbose(result)
          if result['OK']:
            self.log.info("Bookkeeping report sent", "for %s" % bkFile)
          else:
            # Sending failed: defer the report via a ForwardDISET request operation
            self.log.error("Could not send Bookkeeping XML file to server", ": %s" % result['Message'])
            self.log.info("Preparing DISET request", "for %s" % bkFile)
            bkDISETReq = Operation()
            bkDISETReq.Type = 'ForwardDISET'
            bkDISETReq.Arguments = DEncode.encode(result['rpcStub'])
            self.request.addOperation(bkDISETReq)
            self.workflow_commons['Request'] = self.request  # update each time, just in case
      else:
        self.log.info("Would have attempted to send bk records, but module is disabled")

      # ## 4. Transfer output files in their destination, register in the FC (with failover)
      # Disable the watchdog check in case the file uploading takes a long time
      self._disableWatchdogCPUCheck()

      # Instantiate the failover transfer client with the global request object
      if not self.failoverTransfer:
        self.failoverTransfer = FailoverTransfer(self.request)

      # Track which files are successfully uploaded (not to failover) via
      performBKRegistration = []
      # Failover replicas are always added to the BK when they become available
      # (actually, added to all the catalogs)
      failover = {}

      for fileName, metadata in final.items():
        targetSE = metadata['resolvedSE']
        self.log.info("Attempting to store file to SE",
                      "%s to the following SE(s):\n%s" % (fileName, ', '.join(targetSE)))
        fileMetaDict = {'Size': metadata['filedict']['Size'],
                        'LFN': metadata['filedict']['LFN'],
                        'GUID': metadata['filedict']['GUID'],
                        'Checksum': metadata['filedict']['Checksum'],
                        'ChecksumType': metadata['filedict']['ChecksumType']}

        if not self._enableModule():
          # At this point can exit and see exactly what the module would have uploaded
          self.log.info("Module disabled", "would have attempted to upload file %s" % fileName)
          continue

        result = self.failoverTransfer.transferAndRegisterFile(fileName=fileName,
                                                               localPath=metadata['localpath'],
                                                               lfn=metadata['filedict']['LFN'],
                                                               destinationSEList=targetSE,
                                                               fileMetaDict=fileMetaDict,
                                                               masterCatalogOnly=True)
        if not result['OK']:
          self.log.error("Could not transfer and register",
                         " %s with metadata:\n %s" % (fileName, metadata))
          failover[fileName] = metadata
        else:
          self.log.info("File uploaded, will be registered in BK if all files uploaded for job",
                        "(%s)" % fileName)
          # if the files are uploaded in the SE, independently if the registration in the FC is done,
          # then we have to register all of them in the BKK
          performBKRegistration.append(metadata)

      cleanUp = False
      for fileName, metadata in failover.items():
        self.log.info("Setting default catalog for failover transfer registration to master catalog")
        random.shuffle(self.failoverSEs)
        targetSE = metadata['resolvedSE'][0]
        metadata['resolvedSE'] = self.failoverSEs

        fileMetaDict = {'Size': metadata['filedict']['Size'],
                        'LFN': metadata['filedict']['LFN'],
                        'GUID': metadata['filedict']['GUID'],
                        'Checksum': metadata['filedict']['Checksum'],
                        'ChecksumType': metadata['filedict']['ChecksumType']}

        if not self._enableModule():
          # At this point can exit and see exactly what the module would have uploaded
          self.log.info("Module disabled",
                        "would have attempted to upload with failover file %s" % fileName)
          continue

        result = self.failoverTransfer.transferAndRegisterFileFailover(fileName=fileName,
                                                                       localPath=metadata['localpath'],
                                                                       lfn=metadata['filedict']['LFN'],
                                                                       targetSE=targetSE,
                                                                       failoverSEList=metadata['resolvedSE'],
                                                                       fileMetaDict=fileMetaDict,
                                                                       masterCatalogOnly=True)
        if not result['OK']:
          self.log.error("Could not transfer and register",
                         "%s in failover with metadata:\n %s" % (fileName, metadata))
          cleanUp = True
          break  # no point continuing if one completely fails

      # Now after all operations, retrieve potentially modified request object
      self.request = self.failoverTransfer.request

      # If some or all of the files failed to be saved even to failover
      if cleanUp and self._enableModule():
        self._cleanUp(final)
        self.workflow_commons['Request'] = self.request
        return S_ERROR('Failed to upload output data')

      # For files correctly uploaded must report LFNs to job parameters
      if final and self._enableModule():
        report = ', '.join(final.keys())
        self.setJobParameter('UploadedOutputData', report)

      # ## 5. Can now register the successfully uploaded files in the BK
      # ##    i.e. set the BK replica flags
      if not performBKRegistration:
        self.log.info("There are no files to perform the BK registration for, all are in failover")
      elif self._enableModule():
        # performing BK registration
        # Getting what should be registered immediately, and what later
        lfnsToRegisterInBK = set([metadata['filedict']['LFN'] for metadata in performBKRegistration])
        lfnsToRegisterInBKNow = self._getLFNsForBKRegistration(lfnsToRegisterInBK)
        lfnsToRegisterInBKLater = list(lfnsToRegisterInBK - set(lfnsToRegisterInBKNow))

        # Registering what should be registering immediately, and handling failures
        result = FileCatalog(catalogs=['BookkeepingDB']).addFile(lfnsToRegisterInBKNow)
        self.log.verbose("BookkeepingDB.addFile: %s" % result)
        if not result['OK']:
          self.log.error(result)
          return S_ERROR("Could Not Perform BK Registration")
        if 'Failed' in result['Value'] and result['Value']['Failed']:
          for lfn, error in result['Value']['Failed'].iteritems():
            lfnMetadata = {}
            for lfnMD in performBKRegistration:
              if lfnMD['lfn'] == lfn:  # the lfn is indeed both at lfnMD['lfn'] and at lfnMD['filedict']['LFN']
                lfnMetadata = lfnMD['filedict']
                break
            self.setBKRegistrationRequest(lfn, error=error, metaData=lfnMetadata)

        # Adding a registration request for what whould be registered later
        if lfnsToRegisterInBKLater:
          for lfnMD in performBKRegistration:
            if lfnMD['lfn'] in lfnsToRegisterInBKLater:
              lfnMetadata = lfnMD['filedict']
              self.setBKRegistrationRequest(lfnMD['lfn'], metaData=lfnMetadata)

      self.workflow_commons['Request'] = self.request

      return S_OK("Output data uploaded")

    except Exception as e:  # pylint:disable=broad-except
      self.log.exception('Exception in UploadOutputData', lException=e)
      self.setApplicationStatus(repr(e))
      return S_ERROR(str(e))

    finally:
      super(UploadOutputData, self).finalize(self.version)
def finalize(self):
  """Perform the final log-upload operations after all job steps have run.

  Only production jobs are treated.  The method:
    1. selects the relevant log files,
    2. copies them into a temporary log directory,
    3. attempts to upload that directory to the LogSE,
    4. on failure, tars the logs and ships the tarball to a failover SE,
       creating a request to move the logs to the LogSE later.

  Losing logs is never treated as fatal for the job: every failure path
  returns S_OK(), except when the modified request object cannot be
  retrieved from the failover transfer client.
  """
  self.log.verbose("Starting UploadLogFile finalize")
  ##########################################
  # First determine the files which should be saved
  self.log.info("Determining the files to be saved in the logs.")
  res = self.determineRelevantFiles()
  if not res["OK"]:
    self.log.error("Completely failed to select relevant log files.", res["Message"])
    return S_OK()  # because if the logs are lost, it's not the end of the world.
  selectedFiles = res["Value"]
  self.log.info("The following %s files were selected to be saved:\n%s" % (len(selectedFiles), "\n".join(selectedFiles)))
  #########################################
  # Create a temporary directory containing these files
  self.log.info("Populating a temporary directory for selected files.")
  res = self.populateLogDirectory(selectedFiles)
  if not res["OK"]:
    self.log.error("Completely failed to populate temporary log file directory.", res["Message"])
    self.setApplicationStatus("Failed To Populate Log Dir")
    return S_OK()  # because if the logs are lost, it's not the end of the world.
  self.log.info("%s populated with log files." % self.logdir)
  if not self.enable:
    self.log.info("Module is disabled by control flag")
    return S_OK("Module is disabled by control flag")
  #########################################
  # Make sure all the files in the log directory have the correct permissions
  result = self.__setLogFilePermissions(self.logdir)
  if not result["OK"]:
    self.log.error("Could not set permissions of log files to 0755 with message:\n%s" % (result["Message"]))
  #########################################
  # Attempt to upload logs to the LogSE
  self.log.info("Transferring log files to the %s" % self.logSE)
  res = S_ERROR()  # default error so the failover path runs when failoverTest is set
  if not self.failoverTest:
    self.log.info("PutDirectory %s %s %s" % (self.logFilePath, os.path.realpath(self.logdir), self.logSE))
    res = self.rm.putStorageDirectory(
        {self.logFilePath: os.path.realpath(self.logdir)}, self.logSE, singleDirectory=True
    )
    self.log.verbose(res)
    if res["OK"]:
      self.log.info("Successfully upload log directory to %s" % self.logSE)
      # TODO: The logURL should be constructed using the LogSE and StorageElement()
      logURL = "%s" % self.logFilePath
      self.setJobParameter("Log LFN", logURL)
      self.log.info("Logs for this job may be retrieved with dirac-ilc-get-prod-log -F %s" % logURL)
      return S_OK()
  #########################################
  # Recover the logs to a failover storage element
  self.log.error(
      "Completely failed to upload log files to %s, will attempt upload to failover SE" % self.logSE,
      res["Message"],
  )
  tarFileDir = os.path.dirname(self.logdir)
  self.logLFNPath = "%s.gz" % self.logLFNPath
  tarFileName = os.path.basename(self.logLFNPath)
  start = os.getcwd()
  os.chdir(self.logdir)
  logTarFiles = os.listdir(self.logdir)
  tfile = tarfile.open(tarFileName, "w:gz")
  for item in logTarFiles:
    tfile.add(item)
  tfile.close()
  # BUGFIX: 'res' still held the failed LogSE-upload result at this point, so
  # the "tar file was not created" branch below always fired even when the
  # tarball was created fine, killing the failover upload.  Reset it so the
  # check below reflects only the tar-creation outcome.
  res = S_OK()
  if not os.path.exists(tarFileName):
    res = S_ERROR("File was not created")
  os.chdir(start)
  if not res["OK"]:
    self.log.error("Failed to create tar file from directory", "%s %s" % (self.logdir, res["Message"]))
    self.setApplicationStatus("Failed To Create Log Tar Dir")
    return S_OK()  # because if the logs are lost, it's not the end of the world.
  ############################################################
  # Instantiate the failover transfer client with the global request object
  failoverTransfer = FailoverTransfer(self.request)
  # Determine the experiment-specific failover SEs and try them in random order
  self.failoverSEs = self.ops.getValue("Production/%s/FailOverSE" % self.experiment, self.failoverSEs)
  random.shuffle(self.failoverSEs)
  self.log.info(
      "Attempting to store file %s to the following SE(s):\n%s" % (tarFileName, ", ".join(self.failoverSEs))
  )
  result = failoverTransfer.transferAndRegisterFile(
      tarFileName,
      "%s/%s" % (tarFileDir, tarFileName),
      self.logLFNPath,
      self.failoverSEs,
      fileGUID=None,
      fileCatalog=["FileCatalog", "LcgFileCatalog"],
  )
  if not result["OK"]:
    self.log.error("Failed to upload logs to all destinations")
    self.setApplicationStatus("Failed To Upload Logs")
    return S_OK()  # because if the logs are lost, it's not the end of the world.
  # Now after all operations, retrieve potentially modified request object
  result = failoverTransfer.getRequestObject()
  if not result["OK"]:
    self.log.error(result)
    return S_ERROR("Could not retrieve modified request")
  self.request = result["Value"]
  # Record a request to move the logs from the failover SE to the LogSE later
  res = self.createLogUploadRequest(self.logSE, self.logLFNPath)
  if not res["OK"]:
    self.log.error("Failed to create failover request", res["Message"])
    self.setApplicationStatus("Failed To Upload Logs To Failover")
  else:
    self.log.info("Successfully created failover request")
  self.workflow_commons["Request"] = self.request
  return S_OK()
def execute(self):
  """Upload the production output files, with failover if necessary.

  Resolves the destination SE list for every candidate output file,
  uploads each file, and for files that could not be stored performs a
  failover transfer to one of the experiment failover SEs, recording a
  moving request in the workflow request container.

  :returns: S_OK on success, S_ERROR if any file could not be stored
            even on the failover SEs (uploaded LFNs are then cleaned up).
  """
  self.log.info('Initializing %s' % self.version)
  result = self.resolveInputVariables()
  if not result['OK']:
    self.log.error("Failed to resolve input parameters:", result['Message'])
    return result
  if not self.workflowStatus['OK'] or not self.stepStatus['OK']:
    # A previous step or the workflow failed: nothing to upload.
    self.log.verbose('Workflow status = %s, step status = %s' % (self.workflowStatus['OK'], self.stepStatus['OK']))
    return S_OK('No output data upload attempted')
  # Determine the experiment from the LFN namespace of the first output file
  example_file = self.prodOutputLFNs[0]
  if "/ilc/prod/clic" in example_file:
    self.experiment = "CLIC"
  elif "/ilc/prod/ilc/sid" in example_file:
    self.experiment = 'ILC_SID'
  elif "/ilc/prod/ilc/mc-dbd" in example_file:
    self.experiment = 'ILC_ILD'
  else:
    self.log.warn("Failed to determine experiment, reverting to default")
  # Determine the final list of possible output files for the
  # workflow and all the parameters needed to upload them.
  result = self.getCandidateFiles(self.outputList, self.prodOutputLFNs, self.outputDataFileMask)
  if not result['OK']:
    self.log.error(result['Message'])
    self.setApplicationStatus(result['Message'])
    return result
  fileDict = result['Value']
  result = self.getFileMetadata(fileDict)
  if not result['OK']:
    self.log.error(result['Message'])
    self.setApplicationStatus(result['Message'])
    return result
  if not result['Value']:
    self.log.info('No output data files were determined to be uploaded for this workflow')
    return S_OK()
  fileMetadata = result['Value']
  # Get final, resolved SE list for files
  final = {}
  for fileName, metadata in fileMetadata.items():
    result = getDestinationSEList(metadata['workflowSE'], DIRAC.siteName(), self.outputMode)
    if not result['OK']:
      self.log.error('Could not resolve output data SE', result['Message'])
      self.setApplicationStatus('Failed To Resolve OutputSE')
      return result
    resolvedSE = result['Value']
    final[fileName] = metadata
    final[fileName]['resolvedSE'] = resolvedSE
  self.log.info('The following files will be uploaded: %s' % (', '.join(final.keys())))
  for fileName, metadata in final.items():
    self.log.info('--------%s--------' % fileName)
    for metaName, metaValue in metadata.items():
      self.log.info('%s = %s' % (metaName, metaValue))
  # At this point can exit and see exactly what the module would have uploaded
  if not self.enable:
    self.log.info('Module is disabled by control flag, would have attempted to upload the \
following files %s' % ', '.join(final.keys()))
    return S_OK('Module is disabled by control flag')
  # Disable the watchdog check in case the file uploading takes a long time
  self.log.info('Creating DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK in order to disable the Watchdog prior to upload')
  fopen = open('DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK', 'w')
  fopen.write('%s' % time.asctime())
  fopen.close()
  # Instantiate the failover transfer client with the global request object
  failoverTransfer = FailoverTransfer(self._getRequestContainer())
  catalogs = self.ops.getValue('Production/%s/Catalogs' % self.experiment, ['FileCatalog', 'LcgFileCatalog'])
  # One by one upload the files with failover if necessary
  failover = {}
  if not self.failoverTest:
    for fileName, metadata in final.iteritems():
      self.log.info("Attempting to store file %s to the following SE(s):\n%s" % (fileName,
                                                                                 ', '.join(metadata['resolvedSE'])))
      result = failoverTransfer.transferAndRegisterFile(fileName, metadata['localpath'], metadata['lfn'],
                                                        metadata['resolvedSE'],
                                                        fileMetaDict=metadata['filedict'],
                                                        fileCatalog=catalogs)
      if not result['OK']:
        self.log.error('Could not transfer and register %s with metadata:\n %s' % (fileName, metadata['filedict']))
        failover[fileName] = metadata
  else:
    # In failover-test mode every file goes through the failover path.
    failover = final
  self.failoverSEs = self.ops.getValue("Production/%s/FailOverSE" % self.experiment, self.failoverSEs)
  cleanUp = False
  for fileName, metadata in failover.iteritems():
    self.log.info('Setting default catalog for failover transfer to FileCatalog')
    # BUGFIX: take a copy of the failover SE list; the previous code aliased
    # self.failoverSEs, so removing the target SE for one file permanently
    # removed it from the instance list for all subsequent files as well.
    failovers = list(self.failoverSEs)
    targetSE = metadata['resolvedSE'][0]
    try:  # remove duplicate site, otherwise it will do nasty things when processing the request
      failovers.remove(targetSE)
    except ValueError:
      pass
    random.shuffle(failovers)
    metadata['resolvedSE'] = failovers
    result = failoverTransfer.transferAndRegisterFileFailover(fileName, metadata['localpath'], metadata['lfn'],
                                                              targetSE, metadata['resolvedSE'],
                                                              fileMetaDict=metadata['filedict'],
                                                              fileCatalog=catalogs)
    if not result['OK']:
      self.log.error('Could not transfer and register %s with metadata:\n %s' % (fileName, metadata['filedict']))
      cleanUp = True
      break  # no point continuing if one completely fails
  os.remove("DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK")  # cleanup the mess
  # Now after all operations, store the potentially modified request object
  self.workflow_commons['Request'] = failoverTransfer.request
  # If some or all of the files failed to be saved to failover
  if cleanUp:
    lfns = [metadata['lfn'] for metadata in final.values()]
    result = self._cleanUp(lfns)
    return S_ERROR('Failed to upload output data')
  return S_OK('Output data uploaded')
def execute(self):
  """ Main execution function.

  Legacy variant of the production output-data upload: resolves the
  destination SEs for each candidate output file, uploads them one by
  one, performs a failover transfer for any file that could not be
  stored, and records the (possibly modified) request object in
  self.workflow_commons['Request'].
  """
  self.log.info('Initializing %s' % self.version)
  result = self.resolveInputVariables()
  if not result['OK']:
    self.log.error(result['Message'])
    return result
  if not self.workflowStatus['OK'] or not self.stepStatus['OK']:
    # A previous step or the workflow failed: nothing to upload.
    self.log.verbose('Workflow status = %s, step status = %s' % (self.workflowStatus['OK'], self.stepStatus['OK']))
    return S_OK('No output data upload attempted')
  ## determine the experiment from the LFN namespace of the first output file
  example_file = self.prodOutputLFNs[0]
  if "/ilc/prod/clic" in example_file:
    self.experiment = "CLIC"
  elif "/ilc/prod/ilc/sid" in example_file:
    self.experiment = 'ILC_SID'
  elif "/ilc/prod/ilc/mc-dbd" in example_file:
    self.experiment = 'ILC_ILD'
  else:
    self.log.warn("Failed to determine experiment, reverting to default")
  #Determine the final list of possible output files for the
  #workflow and all the parameters needed to upload them.
  result = self.getCandidateFiles(self.outputList, self.prodOutputLFNs, self.outputDataFileMask)
  if not result['OK']:
    self.setApplicationStatus(result['Message'])
    return result
  fileDict = result['Value']
  result = self.getFileMetadata(fileDict)
  if not result['OK']:
    self.setApplicationStatus(result['Message'])
    return result
  if not result['Value']:
    self.log.info('No output data files were determined to be uploaded for this workflow')
    return S_OK()
  fileMetadata = result['Value']
  #Get final, resolved SE list for files
  final = {}
  for fileName, metadata in fileMetadata.items():
    result = getDestinationSEList(metadata['workflowSE'], DIRAC.siteName(), self.outputMode)
    if not result['OK']:
      self.log.error('Could not resolve output data SE', result['Message'])
      self.setApplicationStatus('Failed To Resolve OutputSE')
      return result
    resolvedSE = result['Value']
    final[fileName] = metadata
    final[fileName]['resolvedSE'] = resolvedSE
  self.log.info('The following files will be uploaded: %s' % (string.join(final.keys(), ', ')))
  for fileName, metadata in final.items():
    self.log.info('--------%s--------' % fileName)
    for n, v in metadata.items():
      self.log.info('%s = %s' % (n, v))
  #At this point can exit and see exactly what the module would have uploaded
  if not self.enable:
    self.log.info('Module is disabled by control flag, would have attempted to upload the \
following files %s' % string.join(final.keys(), ', '))
    return S_OK('Module is disabled by control flag')
  #Disable the watchdog check in case the file uploading takes a long time
  self.log.info('Creating DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK in order to disable the Watchdog prior to upload')
  fopen = open('DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK','w')
  fopen.write('%s' % time.asctime())
  fopen.close()
  #Instantiate the failover transfer client with the global request object
  failoverTransfer = FailoverTransfer(self.request)
  catalogs = ['FileCatalog', 'LcgFileCatalog']
  #One by one upload the files with failover if necessary
  failover = {}
  if not self.failoverTest:
    for fileName, metadata in final.items():
      self.log.info("Attempting to store file %s to the following SE(s):\n%s" % (fileName, string.join(metadata['resolvedSE'], ', ')))
      result = failoverTransfer.transferAndRegisterFile(fileName, metadata['localpath'], metadata['lfn'], metadata['resolvedSE'], fileGUID = metadata['guid'], fileCatalog = catalogs)
      if not result['OK']:
        # Failed files are retried below through the failover SEs.
        self.log.error('Could not transfer and register %s with metadata:\n %s' % (fileName, metadata))
        failover[fileName] = metadata
      else:
        lfn = metadata['lfn']  # NOTE(review): assigned but never used afterwards
  else:
    # In failover-test mode every file goes through the failover path.
    failover = final
  self.failoverSEs = self.ops.getValue("Production/%s/FailOverSE" % self.experiment, self.failoverSEs)
  cleanUp = False
  for fileName, metadata in failover.items():
    self.log.info('Setting default catalog for failover transfer to FileCatalog')
    # The first originally-resolved SE is kept as the final target while the
    # shuffled failover SEs receive the temporary copy.
    random.shuffle(self.failoverSEs)
    targetSE = metadata['resolvedSE'][0]
    metadata['resolvedSE'] = self.failoverSEs
    result = failoverTransfer.transferAndRegisterFileFailover(fileName, metadata['localpath'], metadata['lfn'], targetSE, metadata['resolvedSE'], fileGUID = metadata['guid'], fileCatalog = catalogs)
    if not result['OK']:
      self.log.error('Could not transfer and register %s with metadata:\n %s' % (fileName, metadata))
      cleanUp = True
      break #no point continuing if one completely fails
  os.remove("DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK") #cleanup the mess
  #Now after all operations, retrieve potentially modified request object
  result = failoverTransfer.getRequestObject()
  if not result['OK']:
    self.log.error(result)
    return S_ERROR('Could not retrieve modified request')
  self.request = result['Value']
  #If some or all of the files failed to be saved to failover
  if cleanUp:
    lfns = []
    for fileName, metadata in final.items():
      lfns.append(metadata['lfn'])
    result = self.__cleanUp(lfns)
    self.workflow_commons['Request'] = self.request
    return S_ERROR('Failed to upload output data')
  # #Can now register the successfully uploaded files in the BK
  # if not performBKRegistration:
  #   self.log.info('There are no files to perform the BK registration for, all could be saved to failover')
  # else:
  #   rm = ReplicaManager()
  #   result = rm.addCatalogFile(performBKRegistration,catalogs=['BookkeepingDB'])
  #   self.log.verbose(result)
  #   if not result['OK']:
  #     self.log.error(result)
  #     return S_ERROR('Could Not Perform BK Registration')
  #   if result['Value']['Failed']:
  #     for lfn,error in result['Value']['Failed'].items():
  #       self.log.info('BK registration for %s failed with message: "%s" setting failover request' %(lfn,error))
  #       result = self.request.addSubRequest({'Attributes':{'Operation':'registerFile','ExecutionOrder':0, 'Catalogue':'BookkeepingDB'}},'register')
  #       if not result['OK']:
  #         self.log.error('Could not set registerFile request:\n%s' %result)
  #         return S_ERROR('Could Not Set BK Registration Request')
  #       fileDict = {'LFN':lfn,'Status':'Waiting'}
  #       index = result['Value']
  #       self.request.setSubRequestFiles(index,'register',[fileDict])
  self.workflow_commons['Request'] = self.request
  return S_OK('Output data uploaded')
def execute(self):
  """ Main execution function.

  Uploads the user's output data at the end of the job.  User jobs can
  have any number of steps and the finalization must run only once, so
  the method first checks that this is the last step.  Successfully
  uploaded files are reported as a job parameter and, when a second
  replica is wanted, scheduled for replication; files that failed the
  normal upload go through the failover transfer path.
  """
  #Have to work out if the module is part of the last step i.e.
  #user jobs can have any number of steps and we only want
  #to run the finalization once. Not a problem if this is not the last step so return S_OK()
  resultLS = self.isLastStep()
  if not resultLS['OK']:
    return S_OK()
  self.logWorkingDirectory()
  resultIV = self.resolveInputVariables()
  if not resultIV['OK']:
    self.log.error("Failed to resolve input parameters:", resultIV['Message'])
    return resultIV
  self.log.info('Initializing %s' % self.version)
  if not self.workflowStatus['OK'] or not self.stepStatus['OK']:
    # A previous step or the workflow failed: nothing to upload.
    self.log.verbose('Workflow status = %s, step status = %s' % (self.workflowStatus['OK'], self.stepStatus['OK']))
    return S_OK('No output data upload attempted')
  if not self.userOutputData:
    self.log.info('No user output data is specified for this job, nothing to do')
    return S_OK('No output data to upload')
  #Determine the final list of possible output files for the
  #workflow and all the parameters needed to upload them.
  outputList = self.getOutputList()
  userOutputLFNs = []
  if self.userOutputData:
    resultOLfn = self.constructOutputLFNs()
    if not resultOLfn['OK']:
      self.log.error('Could not create user LFNs', resultOLfn['Message'])
      return resultOLfn
    userOutputLFNs = resultOLfn['Value']
  self.log.verbose('Calling getCandidateFiles( %s, %s, %s)' % (outputList, userOutputLFNs, self.outputDataFileMask))
  self.log.debug("IgnoreAppErrors? '%s' " % self.ignoreapperrors)
  resultCF = self.getCandidateFiles(outputList, userOutputLFNs, self.outputDataFileMask)
  if not resultCF['OK']:
    # NOTE(review): when ignoreapperrors is set this falls through and reads
    # resultCF['Value'] from an error result below — confirm intended.
    if not self.ignoreapperrors:
      self.log.error(resultCF['Message'])
      self.setApplicationStatus(resultCF['Message'])
      return S_OK()
  fileDict = resultCF['Value']
  resultFMD = self.getFileMetadata(fileDict)
  if not resultFMD['OK']:
    if not self.ignoreapperrors:
      self.log.error(resultFMD['Message'])
      self.setApplicationStatus(resultFMD['Message'])
      return S_OK()
  if not resultFMD['Value']:
    if not self.ignoreapperrors:
      self.log.info('No output data files were determined to be uploaded for this workflow')
      self.setApplicationStatus('No Output Data Files To Upload')
      return S_OK()
  fileMetadata = resultFMD['Value']
  #First get the local (or assigned) SE to try first for upload and others in random fashion
  resultSEL = getDestinationSEList('Tier1-USER', DIRAC.siteName(), outputmode='local')
  if not resultSEL['OK']:
    self.log.error('Could not resolve output data SE', resultSEL['Message'])
    self.setApplicationStatus('Failed To Resolve OutputSE')
    return resultSEL
  localSE = resultSEL['Value']
  # Build the ordered candidate list: local SE first, then the remaining
  # default SEs in random order, with any user-requested SEs prepended.
  orderedSEs = [ se for se in self.defaultOutputSE if se not in localSE and se not in self.userOutputSE]
  orderedSEs = localSE + List.randomize(orderedSEs)
  if self.userOutputSE:
    prependSEs = []
    for userSE in self.userOutputSE:
      if userSE not in orderedSEs:
        prependSEs.append(userSE)
    orderedSEs = prependSEs + orderedSEs
  self.log.info('Ordered list of output SEs is: %s' % (', '.join(orderedSEs)))
  final = {}
  for fileName, metadata in fileMetadata.iteritems():
    final[fileName] = metadata
    final[fileName]['resolvedSE'] = orderedSEs
  #At this point can exit and see exactly what the module will upload
  self.printOutputInfo(final)
  if not self.enable:
    return S_OK('Module is disabled by control flag')
  self.injectJobIndex( final )
  #Instantiate the failover transfer client with the global request object
  failoverTransfer = FailoverTransfer(self._getRequestContainer())
  #One by one upload the files with failover if necessary
  filesToReplicate = {}
  filesToFailover = {}
  filesUploaded = []
  if not self.failoverTest:
    # Helpers fill filesToFailover / filesUploaded / filesToReplicate in place.
    self.transferAndRegisterFiles(final, failoverTransfer, filesToFailover, filesUploaded, filesToReplicate)
  else:
    filesToFailover = final
  ##if there are files to be failovered, we do it now
  resultTRFF = self.transferRegisterAndFailoverFiles(failoverTransfer, filesToFailover, filesUploaded)
  cleanUp = resultTRFF['Value']['cleanUp']
  #For files correctly uploaded must report LFNs to job parameters
  if filesUploaded:
    report = ', '.join( filesUploaded )
    self.jobReport.setJobParameter( 'UploadedOutputData', report )
  self.workflow_commons['Request'] = failoverTransfer.request
  #If some or all of the files failed to be saved to failover
  if cleanUp:
    #Leave any uploaded files just in case it is useful for the user
    #do not try to replicate any files.
    return S_ERROR('Failed To Upload Output Data')
  #If there is now at least one replica for uploaded files can trigger replication
  datMan = DataManager( catalogs = self.userFileCatalog )
  # Give the storage a moment to settle before asking for replication.
  self.log.info('Sleeping for 10 seconds before attempting replication of recently uploaded files')
  time.sleep(10)
  for lfn, repSE in filesToReplicate.items():
    resultRAR = datMan.replicateAndRegister(lfn, repSE)
    if not resultRAR['OK']:
      # Replication failure is non-fatal: at least one replica already exists.
      self.log.info('Replication failed with below error but file already exists in Grid storage with \
at least one replica:\n%s' % (resultRAR))
  self.generateFailoverFile()
  self.setApplicationStatus('Job Finished Successfully')
  return S_OK('Output data uploaded')
def execute(self):
  """ Main execution function.

  Older variant of the user-job finalization: runs only at the last
  workflow step, globs the user's requested output files, constructs
  their LFNs from the proxy owner/VO, uploads them to the user's SEs
  with failover, schedules one extra replica per uploaded file, and
  stores the resulting request in self.workflow_commons['Request'].
  """
  #Have to work out if the module is part of the last step i.e.
  #user jobs can have any number of steps and we only want
  #to run the finalization once.
  currentStep = int(self.step_commons['STEP_NUMBER'])
  totalSteps = int(self.workflow_commons['TotalSteps'])
  if currentStep == totalSteps:
    self.lastStep = True
  else:
    self.log.verbose('Current step = %s, total steps of workflow = %s, UserJobFinalization will enable itself only \
at the last workflow step.' % (currentStep, totalSteps))
  if not self.lastStep:
    #Not last step, do nothing, proceed happily.
    return S_OK()
  result = self.resolveInputVariables()
  if not result['OK']:
    self.log.error("Failed to resolve input parameters:", result['Message'])
    return result
  self.log.info('Initializing %s' % self.version)
  if not self.workflowStatus['OK'] or not self.stepStatus['OK']:
    ##Something went wrong in the step or the workflow, do nothing.
    self.log.verbose('Workflow status = %s, step status = %s' % (self.workflowStatus['OK'], self.stepStatus['OK']))
    return S_OK('No output data upload attempted')
  # Tag the request with this job's identity.
  self.request.RequestName = 'job_%d_request.xml' % int(self.jobID)
  self.request.JobID = self.jobID
  self.request.SourceComponent = "Job_%d" % int(self.jobID)
  if not self.userOutputData:
    self.log.info('No user output data is specified for this job, nothing to do')
    return S_OK('No output data to upload')
  #Determine the final list of possible output files for the
  #workflow and all the parameters needed to upload them.
  outputList = []
  possible_files = []
  for i in self.userOutputData:
    # Each user entry may be a glob pattern matching several local files.
    files = getGlobbedFiles(i)
    for possible_file in files:
      if possible_file in possible_files:
        #Don't have twice the same file
        continue
      outputList.append({'outputDataType': i.split('.')[-1].upper(),  #this would be used to sort the files in different dirs
                         'outputDataSE': self.userOutputSE,
                         'outputFile': os.path.basename(possible_file)})
      possible_files.append(os.path.basename(possible_file))
  self.log.info('Constructing user output LFN(s) for %s' % (', '.join(self.userOutputData)))
  if not self.jobID:
    self.jobID = 12345  # placeholder job ID, presumably for local/test runs — confirm
  # Owner and VO come from the workflow parameters when present,
  # otherwise from the proxy.
  owner = ''
  if 'Owner' in self.workflow_commons:
    owner = self.workflow_commons['Owner']
  else:
    res = getCurrentOwner()
    if not res['OK']:
      self.log.error('Could not find proxy')
      return S_ERROR('Could not obtain owner from proxy')
    owner = res['Value']
  vo = ''
  if self.workflow_commons.has_key('VO'):  # NOTE(review): has_key is Python-2-only
    vo = self.workflow_commons['VO']
  else:
    res = getVOfromProxyGroup()
    if not res['OK']:
      self.log.error('Failed finding the VO')
      return S_ERROR('Could not obtain VO from proxy')
    vo = res['Value']
  result = constructUserLFNs(int(self.jobID), vo, owner, possible_files, self.userOutputPath)
  if not result['OK']:
    self.log.error('Could not create user LFNs', result['Message'])
    return result
  userOutputLFNs = result['Value']
  self.log.verbose('Calling getCandidateFiles( %s, %s)' % (outputList, userOutputLFNs))
  result = self.getCandidateFiles(outputList, userOutputLFNs)
  if not result['OK']:
    # NOTE(review): when ignoreapperrors is set this falls through and reads
    # result['Value'] from an error result below — confirm intended.
    if not self.ignoreapperrors:
      self.log.error(result['Message'])
      self.setApplicationStatus(result['Message'])
      return S_OK()
  fileDict = result['Value']
  result = self.getFileMetadata(fileDict)
  if not result['OK']:
    if not self.ignoreapperrors:
      self.log.error(result['Message'])
      self.setApplicationStatus(result['Message'])
      return S_OK()
  if not result['Value']:
    if not self.ignoreapperrors:
      self.log.info('No output data files were determined to be uploaded for this workflow')
      self.setApplicationStatus('No Output Data Files To Upload')
      return S_OK()
  fileMetadata = result['Value']
  orderedSEs = self.userOutputSE
  self.log.info('Ordered list of output SEs is: %s' % (', '.join(orderedSEs)))
  final = {}
  for fileName, metadata in fileMetadata.items():
    final[fileName] = metadata
    final[fileName]['resolvedSE'] = orderedSEs
  #At this point can exit and see exactly what the module will upload
  if not self.enable:
    self.log.info('Module is disabled by control flag, would have attempted \
to upload the following files %s' % ', '.join(final.keys()))
    for fileName, metadata in final.items():
      self.log.info('--------%s--------' % fileName)
      for n, v in metadata.items():
        self.log.info('%s = %s' % (n, v))
    return S_OK('Module is disabled by control flag')
  #Instantiate the failover transfer client with the global request object
  failoverTransfer = FailoverTransfer(self.request)
  #One by one upload the files with failover if necessary
  replication = {}
  failover = {}
  uploaded = []
  if not self.failoverTest:
    for fileName, metadata in final.items():
      self.log.info("Attempting to store file %s to the following SE(s):\n%s" % (fileName, ', '.join(metadata['resolvedSE'])))
      replicateSE = ''
      # NOTE(review): passes the whole metadata dict as fileMetaDict, not
      # metadata['filedict'] as other variants do — confirm expected shape.
      result = failoverTransfer.transferAndRegisterFile(fileName, metadata['localpath'], metadata['lfn'], metadata['resolvedSE'], fileMetaDict=metadata, fileCatalog=self.userFileCatalog)
      if not result['OK']:
        self.log.error('Could not transfer and register %s with metadata:\n %s' % (fileName, metadata))
        failover[fileName] = metadata
      else:
        #Only attempt replication after successful upload
        lfn = metadata['lfn']
        uploaded.append(lfn)
        seList = metadata['resolvedSE']
        if result['Value'].has_key('uploadedSE'):  # NOTE(review): has_key is Python-2-only
          uploadedSE = result['Value']['uploadedSE']
          # Pick the first SE in the list that is not the one already holding the file.
          for se in seList:
            if not se == uploadedSE:
              replicateSE = se
              break
        if replicateSE and lfn:
          self.log.info('Will attempt to replicate %s to %s' % (lfn, replicateSE))
          replication[lfn] = replicateSE
  else:
    # In failover-test mode every file goes through the failover path.
    failover = final
  cleanUp = False
  for fileName, metadata in failover.items():
    random.shuffle(self.failoverSEs)
    # The first resolved SE stays the final target; the shuffled failover
    # SEs receive the temporary copy.
    targetSE = metadata['resolvedSE'][0]
    metadata['resolvedSE'] = self.failoverSEs
    result = failoverTransfer.transferAndRegisterFileFailover(fileName, metadata['localpath'], metadata['lfn'], targetSE, self.failoverSEs, fileMetaDict=metadata, fileCatalog=self.userFileCatalog)
    if not result['OK']:
      self.log.error('Could not transfer and register %s with metadata:\n %s' % (fileName, metadata))
      cleanUp = True
      continue #for users can continue even if one completely fails
    else:
      lfn = metadata['lfn']
      uploaded.append(lfn)
  #For files correctly uploaded must report LFNs to job parameters
  if uploaded:
    report = ', '.join(uploaded)
    self.jobReport.setJobParameter('UploadedOutputData', report)
  self.request = failoverTransfer.request
  #If some or all of the files failed to be saved to failover
  if cleanUp:
    self.workflow_commons['Request'] = self.request
    #Leave any uploaded files just in case it is useful for the user
    #do not try to replicate any files.
    return S_ERROR('Failed To Upload Output Data')
  #If there is now at least one replica for uploaded files can trigger replication
  rm = ReplicaManager()
  # Give the storage a moment to settle before asking for replication.
  self.log.info('Sleeping for 10 seconds before attempting replication of recently uploaded files')
  time.sleep(10)
  for lfn, repSE in replication.items():
    result = rm.replicateAndRegister(lfn, repSE, catalog=self.userFileCatalog)
    if not result['OK']:
      # Replication failure is non-fatal: at least one replica already exists.
      self.log.info('Replication failed with below error but file already exists in Grid storage with \
at least one replica:\n%s' % (result))
  self.workflow_commons['Request'] = self.request
  self.generateFailoverFile()
  self.setApplicationStatus('Job Finished Successfully')
  return S_OK('Output data uploaded')