def processWorker(inputs, results, resthost, resturi):
    """Wait for a reference to appear in the input queue, call the referenced
    object and write the output in the output queue.

    :arg Queue inputs: the queue where the inputs are shared by the master
    :arg Queue results: the queue where this method writes the output
    :return: zero by default, but the value is not really needed."""
    logger = logging.getLogger()
    procName = multiprocessing.current_process().name
    while True:
        try:
            workid, work, task, inputargs = inputs.get()
        except (EOFError, IOError):
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logger.error(crashMessage)
            break
        if work == 'STOP':
            break

        outputs = None
        t0 = time.time()
        logger.debug("%s: Starting %s on %s" % (procName, str(work), task['tm_taskname']))
        try:
            msg = None
            outputs = work(resthost, resturi, WORKER_CONFIG, task, inputargs)
        except WorkerHandlerException as we:
            outputs = Result(task=task, err=str(we))
            msg = str(we)
        except Exception as exc:
            outputs = Result(task=task, err=str(exc))
            msg = "%s: I just had a failure for %s" % (procName, str(exc))
            msg += "\n\tworkid=" + str(workid)
            msg += "\n\ttask=" + str(task['tm_taskname'])
            msg += "\n" + str(traceback.format_exc())
def execute(self, *args, **kwargs):
    results = []
    for jgroup in args[0]:
        possiblesites = jgroup.jobs[0]['input_files'][0]['locations']
        self.logger.debug("Possible sites == " + str(possiblesites))
        if len(possiblesites) == 0:
            msg = "DLS returned no sites for the block"
            self.logger.error(msg)
            results.append(Result(task=kwargs['task'], result=(jgroup, None, []), err=msg))
            continue

        # use resubmit white/black lists if we have them
        siteWhitelist = kwargs['task']['tm_site_whitelist'] if not kwargs['task']['resubmit_site_whitelist'] \
                        else kwargs['task']['resubmit_site_whitelist']
        siteBlacklist = kwargs['task']['tm_site_blacklist'] if not kwargs['task']['resubmit_site_blacklist'] \
                        else kwargs['task']['resubmit_site_blacklist']
        self.logger.debug("white list == %s" % set(siteWhitelist))
        self.logger.debug("black list == %s" % set(siteBlacklist))

        availablesites = list(set(possiblesites) & set(siteWhitelist)
                              if siteWhitelist
                              else set(possiblesites) - set(siteBlacklist))
        self.logger.info('Available sites == %s' % str(availablesites))
        fixedsites = set(self.config.Sites.available)
        availablesites = list(set(availablesites) & fixedsites)
        if len(availablesites) == 0:
            msg = "No site available before brokering, will skip injection. Check white/black lists"
            self.logger.error(msg)
            results.append(Result(task=kwargs['task'], result=(jgroup, None, []), err=msg))
            continue

        self.logger.info("Asking PanDA for the best site among %s. Using %s as pandaserver."
                         % (str(availablesites), self.pandaurls['baseURLSSL']))
        selectedsite = runBrokerage(self.pandaurls['baseURLSSL'],
                                    proxy=kwargs['task']['user_proxy'],
                                    sites=self.translateSiteName(availablesites))[-1]
        self.logger.info("Selected site after brokering: " + str(selectedsite))
        if not selectedsite:
            msg = "No site available after brokering, will skip injection"
            self.logger.error(msg)
            results.append(Result(task=kwargs['task'], result=(jgroup, None, []), err=msg))
            continue
        else:
            results.append(Result(task=kwargs['task'], result=(jgroup, selectedsite, availablesites)))
    return results
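# A minimal standalone sketch (not part of the TaskWorker code) of the site-list
# arithmetic used above: when a whitelist is given it wins and is intersected with
# the possible sites, otherwise the blacklist is subtracted. Site names below are
# purely illustrative.
possiblesites = {'T2_IT_Bari', 'T2_US_UCSD', 'T2_DE_DESY'}
siteWhitelist = {'T2_US_UCSD'}
siteBlacklist = {'T2_DE_DESY'}

available = list(set(possiblesites) & set(siteWhitelist)
                 if siteWhitelist
                 else set(possiblesites) - set(siteBlacklist))
assert available == ['T2_US_UCSD']  # without the whitelist it would be all sites minus T2_DE_DESY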
def execute(self, resthost, resturi, config, task, procnum):
    try:
        self.logger.info("Executing %s" % task)
        self._execute(resthost, resturi, config, task)
        return Result(task=task['tm_taskname'], result="OK")
    except Exception as ex:
        self.logger.error("Error while running recurring action.")
        self.logger.exception(ex)
        return Result(task=task['tm_taskname'], result="KO")
def execute(self, resthost, dbinstance, config, task, procnum):
    try:
        self.logger.info("Executing %s", task)
        self._execute(config, task)
        return Result(task=task['tm_taskname'], result="OK")
    except Exception as ex:
        self.logger.error("Error while running recurring action.")
        self.logger.exception(ex)
        return Result(task=task['tm_taskname'], err="RecurringAction FAILED")
def execute(self, *args, **kwargs): self.logger.info("Getting already existing specs ") status, pandaspecs = getFullJobStatus( self.backendurls['baseURLSSL'], ids=kwargs['task']['resubmit_ids'], proxy=kwargs['task']['user_proxy']) return Result(task=kwargs['task'], result=pandaspecs)
def executeInternal(self, *args, **kw):
    tempDir = args[0][0]
    inputFiles = args[0][3]
    splitterResult = args[0][4]

    cwd = os.getcwd()
    try:
        os.chdir(tempDir)
        splittingSummary = SplittingSummary(kw['task']['tm_split_algo'])
        for jobgroup in splitterResult:
            jobs = jobgroup.getJobs()
            splittingSummary.addJobs(jobs)
        splittingSummary.dump('splitting-summary.json')
        inputFiles.append('splitting-summary.json')

        self.packSandbox(inputFiles)

        self.logger.info('Uploading dry run tarball to the user file cache')
        ufc = UserFileCache(dict={'cert': kw['task']['user_proxy'],
                                  'key': kw['task']['user_proxy'],
                                  'endpoint': kw['task']['tm_cache_url']})
        result = ufc.uploadLog('dry-run-sandbox.tar.gz')
        os.remove('dry-run-sandbox.tar.gz')
        if 'hashkey' not in result:
            raise TaskWorkerException('Failed to upload dry-run-sandbox.tar.gz to the user file cache: ' + str(result))
        else:
            self.logger.info('Uploaded dry run tarball to the user file cache: ' + str(result))
            update = {'workflow': kw['task']['tm_taskname'], 'subresource': 'state', 'status': 'UPLOADED'}
            self.logger.debug('Updating task status: %s' % str(update))
            self.server.post(self.resturi, data=urllib.urlencode(update))
    finally:
        os.chdir(cwd)

    return Result(task=kw['task'], result=args[0])
def execute(self, *args, **kwargs):
    wmwork = Workflow(name=kwargs['task']['tm_taskname'])

    wmsubs = Subscription(fileset=args[0], workflow=wmwork,
                          split_algo=kwargs['task']['tm_split_algo'],
                          type=self.jobtypeMapper[kwargs['task']['tm_job_type']])
    splitter = SplitterFactory()
    jobfactory = splitter(subscription=wmsubs)
    splitparam = kwargs['task']['tm_split_args']
    splitparam['algorithm'] = kwargs['task']['tm_split_algo']
    factory = jobfactory(**splitparam)
    if len(factory) == 0:
        # No jobs could be created with the given splitting arguments and
        # input dataset information. We assume splitting cannot succeed,
        # so the task is marked as failed.
        msg = "Splitting %s on %s with %s does not generate any job" % (kwargs['task']['tm_taskname'],
                                                                        kwargs['task']['tm_input_dataset'],
                                                                        kwargs['task']['tm_split_algo'])
        self.logger.error("Setting %s as failed" % str(kwargs['task']['tm_taskname']))
        configreq = {'workflow': kwargs['task']['tm_taskname'],
                     'status': "FAILED",
                     'subresource': 'failure',
                     'failure': b64encode(msg)}
        self.server.post(self.resturl, data=urllib.urlencode(configreq))
        raise StopHandler(msg)

    return Result(task=kwargs['task'], result=factory)
def execute(self, *args, **kwargs):  # pylint: disable=unused-argument
    # Since https://github.com/dmwm/CRABServer/issues/5633 totalunits can be a float,
    # but that would confuse WMCore, therefore cast to int.
    totalevents = int(kwargs['task']['tm_totalunits'])
    firstEvent = 1
    lastEvent = totalevents
    firstLumi = 1
    lastLumi = 10

    # Set a default of 100 events per lumi. This is set as a task
    # property, as the splitting considers it independently of the file
    # information provided by the fake dataset.
    if not kwargs['task']['tm_events_per_lumi']:
        kwargs['task']['tm_events_per_lumi'] = 100

    # MC comes with only one MCFakeFile
    singleMCFileset = Fileset(name="MCFakeFileSet")
    newFile = File("MCFakeFile", size=1000, events=totalevents)
    newFile.setLocation(self.getListOfSites())
    newFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
    newFile["block"] = 'MCFakeBlock'
    newFile["first_event"] = firstEvent
    newFile["last_event"] = lastEvent
    singleMCFileset.addFile(newFile)

    return Result(task=kwargs['task'], result=singleMCFileset)
def execute(self, *args, **kwargs):
    result = None
    proxycfg = {'vo': kwargs['task']['tm_user_vo'],
                'logger': self.logger,
                'myProxySvr': self.config.Services.MyProxy,
                'proxyValidity': '144:0',
                'min_time_left': 36000,  ## do we need this? or should we use self.myproxylen?
                'userDN': kwargs['task']['tm_user_dn'],
                'group': kwargs['task']['tm_user_group'] if kwargs['task']['tm_user_group'] else '',
                'role': kwargs['task']['tm_user_role'] if kwargs['task']['tm_user_role'] else '',
                'server_key': self.config.MyProxy.serverhostkey,
                'server_cert': self.config.MyProxy.serverhostcert,
                'serverDN': self.config.MyProxy.serverdn,
                'uisource': getattr(self.config.MyProxy, 'uisource', ''),
                'credServerPath': self.config.MyProxy.credpath,
                'myproxyAccount': self.server['host'],
                'cleanEnvironment': getattr(self.config.MyProxy, 'cleanEnvironment', False)}

    proxy = Proxy(proxycfg)
    userproxy = proxy.getProxyFilename(serverRenewer=True)
    proxy.logonRenewMyProxy()
    timeleft = proxy.getTimeLeft(userproxy)
    if timeleft is None or timeleft <= 0:
        msg = "Impossible to retrieve proxy from %s for %s." % (proxycfg['myProxySvr'], proxycfg['userDN'])
        raise TaskWorkerException(msg)
    else:
        kwargs['task']['user_proxy'] = userproxy
        result = Result(task=kwargs['task'], result='OK')
    return result
def execute(self, *args, **kwargs):
    totalevents = kwargs['task']['tm_totalunits']
    firstEvent = 1
    lastEvent = totalevents
    firstLumi = 1
    lastLumi = 10

    # Set a default of 100 events per lumi. This is set as a task
    # property, as the splitting considers it independently of the file
    # information provided by the fake dataset.
    if not kwargs['task']['tm_events_per_lumi']:
        kwargs['task']['tm_events_per_lumi'] = 100

    # MC comes with only one MCFakeFile
    singleMCFileset = Fileset(name="MCFakeFileSet")
    newFile = File("MCFakeFile", size=1000, events=totalevents)
    if hasattr(self.config.Sites, 'available'):
        newFile.setLocation(self.config.Sites.available)
    else:
        sbj = SiteDBJSON({"key": self.config.TaskWorker.cmskey,
                          "cert": self.config.TaskWorker.cmscert})
        newFile.setLocation(sbj.getAllCMSNames())
    newFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
    newFile["block"] = 'MCFakeBlock'
    newFile["first_event"] = firstEvent
    newFile["last_event"] = lastEvent
    singleMCFileset.addFile(newFile)
    return Result(task=kwargs['task'], result=singleMCFileset)
class Splitter(TaskAction):
    """Performing the split operation depending on the received input
       and arguments"""

    def execute(self, *args, **kwargs):
        wmwork = Workflow(name=kwargs['task']['tm_taskname'])
        wmsubs = Subscription(fileset=args[0], workflow=wmwork,
                              split_algo=kwargs['task']['tm_split_algo'],
                              type=self.jobtypeMapper[kwargs['task']['tm_job_type']])
        splitter = SplitterFactory()
        jobfactory = splitter(subscription=wmsubs)
        splitparam = kwargs['task']['tm_split_args']
        splitparam['algorithm'] = kwargs['task']['tm_split_algo']
        if kwargs['task']['tm_job_type'] == 'Analysis':
            if kwargs['task']['tm_split_algo'] == 'FileBased':
                splitparam['total_files'] = kwargs['task']['tm_totalunits']
            elif kwargs['task']['tm_split_algo'] == 'LumiBased':
                splitparam['total_lumis'] = kwargs['task']['tm_totalunits']
        elif kwargs['task']['tm_job_type'] == 'PrivateMC':
            if 'tm_events_per_lumi' in kwargs['task'] and kwargs['task']['tm_events_per_lumi']:
                splitparam['events_per_lumi'] = kwargs['task']['tm_events_per_lumi']
            if 'tm_generator' in kwargs['task'] and kwargs['task']['tm_generator'] == 'lhe':
                splitparam['lheInputFiles'] = True
        splitparam['applyLumiCorrection'] = True
        factory = jobfactory(**splitparam)
        if len(factory) == 0:
            raise TaskWorkerException("The CRAB3 server backend could not submit any job to the Grid scheduler:\n" +
                                      "splitting task %s on dataset %s with %s method does not generate any job" %
                                      (kwargs['task']['tm_taskname'],
                                       kwargs['task']['tm_input_dataset'],
                                       kwargs['task']['tm_split_algo']))

        # printing duplicated lumis if any
        lumiChecker = getattr(jobfactory, 'lumiChecker', None)
        if lumiChecker and lumiChecker.splitLumiFiles:
            self.logger.warning("The input dataset contains the following duplicated lumis %s"
                                % lumiChecker.splitLumiFiles.keys())
            try:
                configreq = {'subresource': 'addwarning',
                             'workflow': kwargs['task']['tm_taskname'],
                             'warning': b64encode('The CRAB3 server backend detected lumis split across files in the input dataset.'
                                                  ' Will apply the necessary corrections in the splitting algorithms')}
                self.server.post(self.restURInoAPI + '/task', data=urllib.urlencode(configreq))
            except Exception as e:
                self.logger.error(e.headers)
                self.logger.warning("Cannot add warning to REST after finding duplicates")

        return Result(task=kwargs['task'], result=factory)
def executeInternal(self, *args, **kw):
    inputFiles = args[0][2]
    splitterResult = args[0][3][0]

    cwd = os.getcwd()
    try:
        os.chdir(kw['tempDir'])
        splittingSummary = SplittingSummary(kw['task']['tm_split_algo'])
        for jobgroup in splitterResult:
            jobs = jobgroup.getJobs()
            splittingSummary.addJobs(jobs)
        splittingSummary.dump('splitting-summary.json')
        inputFiles.append('splitting-summary.json')

        self.packSandbox(inputFiles)

        self.logger.info('Uploading dry run tarball to the user file cache')
        if 'S3' in kw['task']['tm_cache_url'].upper():
            uploadToS3(crabserver=self.crabserver, filepath='dry-run-sandbox.tar.gz',
                       objecttype='runtimefiles', taskname=kw['task']['tm_taskname'], logger=self.logger)
            result = {'hashkey': 'ok'}  # a dummy one to keep same semantics as when using UserFileCache
            os.remove('dry-run-sandbox.tar.gz')
        else:
            ufc = UserFileCache(mydict={'cert': kw['task']['user_proxy'],
                                        'key': kw['task']['user_proxy'],
                                        'endpoint': kw['task']['tm_cache_url']})
            result = ufc.uploadLog('dry-run-sandbox.tar.gz')
            os.remove('dry-run-sandbox.tar.gz')
            if 'hashkey' not in result:
                raise TaskWorkerException('Failed to upload dry-run-sandbox.tar.gz to the user file cache: ' + str(result))
        self.logger.info('Uploaded dry run tarball to the user file cache: %s', str(result))
        update = {'workflow': kw['task']['tm_taskname'], 'subresource': 'state', 'status': 'UPLOADED'}
        self.logger.debug('Updating task status: %s', str(update))
        self.crabserver.post(api='workflowdb', data=urllib.urlencode(update))
    finally:
        os.chdir(cwd)

    return Result(task=kw['task'], result=args[0])
def execute(self, *args, **kwargs): """ This Action does something useful in case the user did not specify a lumi mask. In this case the report command is meaningless as it will never report if there are unanalyzed lumis. So, we build a lumimask starting from the infos coming from the DBS discovery and we push it to the crab REST interface. """ files = args[0] if not kwargs['task']['tm_split_args']['lumis'] and not kwargs['task'][ 'tm_split_args']['runs']: self.logger.info( "Reconstructing lumimask as the user did not specify it") lumilists = {} self.runs = [] self.lumis = [] #Take all the files and create one dict containing all the lumi to analyze for f in files: for run in f['runs']: if run.run not in lumilists: lumilists[run.run] = run.lumis else: lumilists[run.run] += run.lumis self.logger.debug("Lumilist reconstructed: %s" % lumilists) #Take the dict containing the lumilist (format: {run1 : [lumi1, lumi2 ...], run2 : [lumi1, lumi2 ...] ...}), #group the lumis in the same range (1,2,3,4 => [1,4]) and prepare the runs and lumis as expected by the REST. Example: #Input: #lumilist = {2L: [1L, 2L, 3L, 8L, 9L, 4L, 5L, 20L, 21L, 22L], 3L: [11L, 12L, 13L], 4L: [1L, 2L, 5L, 6L, 7L, 100L]} #Output: #runs = [2', '3', '4'] #lumis = ['1,5,8,9,20,22', '11,13', '1,2,5,7,100,100'] for run in lumilists: self.runs.append(str(run)) self.lumis.append(','.join([ (lambda currLumi=consLumis.next(), numConsLumi=sum( 1 for _ in consLumis): "%s,%s" % (currLumi, currLumi + numConsLumi))() for _, consLumis in groupby( sorted(lumilists[run]), lambda x, c=count(): c.next() - x) ])) configreq = { 'workflow': kwargs['task']['tm_taskname'], 'subresource': 'lumimask', } self.server.post(self.resturl, data=urllib.urlencode(configreq) + '&runs='.join([''] + self.runs) + '&lumis='.join([''] + self.lumis)) result = Result(task=kwargs['task'], result=files) return result
def execute(self, *args, **kwargs):
    result = None
    proxycfg = {'vo': kwargs['task']['tm_user_vo'],
                'logger': self.logger,
                'myProxySvr': self.config.Services.MyProxy,
                'proxyValidity': '24:0',
                'min_time_left': 36000,  ## do we need this? or should we use self.myproxylen?
                'userDN': kwargs['task']['tm_user_dn'],
                'group': kwargs['task']['tm_user_group'] if kwargs['task']['tm_user_group'] else '',
                'role': kwargs['task']['tm_user_role'] if kwargs['task']['tm_user_role'] else '',
                'server_key': self.config.MyProxy.serverhostkey,
                'server_cert': self.config.MyProxy.serverhostcert,
                'serverDN': self.config.MyProxy.serverdn,
                'uisource': self.config.MyProxy.uisource,
                'credServerPath': self.config.MyProxy.credpath,
               }

    proxy = Proxy(proxycfg)
    userproxy = proxy.getProxyFilename(serverRenewer=True)
    proxy.logonRenewMyProxy()
    timeleft = proxy.getTimeLeft(userproxy)
    if timeleft is None or timeleft <= 0:
        msg = "Impossible to retrieve proxy from %s for %s." % (proxycfg['myProxySvr'], proxycfg['userDN'])
        self.logger.error("Setting %s as failed" % str(kwargs['task']['tm_taskname']))
        configreq = {'workflow': kwargs['task']['tm_taskname'],
                     'status': "FAILED",
                     'subresource': 'failure',
                     'failure': b64encode(msg)}
        self.logger.error(str(configreq))
        self.server.post(self.resturl, data=urllib.urlencode(configreq))
        raise StopHandler(msg)
    else:
        kwargs['task']['user_proxy'] = userproxy
        result = Result(task=kwargs['task'], result='OK')
    return result
def execute(self, *args, **kwargs): self.logger.info( "Data discovery and splitting for %s using user-provided files" % kwargs['task']['tm_taskname']) userfiles = kwargs['task']['tm_arguments'].get('userfiles') splitting = kwargs['task']['tm_split_algo'] total_units = kwargs['task']['tm_totalunits'] if not userfiles or splitting != 'FileBased': if not userfiles: msg = "No files specified to process for task %s." % kwargs[ 'task']['tm_taskname'] if splitting != 'FileBased': msg = "Data.splitting must be set to 'FileBased' when using a custom set of files." self.logger.error("Setting %s as failed: %s" % (kwargs['task']['tm_taskname'], msg)) configreq = { 'workflow': kwargs['task']['tm_taskname'], 'status': "FAILED", 'subresource': 'failure', 'failure': b64encode(msg) } self.server.post(self.resturi, data=urllib.urlencode(configreq)) raise StopHandler(msg) if hasattr(self.config.Sites, 'available'): locations = self.config.Sites.available else: sbj = SiteDBJSON({ "key": self.config.TaskWorker.cmskey, "cert": self.config.TaskWorker.cmscert }) locations = sbj.getAllCMSNames() userFileset = Fileset(name=kwargs['task']['tm_taskname']) self.logger.info("There are %d files specified by the user." % len(userfiles)) if total_units > 0: self.logger.info("Will run over the first %d files." % total_units) file_counter = 0 for userfile, idx in zip(userfiles, range(len(userfiles))): newFile = File(userfile, size=1000, events=1) newFile.setLocation(locations) newFile.addRun(Run(1, idx)) newFile["block"] = 'UserFilesFakeBlock' newFile["first_event"] = 1 newFile["last_event"] = 2 userFileset.addFile(newFile) file_counter += 1 if total_units > 0 and file_counter >= total_units: break return Result(task=kwargs['task'], result=userFileset)
class DataDiscovery(TaskAction):
    """I am the abstract class for the data discovery.
       Taking care of generalizing different data discovery
       possibilities. Implementing only a common method to return
       a properly formatted output."""

    def formatOutput(self, task, requestname, datasetfiles, locations):
        """Receives as input the result of the data location
           discovery operations and fills up the WMCore objects."""
        self.logger.debug(" Formatting data discovery output ")
        # TEMPORARY
        secmsmap = {}
        sbj = SiteDBJSON({"key": self.config.MyProxy.serverhostkey,
                          "cert": self.config.MyProxy.serverhostcert})

        wmfiles = []
        lumicounter = evecounter = 0
        for lfn, infos in datasetfiles.iteritems():
            wmfile = File(lfn=lfn, events=infos['NumberOfEvents'], size=infos['Size'], checksums=infos['Checksums'])
            wmfile['block'] = infos['BlockName']
            wmfile['locations'] = []
            if infos['BlockName'] in locations:
                for se in locations[infos['BlockName']]:
                    if se not in secmsmap:
                        self.logger.debug("Translating SE %s" % se)
                        try:
                            secmsmap[se] = sbj.seToCMSName(se)
                        except KeyError:
                            self.logger.error("Impossible to translate %s to a CMS name through SiteDB" % se)
                            secmsmap[se] = ''
                    if se in secmsmap:
                        if isinstance(secmsmap[se], list):
                            wmfile['locations'].extend(secmsmap[se])
                        else:
                            wmfile['locations'].append(secmsmap[se])
            wmfile['workflow'] = requestname
            evecounter += infos['NumberOfEvents']
            for run, lumis in infos['Lumis'].iteritems():
                #self.logger.debug(' - adding run %d and lumis %s' % (run, lumis))
                wmfile.addRun(Run(run, *lumis))
                lumicounter += len(lumis)
            wmfiles.append(wmfile)

        self.logger.debug('Tot events found: %d' % evecounter)
        self.logger.debug('Tot lumis found: %d' % lumicounter)
        self.logger.debug('Tot files found: %d' % len(wmfiles))
        return Result(task=task, result=Fileset(name='FilesToSplit', files=set(wmfiles)))
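# A generic standalone sketch of the caching pattern used above (illustrative
# only): expensive name translations are looked up once and memoized in a dict,
# with failures cached as '' so they are not retried for every file. The names
# and lookup table are made up.
def translate(name, cache, lookup):
    if name not in cache:
        try:
            cache[name] = lookup(name)
        except KeyError:
            cache[name] = ''  # remember the failure, too
    return cache[name]

cache = {}
assert translate('srm.example.org', cache, {'srm.example.org': 'T2_XX_Example'}.__getitem__) == 'T2_XX_Example'
assert translate('unknown.example.org', cache, {}.__getitem__) == ''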
def actionWork(self, *args, **kwargs): """Performing the set of actions""" nextinput = args for work in self.getWorks(): self.logger.debug("Starting %s on %s" % (str(work), self._task['tm_taskname'])) t0 = time.time() try: output = work.execute(nextinput, task=self._task) except StopHandler, sh: msg = "Controlled stop of handler for %s on %s " % (self._task, str(sh)) self.logger.error(msg) nextinput = Result(task=self._task, result='StopHandler exception received, controlled stop') break #exit normally. Worker will not notice there was an error except TaskWorkerException, twe: self.logger.debug(str(traceback.format_exc())) #print the stacktrace only in debug mode raise WorkerHandlerException(str(twe)) #TaskWorker error, do not add traceback to the error propagated to the REST
def execute(self, *args, **kwargs): self.logger.info( "Data discovery and splitting for %s using user-provided files" % kwargs['task']['tm_taskname']) userfiles = kwargs['task']['tm_user_files'] splitting = kwargs['task']['tm_split_algo'] total_units = kwargs['task']['tm_totalunits'] if not userfiles or splitting != 'FileBased': if not userfiles: msg = "No files specified to process for task %s." % kwargs[ 'task']['tm_taskname'] if splitting != 'FileBased': msg = "Data.splitting must be set to 'FileBased' when using a custom set of files." raise TaskWorkerException(msg) if hasattr(self.config.Sites, 'available'): locations = self.config.Sites.available else: with self.config.TaskWorker.envForCMSWEB: configDict = { "cacheduration": 1, "pycurl": True } # cache duration is in hours resourceCatalog = CRIC(logger=self.logger, configDict=configDict) locations = resourceCatalog.getAllPSNs() userFileset = Fileset(name=kwargs['task']['tm_taskname']) self.logger.info("There are %d files specified by the user." % len(userfiles)) if total_units > 0: self.logger.info("Will run over the first %d files." % total_units) file_counter = 0 for userfile, idx in zip(userfiles, range(len(userfiles))): newFile = File(userfile, size=1000, events=1) newFile.setLocation(locations) newFile.addRun(Run(1, idx)) newFile["block"] = 'UserFilesFakeBlock' newFile["first_event"] = 1 newFile["last_event"] = 2 userFileset.addFile(newFile) file_counter += 1 if total_units > 0 and file_counter >= total_units: break return Result(task=kwargs['task'], result=userFileset)
def execute(self, *args, **kwargs):
    totalevents = kwargs['task']['tm_totalunits']
    firstEvent = 1
    lastEvent = totalevents
    firstLumi = 1
    lastLumi = 10

    # MC comes with only one MCFakeFile
    singleMCFileset = Fileset(name="MCFakeFileSet")
    newFile = File("MCFakeFile", size=1000, events=totalevents)
    newFile.setLocation(self.config.Sites.available)
    newFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
    newFile["block"] = 'MCFakeBlock'
    newFile["first_event"] = firstEvent
    newFile["last_event"] = lastEvent
    singleMCFileset.addFile(newFile)
    return Result(task=kwargs['task'], result=singleMCFileset)
def actionWork(self, *args, **kwargs): """Performing the set of actions""" nextinput = args for work in self.getWorks(): self.logger.debug("Starting %s on %s" % (str(work), self._task['tm_taskname'])) t0 = time.time() try: output = work.execute(nextinput, task=self._task) except StopHandler, sh: msg = "Controlled stop of handler for %s on %s " % (self._task, str(sh)) self.logger.error(msg) nextinput = Result( task=self._task, result='StopHandler exception received, controlled stop') break except Exception, exc: msg = "Problem handling %s because of %s failure, tracebak follows\n" % ( self._task['tm_taskname'], str(exc)) msg += str(traceback.format_exc()) self.logger.error(msg) raise WorkerHandlerException(msg)
def execute(self, *args, **kwargs):
    result = None
    proxycfg = {'vo': kwargs['task']['tm_user_vo'],
                'logger': self.logger,
                'myProxySvr': self.config.Services.MyProxy,
                'proxyValidity': '144:0',
                'min_time_left': 36000,  ## do we need this? or should we use self.myproxylen?
                'userName': kwargs['task']['tm_username'] + '_CRAB',
                'userDN': kwargs['task']['tm_user_dn'],
                'group': kwargs['task']['tm_user_group'] if kwargs['task']['tm_user_group'] else '',
                'role': kwargs['task']['tm_user_role'] if kwargs['task']['tm_user_role'] else '',
                'server_key': self.config.MyProxy.serverhostkey,
                'server_cert': self.config.MyProxy.serverhostcert,
                'serverDN': 'dummy',  # this is only used inside WMCore/Proxy.py functions not used by CRAB
                'uisource': getattr(self.config.MyProxy, 'uisource', ''),
                'credServerPath': self.config.MyProxy.credpath,
                'cleanEnvironment': getattr(self.config.MyProxy, 'cleanEnvironment', False)}

    try:
        self.logger.info("try first to retrieve credential with login name %s", proxycfg['userName'])
        (userproxy, usergroups) = self.tryProxyLogon(proxycfg=proxycfg)
    except TaskWorkerException:
        self.logger.error("proxy retrieval from %s failed with login name %s.",
                          proxycfg['myProxySvr'], proxycfg['userName'])
        self.logger.error("will try with old-style DN hash")
        del proxycfg['userName']
        try:
            (userproxy, usergroups) = self.tryProxyLogon(proxycfg=proxycfg)
        except TaskWorkerException as ex:
            self.logger.error("proxy retrieval from %s failed with DN hash as credential name.",
                              proxycfg['myProxySvr'])
            raise TaskWorkerException(str(ex))
    # minimal sanity check. Submission will fail if there's no group
    if not usergroups:
        raise TaskWorkerException('Could not retrieve VOMS groups list from %s' % userproxy)
    kwargs['task']['user_proxy'] = userproxy
    kwargs['task']['user_groups'] = usergroups
    self.logger.debug("Valid proxy for %s now in %s", proxycfg['userDN'], userproxy)
    result = Result(task=kwargs['task'], result='OK')
    return result
def execute(self, *args, **kwargs):
    wmwork = Workflow(name=kwargs['task']['tm_taskname'])
    maxJobs = getattr(self.config.TaskWorker, 'maxJobsPerTask', 10000)

    data = args[0]
    splitparam = kwargs['task']['tm_split_args']
    splitparam['algorithm'] = kwargs['task']['tm_split_algo']
    if kwargs['task']['tm_job_type'] == 'Analysis':
        totalUnits = kwargs['task']['tm_totalunits']
        if kwargs['task']['tm_split_algo'] == 'FileBased':
            if totalUnits < 1.0:
                totalUnits = int(totalUnits * len(data.getFiles()) + 0.5)
            splitparam['total_files'] = totalUnits
        elif kwargs['task']['tm_split_algo'] == 'LumiBased':
            if totalUnits < 1.0:
                totalUnits = int(totalUnits * sum(len(run.lumis) for f in data.getFiles() for run in f['runs']) + 0.5)
            splitparam['total_lumis'] = totalUnits
        elif kwargs['task']['tm_split_algo'] == 'EventAwareLumiBased':
            if totalUnits < 1.0:
                totalUnits = int(totalUnits * sum(f['events'] for f in data.getFiles()) + 0.5)
            splitparam['total_events'] = totalUnits
        elif kwargs['task']['tm_split_algo'] == 'Automatic':
            # REST backwards compatibility fix
            if 'seconds_per_job' in kwargs['task']['tm_split_args']:
                kwargs['task']['tm_split_args']['minutes_per_job'] = kwargs['task']['tm_split_args'].pop('seconds_per_job')
            splitparam['algorithm'] = 'FileBased'
            splitparam['total_files'] = len(data.getFiles())
            numProbes = getattr(self.config.TaskWorker, 'numAutomaticProbes', 5)
            splitparam['files_per_job'] = (len(data.getFiles()) + numProbes - 1) // numProbes
    elif kwargs['task']['tm_job_type'] == 'PrivateMC':
        if 'tm_events_per_lumi' in kwargs['task'] and kwargs['task']['tm_events_per_lumi']:
            splitparam['events_per_lumi'] = kwargs['task']['tm_events_per_lumi']
        if 'tm_generator' in kwargs['task'] and kwargs['task']['tm_generator'] == 'lhe':
            splitparam['lheInputFiles'] = True
    splitparam['applyLumiCorrection'] = True

    wmsubs = Subscription(fileset=data, workflow=wmwork,
                          split_algo=splitparam['algorithm'],
                          type=self.jobtypeMapper[kwargs['task']['tm_job_type']])
    try:
        splitter = SplitterFactory()
        jobfactory = splitter(subscription=wmsubs)
        factory = jobfactory(**splitparam)
        numJobs = sum([len(jobgroup.getJobs()) for jobgroup in factory])
    except RuntimeError:
        msg = "The splitting on your task generated more than {0} jobs (the maximum).".format(maxJobs)
        raise TaskWorkerException(msg)
    if numJobs == 0:
        msg = "The CRAB3 server backend could not submit any job to the Grid scheduler:"
        msg += " splitting task %s" % (kwargs['task']['tm_taskname'])
        if kwargs['task']['tm_input_dataset']:
            msg += " on dataset %s" % (kwargs['task']['tm_input_dataset'])
        msg += " with %s method does not generate any job. See\n" % (kwargs['task']['tm_split_algo'])
        msg += "https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3FAQ#crab_submit_fails_with_Splitting"
        raise TaskWorkerException(msg)
    elif numJobs > maxJobs:
        raise TaskWorkerException("The splitting on your task generated %s jobs. The maximum number of jobs in each task is %s"
                                  % (numJobs, maxJobs))

    minRuntime = getattr(self.config.TaskWorker, 'minAutomaticRuntimeMins', 180)
    if kwargs['task']['tm_split_algo'] == 'Automatic' and \
       kwargs['task']['tm_split_args']['minutes_per_job'] < minRuntime:
        msg = "Minimum runtime requirement for automatic splitting is {0} minutes.".format(minRuntime)
        raise TaskWorkerException(msg)

    # printing duplicated lumis if any
    lumiChecker = getattr(jobfactory, 'lumiChecker', None)
    if lumiChecker and lumiChecker.splitLumiFiles:
        self.logger.warning("The input dataset contains the following duplicated lumis %s",
                            lumiChecker.splitLumiFiles.keys())
        msg = "The CRAB3 server backend detected lumis split across files in the input dataset."
        msg += " Will apply the necessary corrections in the splitting algorithm. You can ignore this message."
        self.uploadWarning(msg, kwargs['task']['user_proxy'], kwargs['task']['tm_taskname'])

    return Result(task=kwargs['task'], result=(factory, args[0]))
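# A small illustrative sketch (not TaskWorker code) of the fractional-units rule
# used in the Analysis branch above: a totalunits value below 1.0 is read as
# "this fraction of the dataset", rounded half up to a whole unit count. The
# function name and numbers are made up.
def resolve_total_units(total_units, units_in_dataset):
    """Return an absolute unit count, interpreting values < 1.0 as fractions."""
    if total_units < 1.0:
        return int(total_units * units_in_dataset + 0.5)  # round half up
    return total_units

assert resolve_total_units(0.5, 101) == 51   # half the dataset, rounded
assert resolve_total_units(200, 101) == 200  # absolute counts pass through unchanged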
def processWorkerLoop(inputs, results, resthost, dbInstance, procnum, logger, logsDir):
    procName = "Process-%s" % procnum
    while True:
        try:
            ## Get (and remove) an item from the input queue. If the queue is empty, wait
            ## until an item is available. Item content is:
            ##  workid     : an integer assigned by the queue module
            ##  work       : a function handler to the needed action, e.g. handleNewTask
            ##  task       : a task dictionary
            ##  failstatus : the status to assign to the task if work fails (e.g. 'SUBMITFAILED')
            workid, work, task, failstatus, inputargs = inputs.get()
            if work == 'STOP':
                break
            taskhandler = addTaskLogHandler(logger, task['tm_username'], task['tm_taskname'], logsDir)
        except (EOFError, IOError):
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logger.error(crashMessage)
            break

        outputs = None
        t0 = time.time()
        # The log entry below is used for log parsing; changing it might require updating the logstash configuration.
        logger.debug("%s: Starting %s on %s", procName, str(work), task['tm_taskname'])
        try:
            msg = None
            outputs = work(resthost, dbInstance, WORKER_CONFIG, task, procnum, inputargs)
        except TapeDatasetException as tde:
            outputs = Result(task=task, err=str(tde))
        except WorkerHandlerException as we:
            outputs = Result(task=task, err=str(we))
            msg = str(we)
        except Exception as exc:  # pylint: disable=broad-except
            outputs = Result(task=task, err=str(exc))
            msg = "%s: I just had a failure for %s" % (procName, str(exc))
            msg += "\n\tworkid=" + str(workid)
            msg += "\n\ttask=" + str(task['tm_taskname'])
            msg += "\n" + str(traceback.format_exc())
        finally:
            if msg:
                crabserver = CRABRest(resthost, WORKER_CONFIG.TaskWorker.cmscert,
                                      WORKER_CONFIG.TaskWorker.cmskey, retry=20,
                                      logger=logger, userAgent='CRABTaskWorker')
                crabserver.setDbInstance(dbInstance)
                failTask(task['tm_taskname'], crabserver, msg, logger, failstatus)
        t1 = time.time()
        workType = task.get('tm_task_command', 'RECURRING')
        # The log entry below is used for log parsing; changing it might require updating the logstash configuration.
        logger.debug("%s: %s work on %s completed in %d seconds: %s",
                     procName, workType, task['tm_taskname'], t1 - t0, outputs)
        try:
            # Sum the RSS column (KB) reported by ps for this process and convert it to MB.
            out, _, _ = executeCommand("ps u -p %s | awk '{sum=sum+$6}; END {print sum/1024}'" % os.getpid())
            msg = "RSS after finishing %s: %s MB" % (task['tm_taskname'], out.strip())
            logger.debug(msg)
        except Exception:
            logger.exception("Problem getting worker RSS:")
        removeTaskLogHandler(logger, taskhandler)

        results.put({'workid': workid, 'out': outputs})
def processWorkerLoop(inputs, results, resthost, resturi, procnum, logger):
    procName = "Process-%s" % procnum
    while True:
        try:
            ## Get (and remove) an item from the input queue. If the queue is empty, wait
            ## until an item is available.
            workid, work, task, failstatus, inputargs = inputs.get()
            if work == 'STOP':
                break
            taskhandler = addTaskLogHandler(logger, task['tm_username'], task['tm_taskname'])
        except (EOFError, IOError):
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logger.error(crashMessage)
            break

        outputs = None
        t0 = time.time()
        logger.debug("%s: Starting %s on %s", procName, str(work), task['tm_taskname'])
        try:
            msg = None
            outputs = work(resthost, resturi, WORKER_CONFIG, task, procnum, inputargs)
        except TapeDatasetException as tde:
            outputs = Result(task=task, err=str(tde))
        except WorkerHandlerException as we:
            outputs = Result(task=task, err=str(we))
            msg = str(we)
        except Exception as exc:  # pylint: disable=broad-except
            outputs = Result(task=task, err=str(exc))
            msg = "%s: I just had a failure for %s" % (procName, str(exc))
            msg += "\n\tworkid=" + str(workid)
            msg += "\n\ttask=" + str(task['tm_taskname'])
            msg += "\n" + str(traceback.format_exc())
        finally:
            if msg:
                try:
                    logger.info("Uploading error message to REST: %s", msg)
                    server = HTTPRequests(resthost, WORKER_CONFIG.TaskWorker.cmscert,
                                          WORKER_CONFIG.TaskWorker.cmskey, retry=20, logger=logger)
                    truncMsg = truncateError(msg)
                    configreq = {'workflow': task['tm_taskname'],
                                 'status': failstatus,
                                 'subresource': 'failure',
                                 # limit the message to 7500 chars, which means no more than 10000
                                 # once encoded. That's the limit in the REST.
                                 'failure': b64encode(truncMsg)}
                    server.post(resturi, data=urllib.urlencode(configreq))
                    logger.info("Error message successfully uploaded to the REST")
                except HTTPException as hte:
                    logger.warning("Cannot upload failure message to the REST for workflow %s. HTTP headers follow:",
                                   task['tm_taskname'])
                    logger.error(hte.headers)
                except Exception as exc:  # pylint: disable=broad-except
                    logger.warning("Cannot upload failure message to the REST for workflow %s.\nReason: %s",
                                   task['tm_taskname'], exc)
                    logger.exception('Traceback follows:')
        t1 = time.time()
        logger.debug("%s: ...work on %s completed in %d seconds: %s",
                     procName, task['tm_taskname'], t1 - t0, outputs)
        try:
            out, _, _ = executeCommand("ps u -p %s | awk '{sum=sum+$6}; END {print sum/1024}'" % os.getpid())
            msg = "RSS after finishing %s: %s MB" % (task['tm_taskname'], out.strip())
            logger.debug(msg)
        except Exception:
            logger.exception("Problem getting worker RSS:")
        removeTaskLogHandler(logger, taskhandler)

        results.put({'workid': workid, 'out': outputs})
def formatOutput(self, task, requestname, datasetfiles, locations, tempDir):
    """
    Receives as input the result of the data location
    discovery operations and fills up the WMCore objects.
    """
    self.logger.debug(" Formatting data discovery output ")
    # TEMPORARY
    pnn_psn_map = {}
    sbj = SiteDBJSON({"key": self.config.TaskWorker.cmskey,
                      "cert": self.config.TaskWorker.cmscert})

    wmfiles = []
    event_counter = 0
    lumi_counter = 0
    uniquelumis = set()
    datasetLumis = {}
    ## Loop over the sorted list of files.
    for lfn, infos in datasetfiles.iteritems():
        ## Skip the file if the block has not been found or has no locations.
        if not infos['BlockName'] in locations or not locations[infos['BlockName']]:
            self.logger.warning("Skipping %s because its block (%s) has no locations" % (lfn, infos['BlockName']))
            continue
        ## Skip the file if it is not in VALID state.
        if not infos.get('ValidFile', True):
            self.logger.warning("Skipping invalid file %s" % lfn)
            continue
        if task['tm_use_parent'] == 1 and len(infos['Parents']) == 0:
            raise TaskWorkerException(
                "The CRAB3 server backend refuses to submit jobs to the Grid scheduler\n" +
                "because you specified useParents=True but some of your files have no" +
                " parents.\nExample: " + lfn)
        ## Create a WMCore File object.
        try:
            size = infos['FileSize']
            checksums = {'Checksum': infos['Checksum'], 'Adler32': infos['Adler32'], 'Md5': infos['Md5']}
        except KeyError:
            # This is so that the task worker does not crash if an old version of WMCore is used
            # (the interface of an API suddenly changed). We may want to remove the try/except and
            # the following two lines eventually, but keep them for the moment so other devels won't be affected.
            # See this WMCore commit: https://github.com/dmwm/WMCore/commit/2afc01ae571390f5fa009dd258be757adac89c28#diff-374b7a6640288184175057234e393e1cL204
            size = infos['Size']
            checksums = infos['Checksums']
        wmfile = File(lfn=lfn, events=infos['NumberOfEvents'], size=size, checksums=checksums, parents=infos['Parents'])
        wmfile['block'] = infos['BlockName']
        wmfile['locations'] = []
        for pnn in locations[infos['BlockName']]:
            if pnn and pnn not in pnn_psn_map:
                self.logger.debug("Translating PNN %s" % pnn)
                try:
                    pnn_psn_map[pnn] = sbj.PNNtoPSN(pnn)
                except KeyError:
                    self.logger.error("Impossible to translate %s to a CMS name through SiteDB" % pnn)
                    pnn_psn_map[pnn] = ''
                except httplib.HTTPException as ex:
                    self.logger.error("Couldn't map SE to site: %s" % pnn)
                    print("Couldn't map SE to site: %s" % pnn)
                    print("got problem: %s" % ex)
                    print("got another problem: %s" % ex.__dict__)
            if pnn and pnn in pnn_psn_map:
                if isinstance(pnn_psn_map[pnn], list):
                    wmfile['locations'].extend(pnn_psn_map[pnn])
                else:
                    wmfile['locations'].append(pnn_psn_map[pnn])
        wmfile['workflow'] = requestname
        event_counter += infos['NumberOfEvents']
        for run, lumis in infos['Lumis'].iteritems():
            datasetLumis.setdefault(run, []).extend(lumis)
            wmfile.addRun(Run(run, *lumis))
            for lumi in lumis:
                uniquelumis.add((run, lumi))
            lumi_counter += len(lumis)
        wmfiles.append(wmfile)

    uniquelumis = len(uniquelumis)
    self.logger.debug('Tot events found: %d' % event_counter)
    self.logger.debug('Tot lumis found: %d' % uniquelumis)
    self.logger.debug('Duplicate lumis found: %d' % (lumi_counter - uniquelumis))
    self.logger.debug('Tot files found: %d' % len(wmfiles))

    self.logger.debug("Starting to create compact lumilists for input dataset")
    datasetLumiList = LumiList(runsAndLumis=datasetLumis)
    datasetLumis = datasetLumiList.getCompactList()
    datasetDuplicateLumis = datasetLumiList.getDuplicates().getCompactList()
    self.logger.debug("Finished creating compact lumilists for input dataset")

    with open(os.path.join(tempDir, "input_dataset_lumis.json"), "w") as fd:
        json.dump(datasetLumis, fd)
    with open(os.path.join(tempDir, "input_dataset_duplicate_lumis.json"), "w") as fd:
        json.dump(datasetDuplicateLumis, fd)

    return Result(task=task, result=Fileset(name='FilesToSplit', files=set(wmfiles)))
def processWorkerLoop(inputs, results, resthost, resturi, procnum, logger, logsDir):
    procName = "Process-%s" % procnum
    while True:
        try:
            ## Get (and remove) an item from the input queue. If the queue is empty, wait
            ## until an item is available.
            workid, work, task, failstatus, inputargs = inputs.get()
            if work == 'STOP':
                break
            taskhandler = addTaskLogHandler(logger, task['tm_username'], task['tm_taskname'], logsDir)
        except (EOFError, IOError):
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logger.error(crashMessage)
            break

        outputs = None
        t0 = time.time()
        logger.debug("%s: Starting %s on %s", procName, str(work), task['tm_taskname'])
        try:
            msg = None
            outputs = work(resthost, resturi, WORKER_CONFIG, task, procnum, inputargs)
        except TapeDatasetException as tde:
            outputs = Result(task=task, err=str(tde))
        except WorkerHandlerException as we:
            outputs = Result(task=task, err=str(we))
            msg = str(we)
        except Exception as exc:  # pylint: disable=broad-except
            outputs = Result(task=task, err=str(exc))
            msg = "%s: I just had a failure for %s" % (procName, str(exc))
            msg += "\n\tworkid=" + str(workid)
            msg += "\n\ttask=" + str(task['tm_taskname'])
            msg += "\n" + str(traceback.format_exc())
        finally:
            if msg:
                server = HTTPRequests(resthost, WORKER_CONFIG.TaskWorker.cmscert,
                                      WORKER_CONFIG.TaskWorker.cmskey, retry=20, logger=logger)
                failTask(task['tm_taskname'], server, resturi, msg, logger, failstatus)
        t1 = time.time()
        logger.debug("%s: ...work on %s completed in %d seconds: %s",
                     procName, task['tm_taskname'], t1 - t0, outputs)
        try:
            out, _, _ = executeCommand("ps u -p %s | awk '{sum=sum+$6}; END {print sum/1024}'" % os.getpid())
            msg = "RSS after finishing %s: %s MB" % (task['tm_taskname'], out.strip())
            logger.debug(msg)
        except Exception:
            logger.exception("Problem getting worker RSS:")
        removeTaskLogHandler(logger, taskhandler)

        results.put({'workid': workid, 'out': outputs})
def formatOutput(self, task, requestname, datasetfiles, locations, tempDir):
    """
    Receives as input the result of the data location
    discovery operations and fills up the WMCore objects.
    """
    self.logger.debug(" Formatting data discovery output ")

    wmfiles = []
    event_counter = 0
    lumi_counter = 0
    uniquelumis = set()
    datasetLumis = {}
    blocksWithNoLocations = set()
    ## Loop over the sorted list of files.
    configDict = {"cacheduration": 1, "pycurl": True}  # cache duration is in hours
    with tempSetLogLevel(logger=self.logger, level=logging.ERROR):
        resourceCatalog = CRIC(logger=self.logger, configDict=configDict)
    # can't afford one message from CRIC per file, unless critical!
    with tempSetLogLevel(logger=self.logger, level=logging.ERROR):
        for lfn, infos in datasetfiles.iteritems():
            ## Skip the file if it is not in VALID state.
            if not infos.get('ValidFile', True):
                self.logger.warning("Skipping invalid file %s", lfn)
                continue
            ## Skip the file if the block has not been found or has no locations.
            if not infos['BlockName'] in locations or not locations[infos['BlockName']]:
                self.logger.warning("Skipping %s because its block (%s) has no locations", lfn, infos['BlockName'])
                blocksWithNoLocations.add(infos['BlockName'])
                continue
            if task['tm_use_parent'] == 1 and len(infos['Parents']) == 0:
                self.logger.warning("Skipping %s because it has no parents", lfn)
                continue
            ## Create a WMCore File object.
            size = infos['FileSize']
            checksums = {'Checksum': infos['Checksum'], 'Adler32': infos['Adler32'], 'Md5': infos['Md5']}
            wmfile = File(lfn=lfn, events=infos['NumberOfEvents'], size=size, checksums=checksums, parents=infos['Parents'])
            wmfile['block'] = infos['BlockName']
            try:
                wmfile['locations'] = resourceCatalog.PNNstoPSNs(locations[wmfile['block']])
            except Exception as ex:
                self.logger.error("Impossible to translate %s to a CMS name through the CMS Resource Catalog",
                                  locations[wmfile['block']])
                self.logger.error("got this exception:\n %s", ex)
                raise
            wmfile['workflow'] = requestname
            event_counter += infos['NumberOfEvents']
            for run, lumis in infos['Lumis'].iteritems():
                datasetLumis.setdefault(run, []).extend(lumis)
                wmfile.addRun(Run(run, *lumis))
                for lumi in lumis:
                    uniquelumis.add((run, lumi))
                lumi_counter += len(lumis)
            wmfiles.append(wmfile)

    if blocksWithNoLocations:
        msg = "%d blocks will be skipped because they are not completely replicated on DISK: %s" % (
            len(blocksWithNoLocations), list(blocksWithNoLocations))
        self.logger.warning(msg)
        self.uploadWarning(msg, task['user_proxy'], task['tm_taskname'])

    uniquelumis = len(uniquelumis)
    self.logger.debug('Tot events found: %d', event_counter)
    self.logger.debug('Tot lumis found: %d', uniquelumis)
    self.logger.debug('Duplicate lumis found: %d', (lumi_counter - uniquelumis))
    self.logger.debug('Tot files found: %d', len(wmfiles))

    self.logger.debug("Starting to create compact lumilists for input dataset")
    datasetLumiList = LumiList(runsAndLumis=datasetLumis)
    datasetLumis = datasetLumiList.getCompactList()
    datasetDuplicateLumis = datasetLumiList.getDuplicates().getCompactList()
    self.logger.debug("Finished creating compact lumilists for input dataset")

    with open(os.path.join(tempDir, "input_dataset_lumis.json"), "w") as fd:
        json.dump(datasetLumis, fd)
    with open(os.path.join(tempDir, "input_dataset_duplicate_lumis.json"), "w") as fd:
        json.dump(datasetDuplicateLumis, fd)

    return Result(task=task, result=Fileset(name='FilesToSplit', files=set(wmfiles)))
def processWorker(inputs, results, resthost, resturi, procnum):
    """Wait for a reference to appear in the input queue, call the referenced
    object and write the output in the output queue.

    :arg Queue inputs: the queue where the inputs are shared by the master
    :arg Queue results: the queue where this method writes the output
    :return: zero by default, but the value is not really needed."""
    logger = setProcessLogger(str(procnum))
    logger.info("Process %s is starting. PID %s", procnum, os.getpid())
    procName = "Process-%s" % procnum
    while True:
        try:
            ## Get (and remove) an item from the input queue. If the queue is empty, wait
            ## until an item is available.
            workid, work, task, failstatus, inputargs = inputs.get()
        except (EOFError, IOError):
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logger.error(crashMessage)
            break
        if work == 'STOP':
            break

        outputs = None
        t0 = time.time()
        logger.debug("%s: Starting %s on %s" % (procName, str(work), task['tm_taskname']))
        try:
            msg = None
            outputs = work(resthost, resturi, WORKER_CONFIG, task, procnum, inputargs)
        except WorkerHandlerException as we:
            outputs = Result(task=task, err=str(we))
            msg = str(we)
        except Exception as exc:
            outputs = Result(task=task, err=str(exc))
            msg = "%s: I just had a failure for %s" % (procName, str(exc))
            msg += "\n\tworkid=" + str(workid)
            msg += "\n\ttask=" + str(task['tm_taskname'])
            msg += "\n" + str(traceback.format_exc())
        finally:
            if msg:
                try:
                    logger.info("Uploading error message to REST: %s" % msg)
                    server = HTTPRequests(resthost, WORKER_CONFIG.TaskWorker.cmscert,
                                          WORKER_CONFIG.TaskWorker.cmskey, retry=2)
                    truncMsg = truncateError(msg)
                    configreq = {'workflow': task['tm_taskname'],
                                 'status': failstatus,
                                 'subresource': 'failure',
                                 # limit the message to 7500 chars, which means no more than 10000
                                 # once encoded. That's the limit in the REST.
                                 'failure': b64encode(truncMsg)}
                    server.post(resturi, data=urllib.urlencode(configreq))
                    logger.info("Error message successfully uploaded to the REST")
                except HTTPException as hte:
                    logger.warning("Cannot upload failure message to the REST for workflow %s. HTTP headers follow:"
                                   % task['tm_taskname'])
                    logger.error(hte.headers)
                except Exception as exc:
                    logger.warning("Cannot upload failure message to the REST for workflow %s.\nReason: %s"
                                   % (task['tm_taskname'], exc))
                    logger.exception('Traceback follows:')
        t1 = time.time()
        logger.debug("%s: ...work on %s completed in %d seconds: %s"
                     % (procName, task['tm_taskname'], t1 - t0, outputs))

        results.put({'workid': workid, 'out': outputs})

    logger.debug("Slave %s exiting." % procnum)
    return 0
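# A minimal, self-contained sketch (not TaskWorker code) of the worker-loop
# pattern used above: a process consumes (workid, work, args) tuples from an
# input queue, stops on a 'STOP' sentinel, and reports per-item results on an
# output queue. All names here are illustrative.
import multiprocessing

def worker(inputs, results):
    while True:
        workid, work, args = inputs.get()
        if work == 'STOP':
            break
        try:
            out = work(*args)
        except Exception as exc:  # never let one bad item kill the worker
            out = 'error: %s' % exc
        results.put({'workid': workid, 'out': out})

if __name__ == '__main__':
    inputs, results = multiprocessing.Queue(), multiprocessing.Queue()
    proc = multiprocessing.Process(target=worker, args=(inputs, results))
    proc.start()
    inputs.put((1, pow, (2, 10)))
    inputs.put((None, 'STOP', None))
    print(results.get())  # {'workid': 1, 'out': 1024}
    proc.join()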
def formatOutput(self, task, requestname, datasetfiles, locations, tempDir):
    """
    Receives as input the result of the data location
    discovery operations and fills up the WMCore objects.
    """
    self.logger.debug(" Formatting data discovery output ")

    wmfiles = []
    event_counter = 0
    lumi_counter = 0
    uniquelumis = set()
    datasetLumis = {}
    blocksWithNoLocations = set()
    ## Loop over the sorted list of files.
    configDict = {"cacheduration": 1, "pycurl": True}  # cache duration is in hours
    with tempSetLogLevel(logger=self.logger, level=logging.ERROR):
        resourceCatalog = CRIC(logger=self.logger, configDict=configDict)
    # can't afford one message from CRIC per file, unless critical!
    with tempSetLogLevel(logger=self.logger, level=logging.ERROR):
        for lfn, infos in datasetfiles.iteritems():
            ## Skip the file if it is not in VALID state.
            if not infos.get('ValidFile', True):
                self.logger.warning("Skipping invalid file %s", lfn)
                continue
            ## Skip the file if the block has not been found or has no locations.
            if not infos['BlockName'] in locations or not locations[infos['BlockName']]:
                self.logger.warning("Skipping %s because its block (%s) has no locations", lfn, infos['BlockName'])
                blocksWithNoLocations.add(infos['BlockName'])
                continue
            if task['tm_use_parent'] == 1 and len(infos['Parents']) == 0:
                raise TaskWorkerException(
                    "The CRAB3 server backend refuses to submit jobs to the Grid scheduler\n" +
                    "because you specified useParents=True but some of your files have no" +
                    " parents.\nExample: " + lfn)
            ## Create a WMCore File object.
            try:
                size = infos['FileSize']
                checksums = {'Checksum': infos['Checksum'], 'Adler32': infos['Adler32'], 'Md5': infos['Md5']}
            except KeyError:
                # This is so that the task worker does not crash if an old version of WMCore is used
                # (the interface of an API suddenly changed). We may want to remove the try/except and
                # the following two lines eventually, but keep them for the moment so other devels won't be affected.
                # See this WMCore commit: https://github.com/dmwm/WMCore/commit/2afc01ae571390f5fa009dd258be757adac89c28#diff-374b7a6640288184175057234e393e1cL204
                size = infos['Size']
                checksums = infos['Checksums']
            wmfile = File(lfn=lfn, events=infos['NumberOfEvents'], size=size, checksums=checksums, parents=infos['Parents'])
            wmfile['block'] = infos['BlockName']
            try:
                wmfile['locations'] = resourceCatalog.PNNstoPSNs(locations[wmfile['block']])
            except Exception as ex:
                self.logger.error("Impossible to translate %s to a CMS name through the CMS Resource Catalog",
                                  locations[wmfile['block']])
                self.logger.error("got this exception:\n %s", ex)
                raise
            wmfile['workflow'] = requestname
            event_counter += infos['NumberOfEvents']
            for run, lumis in infos['Lumis'].iteritems():
                datasetLumis.setdefault(run, []).extend(lumis)
                wmfile.addRun(Run(run, *lumis))
                for lumi in lumis:
                    uniquelumis.add((run, lumi))
                lumi_counter += len(lumis)
            wmfiles.append(wmfile)

    if blocksWithNoLocations:
        msg = "%d blocks will be skipped because they are not completely replicated on DISK: %s" % (
            len(blocksWithNoLocations), list(blocksWithNoLocations))
        self.logger.warning(msg)
        self.uploadWarning(msg, task['user_proxy'], task['tm_taskname'])

    uniquelumis = len(uniquelumis)
    self.logger.debug('Tot events found: %d', event_counter)
    self.logger.debug('Tot lumis found: %d', uniquelumis)
    self.logger.debug('Duplicate lumis found: %d', (lumi_counter - uniquelumis))
    self.logger.debug('Tot files found: %d', len(wmfiles))

    self.logger.debug("Starting to create compact lumilists for input dataset")
    datasetLumiList = LumiList(runsAndLumis=datasetLumis)
    datasetLumis = datasetLumiList.getCompactList()
    datasetDuplicateLumis = datasetLumiList.getDuplicates().getCompactList()
    self.logger.debug("Finished creating compact lumilists for input dataset")

    with open(os.path.join(tempDir, "input_dataset_lumis.json"), "w") as fd:
        json.dump(datasetLumis, fd)
    with open(os.path.join(tempDir, "input_dataset_duplicate_lumis.json"), "w") as fd:
        json.dump(datasetDuplicateLumis, fd)

    return Result(task=task, result=Fileset(name='FilesToSplit', files=set(wmfiles)))
def execute(self, *args, **kwargs):
    wmwork = Workflow(name=kwargs['task']['tm_taskname'])
    wmsubs = Subscription(fileset=args[0], workflow=wmwork,
                          split_algo=kwargs['task']['tm_split_algo'],
                          type=self.jobtypeMapper[kwargs['task']['tm_job_type']])
    splitter = SplitterFactory()
    jobfactory = splitter(subscription=wmsubs)
    splitparam = kwargs['task']['tm_split_args']
    splitparam['algorithm'] = kwargs['task']['tm_split_algo']
    if kwargs['task']['tm_job_type'] == 'Analysis':
        if kwargs['task']['tm_split_algo'] == 'FileBased':
            splitparam['total_files'] = kwargs['task']['tm_totalunits']
        elif kwargs['task']['tm_split_algo'] == 'LumiBased':
            splitparam['total_lumis'] = kwargs['task']['tm_totalunits']
        elif kwargs['task']['tm_split_algo'] == 'EventAwareLumiBased':
            splitparam['total_events'] = kwargs['task']['tm_totalunits']
    elif kwargs['task']['tm_job_type'] == 'PrivateMC':
        if 'tm_events_per_lumi' in kwargs['task'] and kwargs['task']['tm_events_per_lumi']:
            splitparam['events_per_lumi'] = kwargs['task']['tm_events_per_lumi']
        if 'tm_generator' in kwargs['task'] and kwargs['task']['tm_generator'] == 'lhe':
            splitparam['lheInputFiles'] = True
    splitparam['applyLumiCorrection'] = True
    factory = jobfactory(**splitparam)
    numJobs = sum([len(jobgroup.getJobs()) for jobgroup in factory])
    maxJobs = getattr(self.config.TaskWorker, 'maxJobsPerTask', 10000)
    if numJobs == 0:
        msg = "The CRAB3 server backend could not submit any job to the Grid scheduler:"
        msg += " Splitting task %s" % (kwargs['task']['tm_taskname'])
        if kwargs['task']['tm_input_dataset']:
            msg += " on dataset %s" % (kwargs['task']['tm_input_dataset'])
        msg += " with %s method does not generate any job" % (kwargs['task']['tm_split_algo'])
        raise TaskWorkerException(msg)
    elif numJobs > maxJobs:
        raise TaskWorkerException("The splitting on your task generated %s jobs. The maximum number of jobs in each task is %s"
                                  % (numJobs, maxJobs))

    # printing duplicated lumis if any
    lumiChecker = getattr(jobfactory, 'lumiChecker', None)
    if lumiChecker and lumiChecker.splitLumiFiles:
        self.logger.warning("The input dataset contains the following duplicated lumis %s"
                            % lumiChecker.splitLumiFiles.keys())
        # TODO use self.uploadWarning
        try:
            userServer = HTTPRequests(self.server['host'], kwargs['task']['user_proxy'], kwargs['task']['user_proxy'])
            configreq = {'subresource': 'addwarning',
                         'workflow': kwargs['task']['tm_taskname'],
                         'warning': b64encode('The CRAB3 server backend detected lumis split across files in the input dataset.'
                                              ' Will apply the necessary corrections in the splitting algorithms. You can ignore this message.')}
            userServer.post(self.restURInoAPI + '/task', data=urllib.urlencode(configreq))
        except HTTPException as hte:
            self.logger.error(hte.headers)
            self.logger.warning("Cannot add warning to REST after finding duplicates")

    return Result(task=kwargs['task'], result=factory)