def executeInternal(self, *args):
    """Run one pre-DAG pass for the automatic-splitting workflow.

    args[0] is the stage ('processing' or 'tail'), args[1] the completion
    threshold (number of completed probe/processing jobs required before we
    proceed), args[2] the prefix used to name the subdag to submit.

    Returns 4 if the "completion" threshold is not reached (caller should
    retry later), 1 on splitting/DAG-creation failure, 0 otherwise.
    """
    self.stage = args[0]
    self.completion = int(args[1])
    self.prefix = args[2]

    self.setupLog()

    self.statusCacheInfo = { }  # Will be filled with the status from the status cache

    self.readJobStatus()
    completed = set(self.completedJobs(stage=self.stage))
    # not enough jobs finished yet: signal the caller to come back later
    if len(completed) < self.completion:
        return 4

    self.readProcessedJobs()
    # jobs completed but not yet handled by a previous pre-DAG pass
    unprocessed = completed - self.processedJobs
    # `estimates` is the job set used below to average throughput/event size
    estimates = copy.copy(unprocessed)
    self.logger.info("jobs remaining to process: %s", ", ".join(sorted(unprocessed)))
    if self.stage == 'tail' and len(estimates - set(self.failedJobs)) == 0:
        # in the tail stage, if every remaining job failed, fall back to the
        # successful processing-stage jobs for the throughput estimate
        estimates = set(self.completedJobs(stage='processing', processFailed=False))
        # NOTE(review): this logs `unprocessed` again, identical to the line
        # above — presumably it was meant to log the new `estimates`; confirm
        self.logger.info("jobs remaining to process: %s", ", ".join(sorted(unprocessed)))

    # The TaskWorker saves some files that now we are gonna read
    with open('datadiscovery.pkl', 'rb') as fd:
        dataset = pickle.load(fd)  # Output from the discovery process
    with open('taskinformation.pkl', 'rb') as fd:
        task = pickle.load(fd)  # A dictionary containing information about the task as in the Oracle DB
    with open('taskworkerconfig.pkl', 'rb') as fd:
        config = pickle.load(fd)  # Task worker configuration

    # need to use user proxy as credential for talking with cmsweb
    config.TaskWorker.cmscert = os.environ.get('X509_USER_PROXY')
    config.TaskWorker.cmskey = os.environ.get('X509_USER_PROXY')
    config.TaskWorker.envForCMSWEB = newX509env(X509_USER_CERT=config.TaskWorker.cmscert,
                                                X509_USER_KEY=config.TaskWorker.cmskey)

    # need to get username from classAd to setup for Rucio access
    task_ad = classad.parseOne(open(os.environ['_CONDOR_JOB_AD']))
    username = task_ad['CRAB_UserHN']
    config.Services.Rucio_account = username

    # need the global black list
    config.TaskWorker.scratchDir = './scratchdir'
    if not os.path.exists(config.TaskWorker.scratchDir):
        os.makedirs(config.TaskWorker.scratchDir)
    from TaskWorker.Actions.Recurring.BanDestinationSites import CRAB3BanDestinationSites
    banSites = CRAB3BanDestinationSites(config, self.logger)
    with config.TaskWorker.envForCMSWEB:
        banSites.execute()

    # Read the automatic_splitting/throughputs/0-N files where the PJ
    # saved the EventThroughput
    # (report['steps']['cmsRun']['performance']['cpu']['EventThroughput'])
    # and the average size of the output per event
    sumEventsThr = 0
    sumEventsSize = 0
    count = 0
    for jid in estimates:
        if jid in self.failedJobs:
            continue
        fn = "automatic_splitting/throughputs/{0}".format(jid)
        with open(fn) as fd:
            throughput, eventsize = json.load(fd)
            sumEventsThr += throughput
            sumEventsSize += eventsize
            count += 1
    # NOTE(review): if every job in `estimates` failed, count stays 0 and this
    # raises ZeroDivisionError — confirm callers guarantee at least one success
    eventsThr = sumEventsThr / count
    eventsSize = sumEventsSize / count

    self.logger.info("average throughput for %s jobs: %s evt/s", count, eventsThr)
    self.logger.info("average eventsize for %s jobs: %s bytes", count, eventsSize)

    # cap per-job output size (default 5 GB) and derive max events per job
    maxSize = getattr(config.TaskWorker, 'automaticOutputSizeMaximum', 5 * 1000**3)
    maxEvents = (maxSize / eventsSize) if eventsSize > 0 else 0

    runtime = task['tm_split_args'].get('minutes_per_job', -1)
    if self.stage == "processing":
        # Build in a 33% error margin in the runtime to not create too
        # many tails. This essentially moves the peak to lower
        # runtimes and cuts off less of the job distribution tail.
        target = int(0.75 * runtime)
    elif self.stage == 'tail':
        # tail jobs get at least `automaticTailRuntimeMinimumMins` minutes,
        # or a fraction of the requested runtime, whichever is larger
        target = int(max(
            getattr(config.TaskWorker, 'automaticTailRuntimeMinimumMins', 45),
            getattr(config.TaskWorker, 'automaticTailRuntimeFraction', 0.2) * runtime))
    # `target` is in minutes, `eventsThr` is in events/second!
    events = int(target * eventsThr * 60)

    if events > maxEvents and maxEvents > 0:
        self.logger.info("reduced the target event count from %s to %s to obey output size",
                         events, maxEvents)
        events = int(maxEvents)

    # re-split the remaining work by events, using the measured throughput
    splitTask = dict(task)
    splitTask['tm_split_algo'] = 'EventAwareLumiBased'
    splitTask['tm_split_args']['events_per_job'] = events

    if self.stage == 'tail' and not self.adjustLumisForCompletion(splitTask, unprocessed):
        self.logger.info("nothing to process for completion")
        self.saveProcessedJobs(unprocessed)
        return 0

    # Disable retries for processing: every lumi is attempted to be
    # processed once in processing, thrice in the tails -> four times.
    # That should be enough "retries"
    #
    # See note in DagmanCreator about getting this from the Task DB
    if self.stage == "processing":
        config.TaskWorker.numAutomJobRetries = 0

    try:
        splitter = Splitter(config, crabserver=None)
        split_result = splitter.execute(dataset, task=splitTask)
        self.logger.info("Splitting results:")
        for g in split_result.result[0]:
            msg = "Created jobgroup with length {0}".format(len(g.getJobs()))
            self.logger.info(msg)
    except TaskWorkerException as e:
        retmsg = "Splitting failed with:\n{0}".format(e)
        self.logger.error(retmsg)
        # self.set_dashboard_state('FAILED')
        return 1

    try:
        # tail subdags are children of the processing subdag named by prefix
        parent = self.prefix if self.stage == 'tail' else None
        rucioClient = getNativeRucioClient(config=config, logger=self.logger)
        creator = DagmanCreator(config, crabserver=None, rucioClient=rucioClient)
        with config.TaskWorker.envForCMSWEB:
            creator.createSubdag(split_result.result, task=task, parent=parent, stage=self.stage)
        self.submitSubdag('RunJobs{0}.subdag'.format(self.prefix),
                          getattr(config.TaskWorker, 'maxIdle', MAX_IDLE_JOBS),
                          getattr(config.TaskWorker, 'maxPost', MAX_POST_JOBS),
                          self.stage)
    except TaskWorkerException as e:
        retmsg = "DAG creation failed with:\n{0}".format(e)
        self.logger.error(retmsg)
        # self.set_dashboard_state('FAILED')
        return 1
    self.saveProcessedJobs(unprocessed)
    return 0
def requestTapeRecall(self, blockList=None, system='Dynamo', msgHead=''):
    """
    :param blockList: a list of blocks to recall from Tape to Disk (None means empty)
    :param system: a string identifying the DDM system to use 'Dynamo' or 'Rucio' or 'None'
    :param msgHead: a string with the initial part of a message to be used for exceptions
    :return: nothing: Since data on tape means no submission possible, this function will always raise
        a TaskWorkerException to stop the action flow. The exception message contains details and an
        attempt is done to upload it to TaskDB so that crab status can report it
    """
    # avoid the shared-mutable-default pitfall: default to a fresh empty list
    blockList = [] if blockList is None else blockList
    msg = msgHead
    if system == 'Rucio':
        # need to use crab_tape_recall Rucio account to create containers and create rules
        tapeRecallConfig = copy.copy(self.config)
        tapeRecallConfig.Services.Rucio_account = 'crab_tape_recall'
        rucioClient = getNativeRucioClient(tapeRecallConfig, self.logger)  # pylint: disable=redefined-outer-name
        # turn input CMS blocks into Rucio dids in cms scope
        dids = [{'scope': 'cms', 'name': block} for block in blockList]
        # prepare container /TapeRecall/taskname/USER in the service scope
        myScope = 'user.crab_tape_recall'
        containerName = '/TapeRecall/%s/USER' % self.taskName.replace(':', '.')
        containerDid = {'scope': myScope, 'name': containerName}
        self.logger.info("Create Rucio container %s", containerName)
        try:
            rucioClient.add_container(myScope, containerName)
        except DataIdentifierAlreadyExists:
            self.logger.debug("Container name already exists in Rucio. Keep going")
        except Exception as ex:
            msg += "Rucio exception creating container: %s" % (str(ex))
            raise TaskWorkerException(msg)
        try:
            rucioClient.attach_dids(myScope, containerName, dids)
        except DuplicateContent:
            self.logger.debug("Some dids are already in this container. Keep going")
        except Exception as ex:
            msg += "Rucio exception adding blocks to container: %s" % (str(ex))
            raise TaskWorkerException(msg)
        self.logger.info("Rucio container %s:%s created with %d blocks", myScope, containerName, len(blockList))

        # Compute size of recall request
        sizeToRecall = 0
        for block in blockList:
            replicas = rucioClient.list_dataset_replicas('cms', block)
            # pick first replica for each block, they better all have same size
            # (next() builtin instead of replicas.next(): works on py2.6+ and py3)
            blockBytes = next(replicas)['bytes']
            sizeToRecall += blockBytes
        TBtoRecall = sizeToRecall // 1e12
        if TBtoRecall > 0:
            self.logger.info("Total size of data to recall : %d TBytes", TBtoRecall)
        else:
            self.logger.info("Total size of data to recall : %d GBytes", sizeToRecall / 1e9)
        if TBtoRecall > 30.:
            grouping = 'DATASET'  # Rucio DATASET i.e. CMS block !
            self.logger.info("Will scatter blocks on multiple sites")
        else:
            grouping = 'ALL'
            self.logger.info("Will place all blocks at a single site")

        # create rule
        RSE_EXPRESSION = 'ddm_quota>0&(tier=1|tier=2)&rse_type=DISK'
        #RSE_EXPRESSION = 'T3_IT_Trieste' # for testing
        WEIGHT = 'ddm_quota'
        #WEIGHT = None # for testing
        LIFETIME = 14 * 24 * 3600  # 14 days
        ASK_APPROVAL = False
        #ASK_APPROVAL = True # for testing
        ACCOUNT = 'crab_tape_recall'
        copies = 1
        try:
            ruleId = rucioClient.add_replication_rule(dids=[containerDid],
                                                      copies=copies, rse_expression=RSE_EXPRESSION,
                                                      grouping=grouping,
                                                      weight=WEIGHT, lifetime=LIFETIME, account=ACCOUNT,
                                                      activity='Analysis Input',
                                                      comment='Staged from tape for %s' % self.username,
                                                      ask_approval=ASK_APPROVAL, asynchronous=True,
                                                      )
        except DuplicateRule:
            # handle "A duplicate rule for this account, did, rse_expression, copies already exists"
            # which should only happen when testing, since container name is unique like task name, anyhow...
            self.logger.debug("A duplicate rule for this account, did, rse_expression, copies already exists. Use that")
            # find the existing rule id: list_did_rules returns an iterator of rule
            # dicts (not a subscriptable list of ids), so extract the 'id' of the
            # first rule and keep the one-element-list shape expected below
            ruleId = [next(rucioClient.list_did_rules(myScope, containerName))['id']]
        except (InsufficientTargetRSEs, InsufficientAccountLimit, FullStorage) as ex:
            msg = "Not enough global quota to issue a tape recall request. Rucio exception:\n%s" % str(ex)
            raise TaskWorkerException(msg)
        except Exception as ex:
            msg += "Rucio exception creating rule: %s" % str(ex)
            raise TaskWorkerException(msg)
        ruleId = str(ruleId[0])  # from list to singleId and remove unicode

        msg += "\nA disk replica has been requested to Rucio (rule ID: %s )" % ruleId
        msg += "\nyou can check progress via either of the following two commands:"
        msg += "\n rucio rule-info %s" % ruleId
        msg += "\n rucio list-rules %s:%s" % (myScope, containerName)

        automaticTapeRecallIsImplemented = True
        if automaticTapeRecallIsImplemented:
            tapeRecallStatus = 'TAPERECALL'
        else:
            tapeRecallStatus = 'SUBMITFAILED'
        # record the rule id in the Task DB so `crab status` (and the recurring
        # TapeRecallStatus action) can track the recall
        configreq = {'workflow': self.taskName,
                     'taskstatus': tapeRecallStatus,
                     'ddmreqid': ruleId,
                     'subresource': 'addddmreqid',
                     }
        try:
            tapeRecallStatusSet = self.crabserver.post(api='task', data=urllib.urlencode(configreq))
        except HTTPException as hte:
            self.logger.exception(hte)
            msg = "HTTP Error while contacting the REST Interface %s:\n%s" % (
                self.config.TaskWorker.restHost, str(hte))
            msg += "\nStoring of %s status and ruleId (%s) failed for task %s" % (
                tapeRecallStatus, ruleId, self.taskName)
            msg += "\nHTTP Headers are: %s" % hte.headers
            raise TaskWorkerException(msg, retry=True)
        if tapeRecallStatusSet[2] == "OK":
            self.logger.info("Status for task %s set to '%s'", self.taskName, tapeRecallStatus)
        if automaticTapeRecallIsImplemented:
            msg += "\nThis task will be automatically submitted as soon as the stage-out is completed."
            self.uploadWarning(msg, self.userproxy, self.taskName)
            raise TapeDatasetException(msg)
        # fall here if could not setup for automatic submission after recall
        msg += "\nPlease monitor recall progress via Rucio or DAS and try again once data are on disk."
        raise TaskWorkerException(msg)

    if system == 'None':
        msg += '\nIt is not possible to request a recall from tape.'
        msg += "\nPlease, check DAS (https://cmsweb.cern.ch/das) and make sure the dataset is accessible on DISK."
        raise TaskWorkerException(msg)

    if system == 'Dynamo':
        raise NotImplementedError
def _execute(self, config, task):
    """Recurring action: poll Rucio for every task in TAPERECALL status.

    For each such task: fail it if the recall is too old, keep its crabcache
    sandbox alive (pre-S3 caches only), then check the Rucio replication rule;
    when the rule is OK the task is set back to NEW, when the rule expired the
    task is FAILED.
    """
    # setup logger
    if not self.logger:
        self.logger = logging.getLogger(__name__)
        handler = logging.StreamHandler(sys.stdout)  # pylint: disable=redefined-outer-name
        formatter = logging.Formatter("%(asctime)s:%(levelname)s:%(module)s %(message)s")  # pylint: disable=redefined-outer-name
        handler.setFormatter(formatter)
        self.logger.addHandler(handler)
        self.logger.setLevel(logging.DEBUG)
    else:
        # do not use BaseRecurringAction logger but create a new logger
        # which writes to config.TaskWorker.logsDir/taks/recurring/TapeRecallStatus_YYMMDD-HHMM.log
        self.logger = logging.getLogger('TapeRecallStatus')
        logDir = config.TaskWorker.logsDir + '/tasks/recurring/'
        if not os.path.exists(logDir):
            os.makedirs(logDir)
        timeStamp = time.strftime('%y%m%d-%H%M', time.localtime())
        logFile = 'TapeRecallStatus_' + timeStamp + '.log'
        handler = logging.FileHandler(logDir + logFile)
        formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(module)s:%(message)s')
        handler.setFormatter(formatter)
        self.logger.addHandler(handler)

    mw = MasterWorker(config, logWarning=False, logDebug=False, sequential=True,
                      console=False, name='masterForTapeRecall')

    tapeRecallStatus = 'TAPERECALL'
    self.logger.info("Retrieving %s tasks", tapeRecallStatus)
    recallingTasks = mw.getWork(limit=999999, getstatus=tapeRecallStatus, ignoreTWName=True)
    if not recallingTasks:
        self.logger.info("No %s task retrieved.", tapeRecallStatus)
        return

    self.logger.info("Retrieved a total of %d %s tasks", len(recallingTasks), tapeRecallStatus)
    crabserver = mw.crabserver
    for recallingTask in recallingTasks:
        taskName = recallingTask['tm_taskname']
        self.logger.info("Working on task %s", taskName)

        reqId = recallingTask['tm_DDM_reqid']
        if not reqId:
            self.logger.debug("tm_DDM_reqid' is not defined for task %s, skipping such task", taskName)
            continue
        msg = "Task points to Rucio RuleId: %s " % reqId
        self.logger.info(msg)

        # give up on recalls that have been pending for too long
        if (time.time() - getTimeFromTaskname(str(taskName))) > MAX_DAYS_FOR_TAPERECALL * 24 * 60 * 60:
            self.logger.info("Task %s is older than %d days, setting its status to FAILED",
                             taskName, MAX_DAYS_FOR_TAPERECALL)
            msg = "The disk replica request (ID: %s) for the input dataset did not complete in %d days." % (
                reqId, MAX_DAYS_FOR_TAPERECALL)
            failTask(taskName, crabserver, msg, self.logger, 'FAILED')
            continue

        # FIX: initialize before the cache-type branch so that S3-cache tasks
        # (which skip the MyProxyLogon block) do not hit an unbound name below
        mpl = None
        user_proxy = False
        if 'S3' not in recallingTask['tm_cache_url'].upper():
            # when using old crabcache had to worry about sandbox purging after 3 days
            mpl = MyProxyLogon(config=config, crabserver=crabserver, myproxylen=self.pollingTime)
            user_proxy = True
            try:
                mpl.execute(task=recallingTask)  # this adds 'user_proxy' to recallingTask
            except TaskWorkerException as twe:
                user_proxy = False
                self.logger.exception(twe)
            # Make sure the task sandbox in the crabcache is not deleted until the tape recall is completed
            if user_proxy:
                self.refreshSandbox(recallingTask)

        # Retrieve status of recall request
        if not self.rucioClient:
            self.rucioClient = getNativeRucioClient(config=config, logger=self.logger)
        try:
            ddmRequest = self.rucioClient.get_replication_rule(reqId)
        except RuleNotFound:
            msg = "Rucio rule id %s not found. Please report to experts" % reqId
            self.logger.error(msg)
            if user_proxy:
                mpl.uploadWarning(msg, recallingTask['user_proxy'], taskName)
            # FIX: without this continue, the code below would read the unbound
            # (or stale) ddmRequest and crash / act on the wrong rule
            continue

        if ddmRequest['state'] == 'OK':
            self.logger.info("Request %s is completed, setting status of task %s to NEW", reqId, taskName)
            mw.updateWork(taskName, recallingTask['tm_task_command'], 'NEW')
            # Delete all task warnings (the tapeRecallStatus added a dataset warning which is no longer valid now)
            if user_proxy:
                mpl.deleteWarnings(recallingTask['user_proxy'], taskName)
        else:
            expiration = ddmRequest['expires_at']  # this is a datetime.datetime object
            if expiration < datetime.datetime.now():
                # give up waiting
                msg = ("Replication request %s for task %s expired. Setting its status to FAILED" % (
                    reqId, taskName))
                self.logger.info(msg)
                failTask(taskName, crabserver, msg, self.logger, 'FAILED')
#config.TaskWorker.cmscert = os.environ["X509_USER_PROXY"] #config.TaskWorker.cmskey = os.environ["X509_USER_PROXY"] # will user service cert as defined for TW config.TaskWorker.cmscert = os.environ["X509_USER_CERT"] config.TaskWorker.cmskey = os.environ["X509_USER_KEY"] config.TaskWorker.envForCMSWEB = newX509env(X509_USER_CERT=config.TaskWorker.cmscert, X509_USER_KEY=config.TaskWorker.cmskey) config.TaskWorker.instance = 'prod' config.Services.Rucio_host = 'https://cms-rucio.cern.ch' config.Services.Rucio_account = 'crab_server' config.Services.Rucio_authUrl = 'https://cms-rucio-auth.cern.ch' config.Services.Rucio_caPath = '/etc/grid-security/certificates/' rucioClient = getNativeRucioClient(config=config, logger=logging.getLogger()) fileset = DBSDataDiscovery(config=config, rucioClient=rucioClient) fileset.execute(task={'tm_nonvalid_input_dataset': 'T', 'tm_use_parent': 0, 'user_proxy': 'None', 'tm_input_dataset': dbsDataset, 'tm_secondary_input_dataset': dbsSecondaryDataset, 'tm_taskname': 'pippo1', 'tm_username':config.Services.Rucio_account, 'tm_split_algo' : 'automatic', 'tm_split_args' : {'runs':[], 'lumis':[]}, 'tm_dbs_url': DBSUrl}, tempDir='') #=============================================================================== # Some interesting datasets for testing # dataset = '/DoubleMuon/Run2018B-PromptReco-v2/AOD' # on tape # dataset = '/DoubleMuon/Run2018B-02Apr2020-v1/NANOAOD' # isNano # dataset = '/DoubleMuon/Run2018B-17Sep2018-v1/MINIAOD' # parent of above NANOAOD (for secondaryDataset lookup) # dataset = '/MuonEG/Run2016B-07Aug17_ver2-v1/AOD' # no Nano on disk (at least atm) # dataset = '/MuonEG/Run2016B-v1/RAW' # on tape
def handleNewTask(resthost, dbInstance, config, task, procnum, *args, **kwargs):
    """Build and run the action pipeline that injects a brand new task.

    :arg str resthost: the hostname where the rest interface is running
    :arg str dbInstance: the rest base url to contact
    :arg WMCore.Configuration config: input configuration
    :arg TaskWorker.DataObjects.Task task: the task to work on
    :arg int procnum: the process number taking care of the work
    :*args and *kwargs: extra parameters currently not defined
    :return: the handler."""
    # REST client shared by every action in the pipeline
    restClient = CRABRest(resthost, config.TaskWorker.cmscert, config.TaskWorker.cmskey,
                          retry=20, logger=logging.getLogger(str(procnum)),
                          userAgent='CRABTaskWorker', version=__version__)
    restClient.setDbInstance(dbInstance)

    taskHandler = TaskHandler(task, procnum, restClient, config, 'handleNewTask', createTempDir=True)
    rucio = getNativeRucioClient(config=config, logger=taskHandler.logger)

    # actions run in the order they are added
    taskHandler.addWork(MyProxyLogon(config=config, crabserver=restClient, procnum=procnum,
                                     myproxylen=60 * 60 * 24))
    taskHandler.addWork(StageoutCheck(config=config, crabserver=restClient, procnum=procnum,
                                      rucioClient=rucio))

    # pick the data-discovery step matching the task flavour
    if task['tm_job_type'] == 'Analysis':
        if task.get('tm_user_files'):
            discovery = UserDataDiscovery(config=config, crabserver=restClient, procnum=procnum)
        else:
            discovery = DBSDataDiscovery(config=config, crabserver=restClient, procnum=procnum,
                                         rucioClient=rucio)
        taskHandler.addWork(discovery)
    elif task['tm_job_type'] == 'PrivateMC':
        taskHandler.addWork(MakeFakeFileSet(config=config, crabserver=restClient, procnum=procnum))

    taskHandler.addWork(Splitter(config=config, crabserver=restClient, procnum=procnum))
    taskHandler.addWork(DagmanCreator(config=config, crabserver=restClient, procnum=procnum,
                                      rucioClient=rucio))

    # dry runs only upload the DAG; real runs submit it
    if task['tm_dry_run'] == 'T':
        taskHandler.addWork(DryRunUploader(config=config, crabserver=restClient, procnum=procnum))
    else:
        taskHandler.addWork(DagmanSubmitter(config=config, crabserver=restClient, procnum=procnum))

    return taskHandler.actionWork(args, kwargs)