Example #1
    def executeInternal(self, *args):
        """The executeInternal method return 4 if the "completion" threshold is not reached, 0 otherwise"""
        self.stage = args[0]
        self.completion = int(args[1])
        self.prefix = args[2]

        self.setupLog()

        self.statusCacheInfo = {}  # will be filled with the status from the status cache

        self.readJobStatus()
        completed = set(self.completedJobs(stage=self.stage))
        if len(completed) < self.completion:
            return 4

        self.readProcessedJobs()
        unprocessed = completed - self.processedJobs
        estimates = copy.copy(unprocessed)
        self.logger.info("jobs remaining to process: %s",
                         ", ".join(sorted(unprocessed)))
        if self.stage == 'tail' and len(estimates - set(self.failedJobs)) == 0:
            estimates = set(
                self.completedJobs(stage='processing', processFailed=False))
        self.logger.info("jobs remaining to process: %s",
                         ", ".join(sorted(unprocessed)))

        # The TaskWorker saves some files that we now read back
        with open('datadiscovery.pkl', 'rb') as fd:
            dataset = pickle.load(fd)  #Output from the discovery process
        with open('taskinformation.pkl', 'rb') as fd:
            task = pickle.load(fd)  # a dictionary with the task information, as stored in the Oracle DB
        with open('taskworkerconfig.pkl', 'rb') as fd:
            config = pickle.load(fd)  #Task worker configuration

        # need to use user proxy as credential for talking with cmsweb
        config.TaskWorker.cmscert = os.environ.get('X509_USER_PROXY')
        config.TaskWorker.cmskey = os.environ.get('X509_USER_PROXY')
        config.TaskWorker.envForCMSWEB = newX509env(
            X509_USER_CERT=config.TaskWorker.cmscert,
            X509_USER_KEY=config.TaskWorker.cmskey)

        # need to get username from classAd to setup for Rucio access
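        # (the file pointed to by _CONDOR_JOB_AD is a plain-text ClassAd; it is assumed to
        #  contain a line like: CRAB_UserHN = "jdoe" -- the value here is purely illustrative)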
        task_ad = classad.parseOne(open(os.environ['_CONDOR_JOB_AD']))
        username = task_ad['CRAB_UserHN']
        config.Services.Rucio_account = username

        # need the global black list
        config.TaskWorker.scratchDir = './scratchdir'
        if not os.path.exists(config.TaskWorker.scratchDir):
            os.makedirs(config.TaskWorker.scratchDir)
        from TaskWorker.Actions.Recurring.BanDestinationSites import CRAB3BanDestinationSites
        banSites = CRAB3BanDestinationSites(config, self.logger)
        with config.TaskWorker.envForCMSWEB:
            banSites.execute()

        # Read the automatic_splitting/throughputs/0-N files where the PJ
        # saved the EventThroughput
        # (report['steps']['cmsRun']['performance']['cpu']['EventThroughput'])
        # and the average size of the output per event
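        # Each throughputs file is expected to hold a two-element JSON list,
        # e.g. [25.3, 1500000.0] -> (events/second, bytes/event); values illustrative.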
        sumEventsThr = 0
        sumEventsSize = 0
        count = 0
        for jid in estimates:
            if jid in self.failedJobs:
                continue
            fn = "automatic_splitting/throughputs/{0}".format(jid)
            with open(fn) as fd:
                throughput, eventsize = json.load(fd)
                sumEventsThr += throughput
                sumEventsSize += eventsize
                count += 1
        eventsThr = sumEventsThr / count
        eventsSize = sumEventsSize / count

        self.logger.info("average throughput for %s jobs: %s evt/s", count,
                         eventsThr)
        self.logger.info("average eventsize for %s jobs: %s bytes", count,
                         eventsSize)

        maxSize = getattr(config.TaskWorker, 'automaticOutputSizeMaximum',
                          5 * 1000**3)
        maxEvents = (maxSize / eventsSize) if eventsSize > 0 else 0

        runtime = task['tm_split_args'].get('minutes_per_job', -1)
        if self.stage == "processing":
            # Build in a 33% error margin in the runtime to not create too
            # many tails. This essentially moves the peak to lower
            # runtimes and cuts off less of the job distribution tail.
            target = int(0.75 * runtime)
        elif self.stage == 'tail':
            target = int(
                max(
                    getattr(config.TaskWorker,
                            'automaticTailRuntimeMinimumMins', 45),
                    getattr(config.TaskWorker, 'automaticTailRuntimeFraction',
                            0.2) * runtime))
        # `target` is in minutes, `eventsThr` is in events/second!
        events = int(target * eventsThr * 60)
        if events > maxEvents and maxEvents > 0:
            self.logger.info(
                "reduced the target event count from %s to %s to obey output size",
                events, maxEvents)
            events = int(maxEvents)
        splitTask = dict(task)
        splitTask['tm_split_algo'] = 'EventAwareLumiBased'
        splitTask['tm_split_args']['events_per_job'] = events

        if self.stage == 'tail' and not self.adjustLumisForCompletion(
                splitTask, unprocessed):
            self.logger.info("nothing to process for completion")
            self.saveProcessedJobs(unprocessed)
            return 0

        # Disable retries for processing: every lumi is attempted once in
        # processing and up to three more times in the tail stages, i.e. four
        # times in total. That should be enough "retries".
        #
        # See note in DagmanCreator about getting this from the Task DB
        if self.stage == "processing":
            config.TaskWorker.numAutomJobRetries = 0

        try:
            splitter = Splitter(config, crabserver=None)
            split_result = splitter.execute(dataset, task=splitTask)
            self.logger.info("Splitting results:")
            for g in split_result.result[0]:
                msg = "Created jobgroup with length {0}".format(
                    len(g.getJobs()))
                self.logger.info(msg)
        except TaskWorkerException as e:
            retmsg = "Splitting failed with:\n{0}".format(e)
            self.logger.error(retmsg)
            #            self.set_dashboard_state('FAILED')
            return 1
        try:
            parent = self.prefix if self.stage == 'tail' else None
            rucioClient = getNativeRucioClient(config=config,
                                               logger=self.logger)
            creator = DagmanCreator(config,
                                    crabserver=None,
                                    rucioClient=rucioClient)
            with config.TaskWorker.envForCMSWEB:
                creator.createSubdag(split_result.result,
                                     task=task,
                                     parent=parent,
                                     stage=self.stage)
            self.submitSubdag(
                'RunJobs{0}.subdag'.format(self.prefix),
                getattr(config.TaskWorker, 'maxIdle', MAX_IDLE_JOBS),
                getattr(config.TaskWorker, 'maxPost', MAX_POST_JOBS),
                self.stage)
        except TaskWorkerException as e:
            retmsg = "DAG creation failed with:\n{0}".format(e)
            self.logger.error(retmsg)
            #            self.set_dashboard_state('FAILED')
            return 1
        self.saveProcessedJobs(unprocessed)
        return 0
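
To make the splitting arithmetic above concrete, here is a small standalone sketch of how the target events-per-job value is derived from the measured throughput, the requested runtime and the output-size cap. All numbers are illustrative, not taken from a real task.

# Standalone sketch of the events-per-job arithmetic used above; all numbers are illustrative.
eventsThr = 25.0                  # average throughput measured by the probe jobs, events/second
eventsSize = 1.5e6                # average output size per event, bytes
runtime = 480                     # 'minutes_per_job' requested by the user
maxSize = 5 * 1000**3             # default output size cap, 5 GB

maxEvents = (maxSize / eventsSize) if eventsSize > 0 else 0
target = int(0.75 * runtime)      # "processing" stage: keep a 33% error margin on the runtime
events = int(target * eventsThr * 60)   # target is in minutes, throughput in events/second
if maxEvents > 0 and events > maxEvents:
    events = int(maxEvents)       # obey the output size limit
print(events)                     # -> 3333: with these numbers the size cap wins over the runtime target
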
    def requestTapeRecall(self, blockList=[], system='Dynamo', msgHead=''):   # pylint: disable=W0102
        """
        :param blockList: a list of blocks to recall from Tape to Disk
        :param system: a string identifying the DDM system to use 'Dynamo' or 'Rucio' or 'None'
        :param msgHead: a string with the initial part of a message to be used for exceptions
        :return: nothing: Since data on tape means no submission possible, this function will
            always raise a TaskWorkerException to stop the action flow.
            The exception message contains details and an attempt is done to upload it to TaskDB
            so that crab status can report it
        """

        msg = msgHead
        if system == 'Rucio':
            # need to use crab_tape_recall Rucio account to create containers and create rules
            tapeRecallConfig = copy.copy(self.config)
            tapeRecallConfig.Services.Rucio_account = 'crab_tape_recall'
            rucioClient = getNativeRucioClient(tapeRecallConfig, self.logger) # pylint: disable=redefined-outer-name
            # turn input CMS blocks into Rucio dids in cms scope
            dids = [{'scope': 'cms', 'name': block} for block in blockList]
            # prepare container /TapeRecall/taskname/USER in the service scope
            myScope = 'user.crab_tape_recall'
            containerName = '/TapeRecall/%s/USER' % self.taskName.replace(':', '.')
            containerDid = {'scope':myScope, 'name':containerName}
            self.logger.info("Create RUcio container %s", containerName)
            try:
                rucioClient.add_container(myScope, containerName)
            except DataIdentifierAlreadyExists:
                self.logger.debug("Container name already exists in Rucio. Keep going")
            except Exception as ex:
                msg += "Rucio exception creating container: %s" %  (str(ex))
                raise TaskWorkerException(msg)
            try:
                rucioClient.attach_dids(myScope, containerName, dids)
            except DuplicateContent:
                self.logger.debug("Some dids are already in this container. Keep going")
            except Exception as ex:
                msg += "Rucio exception adding blocks to container: %s" %  (str(ex))
                raise TaskWorkerException(msg)
            self.logger.info("Rucio container %s:%s created with %d blocks", myScope, containerName, len(blockList))

            # Compute size of recall request
            sizeToRecall = 0
            for block in blockList:
                replicas = rucioClient.list_dataset_replicas('cms', block)
                blockBytes = next(replicas)['bytes']  # pick the first replica of each block; they should all have the same size
                sizeToRecall += blockBytes
            TBtoRecall = sizeToRecall // 1e12
            if TBtoRecall > 0:
                self.logger.info("Total size of data to recall : %d TBytes", TBtoRecall)
            else:
                self.logger.info("Total size of data to recall : %d GBytes", sizeToRecall/1e9)

            if TBtoRecall > 30.:
                grouping = 'DATASET'  # Rucio DATASET i.e. CMS block !
                self.logger.info("Will scatter blocks on multiple sites")
            else:
                grouping = 'ALL'
                self.logger.info("Will place all blocks at a single site")

            # create rule
            RSE_EXPRESSION = 'ddm_quota>0&(tier=1|tier=2)&rse_type=DISK'
            #RSE_EXPRESSION = 'T3_IT_Trieste' # for testing
            WEIGHT = 'ddm_quota'
            #WEIGHT = None # for testing
            LIFETIME = 14 * 24 * 3600  # 14 days
            ASK_APPROVAL = False
            #ASK_APPROVAL = True # for testing
            ACCOUNT = 'crab_tape_recall'
            copies = 1
            try:
                ruleId = rucioClient.add_replication_rule(dids=[containerDid],
                                                  copies=copies, rse_expression=RSE_EXPRESSION,
                                                  grouping=grouping,
                                                  weight=WEIGHT, lifetime=LIFETIME, account=ACCOUNT,
                                                  activity='Analysis Input',
                                                  comment='Staged from tape for %s' % self.username,
                                                  ask_approval=ASK_APPROVAL, asynchronous=True,
                                                  )
            except DuplicateRule:
                # handle "A duplicate rule for this account, did, rse_expression, copies already exists"
                # which should only happen when testing, since the container name is unique like the task name
                self.logger.debug("A duplicate rule for this account, did, rse_expression, copies already exists. Use that")
                # find the existing rule id(s); list_did_rules yields rule dictionaries (assumed to carry an 'id' field)
                ruleId = [rule['id'] for rule in rucioClient.list_did_rules(myScope, containerName)]
            except (InsufficientTargetRSEs, InsufficientAccountLimit, FullStorage) as ex:
                msg = "Not enough global quota to issue a tape recall request. Rucio exception:\n%s" % str(ex)
                raise TaskWorkerException(msg)
            except Exception as ex:
                msg += "Rucio exception creating rule: %s" %  str(ex)
                raise TaskWorkerException(msg)
            ruleId = str(ruleId[0])  # take the single rule id out of the list

            msg += "\nA disk replica has been requested to Rucio (rule ID: %s )" % ruleId
            msg += "\nyou can check progress via either of the following two commands:"
            msg += "\n rucio rule-info %s" % ruleId
            msg += "\n rucio list-rules %s:%s" % (myScope, containerName)
            automaticTapeRecallIsImplemented = True
            if automaticTapeRecallIsImplemented:
                tapeRecallStatus = 'TAPERECALL'
            else:
                tapeRecallStatus = 'SUBMITFAILED'
            configreq = {'workflow': self.taskName,
                         'taskstatus': tapeRecallStatus,
                         'ddmreqid': ruleId,
                         'subresource': 'addddmreqid',
                         }
            try:
                tapeRecallStatusSet = self.crabserver.post(api='task', data=urllib.urlencode(configreq))
            except HTTPException as hte:
                self.logger.exception(hte)
                msg = "HTTP Error while contacting the REST Interface %s:\n%s" % (
                    self.config.TaskWorker.restHost, str(hte))
                msg += "\nStoring of %s status and ruleId (%s) failed for task %s" % (
                    tapeRecallStatus, ruleId, self.taskName)
                msg += "\nHTTP Headers are: %s" % hte.headers
                raise TaskWorkerException(msg, retry=True)
            if tapeRecallStatusSet[2] == "OK":
                self.logger.info("Status for task %s set to '%s'", self.taskName, tapeRecallStatus)
            if automaticTapeRecallIsImplemented:
                msg += "\nThis task will be automatically submitted as soon as the stage-out is completed."
                self.uploadWarning(msg, self.userproxy, self.taskName)
                raise TapeDatasetException(msg)
            # fall here if could not setup for automatic submission after recall
            msg += "\nPlease monitor recall progress via Rucio or DAS and try again once data are on disk."
            raise TaskWorkerException(msg)

        if system == 'None':
            msg += '\nIt is not possible to request a recall from tape.'
            msg += "\nPlease, check DAS (https://cmsweb.cern.ch/das) and make sure the dataset is accessible on DISK."
            raise TaskWorkerException(msg)

        if system == 'Dynamo':
            raise NotImplementedError
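
Because requestTapeRecall never returns normally (it always raises, as the docstring states), a caller has to treat the exceptions as the outcome. Below is a minimal, self-contained sketch of that control flow; the stand-in exception classes and the submitOrRecall helper are made up for illustration (in the real code the exceptions come from TaskWorker.WorkerExceptions).

# Hypothetical sketch of the control flow around requestTapeRecall; names below are stand-ins.
class TaskWorkerException(Exception):
    """Stand-in for TaskWorker.WorkerExceptions.TaskWorkerException."""

class TapeDatasetException(TaskWorkerException):
    """Stand-in for the exception raised once the recall has been arranged."""

def submitOrRecall(action, blocksOnTape):
    if not blocksOnTape:
        return 'submit'               # all data already on disk: go on with submission
    try:
        action.requestTapeRecall(blockList=blocksOnTape, system='Rucio',
                                 msgHead='Task cannot start because the input data are on tape.')
    except TapeDatasetException:
        return 'recall requested'     # task parked in TAPERECALL, resubmitted automatically later
    except TaskWorkerException:
        return 'failed'               # recall could not be arranged; the message reaches the user
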
    def _execute(self, config, task):

        # setup logger
        if not self.logger:
            self.logger = logging.getLogger(__name__)
            handler = logging.StreamHandler(sys.stdout)  # pylint: disable=redefined-outer-name
            formatter = logging.Formatter(
                "%(asctime)s:%(levelname)s:%(module)s %(message)s")  # pylint: disable=redefined-outer-name
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)
            self.logger.setLevel(logging.DEBUG)
        else:
            # do not use the BaseRecurringAction logger but create a new logger
            # which writes to config.TaskWorker.logsDir/tasks/recurring/TapeRecallStatus_YYMMDD-HHMM.log
            self.logger = logging.getLogger('TapeRecallStatus')
            logDir = config.TaskWorker.logsDir + '/tasks/recurring/'
            if not os.path.exists(logDir):
                os.makedirs(logDir)
            timeStamp = time.strftime('%y%m%d-%H%M', time.localtime())
            logFile = 'TapeRecallStatus_' + timeStamp + '.log'
            handler = logging.FileHandler(logDir + logFile)
            formatter = logging.Formatter(
                '%(asctime)s:%(levelname)s:%(module)s:%(message)s')
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)

        mw = MasterWorker(config,
                          logWarning=False,
                          logDebug=False,
                          sequential=True,
                          console=False,
                          name='masterForTapeRecall')

        tapeRecallStatus = 'TAPERECALL'
        self.logger.info("Retrieving %s tasks", tapeRecallStatus)
        recallingTasks = mw.getWork(limit=999999,
                                    getstatus=tapeRecallStatus,
                                    ignoreTWName=True)
        if not recallingTasks:
            self.logger.info("No %s task retrieved.", tapeRecallStatus)
            return

        self.logger.info("Retrieved a total of %d %s tasks",
                         len(recallingTasks), tapeRecallStatus)
        crabserver = mw.crabserver
        for recallingTask in recallingTasks:
            taskName = recallingTask['tm_taskname']
            self.logger.info("Working on task %s", taskName)

            reqId = recallingTask['tm_DDM_reqid']
            if not reqId:
                self.logger.debug(
                    "'tm_DDM_reqid' is not defined for task %s, skipping it",
                    taskName)
                continue
            else:
                msg = "Task points to Rucio RuleId:  %s " % reqId
                self.logger.info(msg)

            if (time.time() - getTimeFromTaskname(
                    str(taskName))) > MAX_DAYS_FOR_TAPERECALL * 24 * 60 * 60:
                self.logger.info(
                    "Task %s is older than %d days, setting its status to FAILED",
                    taskName, MAX_DAYS_FOR_TAPERECALL)
                msg = "The disk replica request (ID: %s) for the input dataset did not complete in %d days." % (
                    reqId, MAX_DAYS_FOR_TAPERECALL)
                failTask(taskName, crabserver, msg, self.logger, 'FAILED')
                continue

            user_proxy = False  # only becomes True if a valid proxy is retrieved below
            if 'S3' not in recallingTask['tm_cache_url'].upper():
                # with the old crabcache we had to worry about the sandbox being purged after 3 days
                mpl = MyProxyLogon(config=config,
                                   crabserver=crabserver,
                                   myproxylen=self.pollingTime)
                user_proxy = True
                try:
                    mpl.execute(task=recallingTask
                                )  # this adds 'user_proxy' to recallingTask
                except TaskWorkerException as twe:
                    user_proxy = False
                    self.logger.exception(twe)

                # Make sure the task sandbox in the crabcache is not deleted until the tape recall is completed
                if user_proxy:
                    self.refreshSandbox(recallingTask)

            # Retrieve status of recall request
            if not self.rucioClient:
                self.rucioClient = getNativeRucioClient(config=config,
                                                        logger=self.logger)
            try:
                ddmRequest = self.rucioClient.get_replication_rule(reqId)
            except RuleNotFound:
                msg = "Rucio rule id %s not found. Please report to experts" % reqId
                self.logger.error(msg)
                if user_proxy:
                    mpl.uploadWarning(msg, recallingTask['user_proxy'],
                                      taskName)
                continue  # without the rule there is nothing more to check for this task
            if ddmRequest['state'] == 'OK':
                self.logger.info(
                    "Request %s is completed, setting status of task %s to NEW",
                    reqId, taskName)
                mw.updateWork(taskName, recallingTask['tm_task_command'],
                              'NEW')
                # Delete all task warnings (the tapeRecallStatus added a dataset warning which is no longer valid now)
                if user_proxy:
                    mpl.deleteWarnings(recallingTask['user_proxy'], taskName)
            else:
                expiration = ddmRequest['expires_at']  # this is a datetime.datetime object
                if expiration < datetime.datetime.now():
                    # give up waiting
                    msg = (
                        "Replication request %s for task %s expired. Setting its status to FAILED"
                        % (reqId, taskName))
                    self.logger.info(msg)
                    failTask(taskName, crabserver, msg, self.logger, 'FAILED')
    #config.TaskWorker.cmscert = os.environ["X509_USER_PROXY"]
    #config.TaskWorker.cmskey = os.environ["X509_USER_PROXY"]

    # will user service cert as defined for TW
    config.TaskWorker.cmscert = os.environ["X509_USER_CERT"]
    config.TaskWorker.cmskey = os.environ["X509_USER_KEY"]
    config.TaskWorker.envForCMSWEB = newX509env(X509_USER_CERT=config.TaskWorker.cmscert,
                                                X509_USER_KEY=config.TaskWorker.cmskey)

    config.TaskWorker.instance = 'prod'

    config.Services.Rucio_host = 'https://cms-rucio.cern.ch'
    config.Services.Rucio_account = 'crab_server'
    config.Services.Rucio_authUrl = 'https://cms-rucio-auth.cern.ch'
    config.Services.Rucio_caPath = '/etc/grid-security/certificates/'
    rucioClient = getNativeRucioClient(config=config, logger=logging.getLogger())

    fileset = DBSDataDiscovery(config=config, rucioClient=rucioClient)
    fileset.execute(task={'tm_nonvalid_input_dataset': 'T', 'tm_use_parent': 0, 'user_proxy': 'None',
                          'tm_input_dataset': dbsDataset, 'tm_secondary_input_dataset': dbsSecondaryDataset,
                          'tm_taskname': 'pippo1', 'tm_username':config.Services.Rucio_account,
                          'tm_split_algo' : 'automatic', 'tm_split_args' : {'runs':[], 'lumis':[]},
                          'tm_dbs_url': DBSUrl}, tempDir='')
    
#===============================================================================
#    Some interesting datasets for testing
#    dataset = '/DoubleMuon/Run2018B-PromptReco-v2/AOD'       # on tape
#    dataset = '/DoubleMuon/Run2018B-02Apr2020-v1/NANOAOD'    # isNano
#    dataset = '/DoubleMuon/Run2018B-17Sep2018-v1/MINIAOD'    # parent of above NANOAOD (for secondaryDataset lookup)
#    dataset = '/MuonEG/Run2016B-07Aug17_ver2-v1/AOD'         # no Nano on disk (at least atm)
#    dataset = '/MuonEG/Run2016B-v1/RAW'                      # on tape
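
The test driver above uses dbsDataset, dbsSecondaryDataset and DBSUrl without defining them. A minimal sketch of plausible assignments, picking one of the test datasets from the comment block; the secondary dataset and the DBS URL are assumptions, not taken from the original:

# Illustrative values for the names used by the test driver above (assumptions).
dbsDataset = '/DoubleMuon/Run2018B-02Apr2020-v1/NANOAOD'     # one of the test datasets listed above
dbsSecondaryDataset = None                                   # no secondary (parent/pile-up) dataset
DBSUrl = 'https://cmsweb.cern.ch/dbs/prod/global/DBSReader'  # standard global DBS reader instance
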
Example #5
def handleNewTask(resthost, dbInstance, config, task, procnum, *args,
                  **kwargs):
    """Performs the injection of a new task

    :arg str resthost: the hostname where the rest interface is running
    :arg str dbInstance: the rest base url to contact
    :arg WMCore.Configuration config: input configuration
    :arg TaskWorker.DataObjects.Task task: the task to work on
    :arg int procnum: the process number taking care of the work
    :*args and *kwargs: extra parameters currently not defined
    :return: the handler."""
    crabserver = CRABRest(resthost,
                          config.TaskWorker.cmscert,
                          config.TaskWorker.cmskey,
                          retry=20,
                          logger=logging.getLogger(str(procnum)),
                          userAgent='CRABTaskWorker',
                          version=__version__)
    crabserver.setDbInstance(dbInstance)
    handler = TaskHandler(task,
                          procnum,
                          crabserver,
                          config,
                          'handleNewTask',
                          createTempDir=True)
    rucioClient = getNativeRucioClient(config=config, logger=handler.logger)
    handler.addWork(
        MyProxyLogon(config=config,
                     crabserver=crabserver,
                     procnum=procnum,
                     myproxylen=60 * 60 * 24))
    handler.addWork(
        StageoutCheck(config=config,
                      crabserver=crabserver,
                      procnum=procnum,
                      rucioClient=rucioClient))
    if task['tm_job_type'] == 'Analysis':
        if task.get('tm_user_files'):
            handler.addWork(
                UserDataDiscovery(config=config,
                                  crabserver=crabserver,
                                  procnum=procnum))
        else:
            handler.addWork(
                DBSDataDiscovery(config=config,
                                 crabserver=crabserver,
                                 procnum=procnum,
                                 rucioClient=rucioClient))
    elif task['tm_job_type'] == 'PrivateMC':
        handler.addWork(
            MakeFakeFileSet(config=config,
                            crabserver=crabserver,
                            procnum=procnum))
    handler.addWork(
        Splitter(config=config, crabserver=crabserver, procnum=procnum))
    handler.addWork(
        DagmanCreator(config=config,
                      crabserver=crabserver,
                      procnum=procnum,
                      rucioClient=rucioClient))
    if task['tm_dry_run'] == 'T':
        handler.addWork(
            DryRunUploader(config=config,
                           crabserver=crabserver,
                           procnum=procnum))
    else:
        handler.addWork(
            DagmanSubmitter(config=config,
                            crabserver=crabserver,
                            procnum=procnum))

    return handler.actionWork(args, kwargs)
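
The chain of actions that handleNewTask registers depends only on a few task fields. The following standalone sketch reproduces that branching with action names as plain strings; the chooseActions helper is hypothetical and exists only for illustration.

# Pure-Python sketch of the branching handleNewTask performs on the task dict (illustrative).
def chooseActions(task):
    actions = ['MyProxyLogon', 'StageoutCheck']
    if task['tm_job_type'] == 'Analysis':
        actions.append('UserDataDiscovery' if task.get('tm_user_files') else 'DBSDataDiscovery')
    elif task['tm_job_type'] == 'PrivateMC':
        actions.append('MakeFakeFileSet')
    actions += ['Splitter', 'DagmanCreator']
    actions.append('DryRunUploader' if task['tm_dry_run'] == 'T' else 'DagmanSubmitter')
    return actions

print(chooseActions({'tm_job_type': 'Analysis', 'tm_user_files': [], 'tm_dry_run': 'F'}))
# -> ['MyProxyLogon', 'StageoutCheck', 'DBSDataDiscovery', 'Splitter', 'DagmanCreator', 'DagmanSubmitter']
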