def executeInternal(self, apmon, *args, **kwargs):
    #Marco: I guess these value errors only happen for development instances
    if 'task' not in kwargs:
        raise ValueError("No task specified.")
    self.task = kwargs['task']
    if 'tm_taskname' not in self.task:
        raise ValueError("No taskname specified.")
    self.workflow = self.task['tm_taskname']
    if 'user_proxy' not in self.task:
        raise ValueError("No proxy provided.")
    self.proxy = self.task['user_proxy']

    self.logger.info("About to kill workflow: %s." % self.workflow)

    self.workflow = str(self.workflow)
    if not WORKFLOW_RE.match(self.workflow):
        raise Exception("Invalid workflow name.")

    # Query HTCondor for information about running jobs and update Dashboard appropriately
    if self.task['tm_collector']:
        self.backendurls['htcondorPool'] = self.task['tm_collector']
    loc = HTCondorLocator.HTCondorLocator(self.backendurls)

    address = ""
    try:
        self.schedd, address = loc.getScheddObjNew(self.task['tm_schedd'])
    except Exception as exp:
        msg = "The CRAB server backend was not able to contact the Grid scheduler."
        msg += " Please try again later."
        msg += " If the error persists send an e-mail to %s." % (FEEDBACKMAIL)
        msg += " Message from the scheduler: %s" % (str(exp))
        self.logger.exception("%s: %s" % (self.workflow, msg))
        raise TaskWorkerException(msg)

    try:
        hostname = socket.getfqdn()
    except Exception:
        hostname = ''

    const = 'CRAB_ReqName =?= %s && TaskType =?= "Job"' % HTCondorUtils.quote(self.workflow)
    try:
        for ad in list(self.schedd.xquery(const, ['CRAB_Id', 'CRAB_Retry'])):
            if ('CRAB_Id' not in ad) or ('CRAB_Retry' not in ad):
                continue
            jobid = str(ad.eval('CRAB_Id'))
            jobretry = str(ad.eval('CRAB_Retry'))
            jinfo = {'broker': hostname,
                     'bossId': jobid,
                     'StatusValue': 'killed',
                    }
            insertJobIdSid(jinfo, jobid, self.workflow, jobretry)
            self.logger.info("Sending kill info to Dashboard: %s" % str(jinfo))
            apmon.sendToML(jinfo)
    except Exception:
        self.logger.exception("Failed to notify Dashboard of job kills")  # warning

    # Note that we can not send kills for jobs not in queue at this time; we'll need the
    # DAG FINAL node to be fixed and the node status to include retry number.
    return self.killAll(const)

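# --- Illustrative sketch (not part of the original code) ---------------------
# A minimal, standalone example of the constraint/projection query pattern used
# in executeInternal() above, assuming the classad and htcondor Python bindings
# of that era are available. The workflow name is made up; HTCondorUtils.quote
# is a thin wrapper around classad.quote, which is used here directly.
import classad
import htcondor

def list_task_jobs(workflow):
    # Build a ClassAd constraint that matches only this task's jobs.
    const = 'CRAB_ReqName =?= %s && TaskType =?= "Job"' % classad.quote(workflow)
    schedd = htcondor.Schedd()  # local schedd, for illustration only
    # Project only the two attributes we need; skip ads missing either one.
    for ad in schedd.xquery(const, ['CRAB_Id', 'CRAB_Retry']):
        if 'CRAB_Id' not in ad or 'CRAB_Retry' not in ad:
            continue
        yield str(ad.eval('CRAB_Id')), str(ad.eval('CRAB_Retry'))
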
def update_dashboard(self, crab_retry):
    """ Upload job monitoring information for this task to the Dashboard. """
    if not self.task_ad:
        return
    params = {'tool': 'crab3',
              'SubmissionType': 'crab3',
              'JSToolVersion': '3.3.0',
              'tool_ui': os.environ.get('HOSTNAME', ''),
              'scheduler': 'GLIDEIN',
              'GridName': self.task_ad['CRAB_UserDN'],
              'ApplicationVersion': self.task_ad['CRAB_JobSW'],
              'taskType': self.task_ad.get("CRAB_DashboardTaskType", 'analysistest'),
              'vo': 'cms',
              'CMSUser': self.task_ad['CRAB_UserHN'],
              'user': self.task_ad['CRAB_UserHN'],
              'taskId': self.task_ad['CRAB_ReqName'],
              'datasetFull': self.task_ad['DESIRED_CMSDataset'],
              'resubmitter': self.task_ad['CRAB_UserHN'],
              'exe': 'cmsRun',
              'broker': self.backend,
              'bossId': str(self.job_id),
              'localId': '',
              'SyncGridJobId': 'https://glidein.cern.ch/%s/%s' % (self.job_id, self.task_ad['CRAB_ReqName'].replace("_", ":")),
             }
    if not self.userWebDirPrx:
        storage_rules = htcondor.param['CRAB_StorageRules']
        self.userWebDirPrx = getWebdirForDb(str(self.task_ad.get('CRAB_ReqName')), storage_rules)
    self.logger.info("User web dir: %s", self.userWebDirPrx)

    insertJobIdSid(params, self.job_id, self.task_ad['CRAB_ReqName'], crab_retry)

def update_dashboard(self, crab_retry):
    """ Upload job monitoring information for this task to the Dashboard. """
    if not self.task_ad:
        return
    params = {'tool': 'crab3',
              'SubmissionType': 'crab3',
              'JSToolVersion': '3.3.0',
              'tool_ui': os.environ.get('HOSTNAME', ''),
              'scheduler': 'GLIDEIN',
              'GridName': self.task_ad['CRAB_UserDN'],
              'ApplicationVersion': self.task_ad['CRAB_JobSW'],
              'taskType': self.task_ad.get("CRAB_DashboardTaskType", 'analysistest'),
              'vo': 'cms',
              'CMSUser': self.task_ad['CRAB_UserHN'],
              'user': self.task_ad['CRAB_UserHN'],
              'taskId': self.task_ad['CRAB_ReqName'],
              'datasetFull': self.task_ad['DESIRED_CMSDataset'],
              'resubmitter': self.task_ad['CRAB_UserHN'],
              'exe': 'cmsRun',
              'broker': self.backend,
              'bossId': str(self.job_id),
              'localId': '',
              'SyncGridJobId': 'https://glidein.cern.ch/%s/%s' % (self.job_id, self.task_ad['CRAB_ReqName'].replace("_", ":")),
             }
    storage_rules = htcondor.param['CRAB_StorageRules']
    userWebDir = getWebdirForDb(str(self.task_ad.get('CRAB_ReqName')), storage_rules)
    userWebDirPrx = ""
    try:
        with open('proxied_webdir') as fd:
            userWebDirPrx = fd.read()
    except IOError as e:
        self.logger.error("I/O error(%s): %s, when looking for the proxied_webdir file."
                          " Might be normal if the schedd does not have a proxiedurl in"
                          " the REST external config." % (e.errno, e.strerror))

    self.logger.info("User web dir proxy: " + userWebDirPrx)
    self.logger.info("User web dir: " + userWebDir)

    if userWebDirPrx:
        setDashboardLogs(params, userWebDirPrx, self.job_id, crab_retry)
    elif userWebDir:
        setDashboardLogs(params, userWebDir, self.job_id, crab_retry)
    else:
        self.logger.warning("Not setting Dashboard log files: could not find CRAB_UserWebDir nor CRAB_UserWebDirPrx.")

    insertJobIdSid(params, self.job_id, self.task_ad['CRAB_ReqName'], crab_retry)

    apmon = ApmonIf()
    self.logger.debug("Dashboard task info: %s" % str(params))
    apmon.sendToML(params)
    apmon.free()

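# --- Illustrative sketch (not part of the original code) ---------------------
# The fallback rule implemented in update_dashboard() above, reduced to a pure
# function: prefer the proxied web directory when the marker file exists,
# otherwise fall back to the plain webdir, otherwise publish nothing. The file
# name mirrors the code above; everything else is illustrative.
def choose_webdir(proxied_webdir_file='proxied_webdir', plain_webdir=''):
    try:
        with open(proxied_webdir_file) as fd:
            proxied = fd.read().strip()
    except IOError:
        # Normal when the schedd has no proxied URL in the REST external config.
        proxied = ''
    return proxied or plain_webdir or None
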
def createSubdag(self, splitterResult, **kwargs):
    startjobid = 0
    dagSpecs = []

    if hasattr(self.config.TaskWorker, 'stageoutPolicy'):
        kwargs['task']['stageoutpolicy'] = ",".join(self.config.TaskWorker.stageoutPolicy)
    else:
        kwargs['task']['stageoutpolicy'] = "local,remote"

    ## In the future this parameter may be set by the user in the CRAB configuration
    ## file and we would take it from the Task DB.
    kwargs['task']['numautomjobretries'] = getattr(self.config.TaskWorker, 'numAutomJobRetries', 2)

    info = self.makeJobSubmit(kwargs['task'])

    outfiles = kwargs['task']['tm_outfiles'] + kwargs['task']['tm_tfile_outfiles'] + kwargs['task']['tm_edm_outfiles']

    os.chmod("CMSRunAnalysis.sh", 0o755)

    # This config setting acts as a global blacklist
    global_blacklist = set(self.getBlacklistedSites())
    self.logger.debug("CRAB site blacklist: %s" % (list(global_blacklist)))
    # This is needed for Site Metrics
    # It should not block any site for Site Metrics and, if needed, for other activities
    # self.config.TaskWorker.ActivitiesToRunEverywhere = ['hctest', 'hcdev']
    if hasattr(self.config.TaskWorker, 'ActivitiesToRunEverywhere') and \
            kwargs['task']['tm_activity'] in self.config.TaskWorker.ActivitiesToRunEverywhere:
        global_blacklist = set()
        self.logger.debug("Ignoring the CRAB site blacklist.")

    sitead = classad.ClassAd()
    siteinfo = {'group_sites': {}, 'group_datasites': {}}
    blocksWithNoLocations = set()

    siteWhitelist = set(kwargs['task']['tm_site_whitelist'])
    siteBlacklist = set(kwargs['task']['tm_site_blacklist'])
    self.logger.debug("Site whitelist: %s" % (list(siteWhitelist)))
    self.logger.debug("Site blacklist: %s" % (list(siteBlacklist)))

    if siteWhitelist & global_blacklist:
        msg = "The following sites from the user site whitelist are blacklisted by the CRAB server: %s." % (list(siteWhitelist & global_blacklist))
        msg += " Since the CRAB server blacklist has precedence, these sites are not considered in the user whitelist."
        self.uploadWarning(msg, kwargs['task']['user_proxy'], kwargs['task']['tm_taskname'])
        self.logger.warning(msg)

    if siteBlacklist & siteWhitelist:
        msg = "The following sites appear in both the user site blacklist and whitelist: %s." % (list(siteBlacklist & siteWhitelist))
        msg += " Since the whitelist has precedence, these sites are not considered in the blacklist."
        self.uploadWarning(msg, kwargs['task']['user_proxy'], kwargs['task']['tm_taskname'])
        self.logger.warning(msg)

    ignoreLocality = kwargs['task']['tm_ignore_locality'] == 'T'
    self.logger.debug("Ignore locality: %s" % (ignoreLocality))

    for jobgroup in splitterResult:
        jobs = jobgroup.getJobs()

        blocks = set()
        for job in jobs:
            for inputfile in job['input_files']:
                blocks.add(inputfile['block'])
        self.logger.debug("Blocks: %s" % list(blocks))

        if not jobs:
            locations = set()
        else:
            locations = set(jobs[0]['input_files'][0]['locations'])
        self.logger.debug("Locations: %s" % (list(locations)))

        ## Discard the blocks that have no locations. This can happen when a block is
        ## still open in PhEDEx. Newly created datasets from T0 (at least) have a large
        ## chance of having some block which is closed in DBS but not in PhEDEx.
        ## Open blocks in PhEDEx can have a location; it is WMCore that is returning no
        ## location.
        ## This is how a block is constructed during data taking:
        ## 1) an open block in T0 is injected into PhEDEx;
        ## 2) files are being added to the block in T0;
        ## 3) data are transferred by PhEDEx if a subscription is present;
        ## 4) once the block is finished:
        ##   a) the block is inserted into DBS as a closed block (before this, DBS has
        ##      no knowledge about the block);
        ##   b) the block is closed in PhEDEx.
        if not locations and not ignoreLocality:
            blocksWithNoLocations = blocksWithNoLocations.union(blocks)
            continue

        if ignoreLocality:
            sbj = SiteDB.SiteDBJSON({"key": self.config.TaskWorker.cmskey,
                                     "cert": self.config.TaskWorker.cmscert})
            try:
                possiblesites = set(sbj.getAllCMSNames())
            except Exception as ex:
                msg = "The CRAB3 server backend could not contact SiteDB to get the list of all CMS sites."
                msg += " This could be a temporary SiteDB glitch."
                msg += " Please try to submit a new task (resubmit will not work)"
                msg += " and contact the experts if the error persists."
                msg += "\nError reason: %s" % (str(ex)) #TODO add the SiteDB url so the user can check themselves!
                raise TaskWorker.WorkerExceptions.TaskWorkerException(msg)
        else:
            possiblesites = locations
        ## At this point 'possiblesites' should never be empty.
        self.logger.debug("Possible sites: %s" % (list(possiblesites)))

        ## Apply the global site blacklist.
        availablesites = possiblesites - global_blacklist

        ## TODO: The messages below do not clarify that only the part of the dataset
        ## that passed the lumi-mask/run-range selection matters here.
        ## Abort the submission of the task if (part of?) the dataset is available only
        ## on sites that are blacklisted by the CRAB server.
        ## Or should we submit at least the jobs on the part of the dataset that
        ## survives the blacklisting? Comment S.Belforte Sep,2015: So far DDM policy
        ## is to replicate entire datasets, not scatter them around. Once we have
        ## very large datasets that can happen, but it is not the case now.
        if not availablesites:
            msg = "The CRAB server backend refuses to send jobs to the Grid scheduler."
            msg += " No site available for submission of task %s" % (kwargs['task']['tm_taskname'])
            msg += "\n\t\t\t\tThe sites available for submission of task %s are blacklisted by the CRAB3 server." % (kwargs['task']['tm_taskname'])
            msg += "\n\t\t\t\tThis is the list of in principle available sites: %s" % (list(possiblesites))
            msg += "\n\t\t\t\tThis is the list of sites that are blacklisted by the CRAB3 server: %s" % (list(global_blacklist))
            raise TaskWorker.WorkerExceptions.NoAvailableSite(msg)

        ## Abort the submission of the task if (part of?) the dataset is available only
        ## on sites that are removed after applying the user site blacklist/whitelist.
        ## Or should we submit at least the jobs on the part of the dataset that
        ## survives the blacklisting/whitelisting? (See S.Belforte comment above.)
        # NOTE: Users can still shoot themselves in the foot with the resubmit blacklist.
        # However, this is the last chance we have to warn the users about an impossible task at submit time.
        available = set(availablesites)
        if siteWhitelist:
            available &= siteWhitelist
            if not available:
                msg = "The CRAB server backend refuses to send jobs to the Grid scheduler."
                msg += " You put %s as site whitelist," % (list(siteWhitelist))
                msg += " but the input dataset '%s' can only be accessed at these sites: %s." % (kwargs['task']['tm_input_dataset'], list(availablesites))
                msg += " Please check your site whitelist."
                raise TaskWorker.WorkerExceptions.NoAvailableSite(msg)
        available -= (siteBlacklist - siteWhitelist)
        if not available:
            msg = "The CRAB server backend refuses to send jobs to the Grid scheduler."
            msg += " You put %s as site blacklist," % (list(siteBlacklist - siteWhitelist))
            msg += " when the input dataset '%s' can actually only be accessed at these sites: %s." % (kwargs['task']['tm_input_dataset'], list(availablesites))
            msg += " Please check in DAS the locations of the input dataset."
            msg += " Hint: the ignoreLocality option might help."
            raise TaskWorker.WorkerExceptions.NoAvailableSite(msg)

        availablesites = [str(i) for i in availablesites]
        datasites = jobs[0]['input_files'][0]['locations']
        self.logger.info("Resulting available sites: %s" % (list(availablesites)))

        if siteWhitelist or siteBlacklist:
            msg = "The site whitelist and blacklist will be applied by the pre-job."
            msg += " This is expected to result in DESIRED_SITES = %s" % (list(available))
            self.logger.debug(msg)

        jobgroupDagSpecs, startjobid = self.makeDagSpecs(kwargs['task'], sitead, siteinfo,
                                                         jobgroup, list(blocks)[0], availablesites,
                                                         datasites, outfiles, startjobid)
        dagSpecs += jobgroupDagSpecs

    if not dagSpecs:
        msg = "No jobs created for task %s." % (kwargs['task']['tm_taskname'])
        if blocksWithNoLocations:
            msg = "The CRAB server backend refuses to send jobs to the Grid scheduler."
            msg += " No locations found for dataset '%s'" % (kwargs['task']['tm_input_dataset'])
            msg += " (or at least for the part of the dataset that passed the lumi-mask and/or run-range selection)."
        raise TaskWorker.WorkerExceptions.NoAvailableSite(msg)

    if blocksWithNoLocations:
        msg = "The following blocks from dataset '%s' were skipped," % (kwargs['task']['tm_input_dataset'])
        msg += " because they have no locations: %s." % (sorted(list(blocksWithNoLocations)))
        self.uploadWarning(msg, kwargs['task']['user_proxy'], kwargs['task']['tm_taskname'])
        self.logger.warning(msg)

    ## Write down the DAG as needed by DAGMan.
    dag = DAG_HEADER % {'resthost': kwargs['task']['resthost'],
                        'resturiwfdb': kwargs['task']['resturinoapi'] + '/workflowdb'}
    for dagSpec in dagSpecs:
        dag += DAG_FRAGMENT % dagSpec

    ## Create a tarball with all the job lumi files.
    run_and_lumis_tar = tarfile.open("run_and_lumis.tar.gz", "w:gz")
    ## Also create a tarball with the dataset input files.
    ## Each .txt file in the tarball contains a list of dataset files to be used for the job.
    input_files_tar = tarfile.open("input_files.tar.gz", "w:gz")
    for dagSpec in dagSpecs:
        job_lumis_file = 'job_lumis_' + str(dagSpec['count']) + '.json'
        job_input_file_list = 'job_input_file_list_' + str(dagSpec['count']) + '.txt'
        with open(job_lumis_file, "w") as fd:
            fd.write(str(dagSpec['runAndLumiMask']))
        with open(job_input_file_list, "w") as fd:
            fd.write(str(dagSpec['inputFiles']))
        run_and_lumis_tar.add(job_lumis_file)
        input_files_tar.add(job_input_file_list)
        os.remove(job_lumis_file)
        os.remove(job_input_file_list)
    run_and_lumis_tar.close()
    input_files_tar.close()

    ## Save the DAG into a file.
    with open("RunJobs.dag", "w") as fd:
        fd.write(dag)

    with open("site.ad", "w") as fd:
        fd.write(str(sitead))

    with open("site.ad.json", "w") as fd:
        json.dump(siteinfo, fd)

    task_name = kwargs['task'].get('CRAB_ReqName', kwargs['task'].get('tm_taskname', ''))
    userdn = kwargs['task'].get('CRAB_UserDN', kwargs['task'].get('tm_user_dn', ''))

    info["jobcount"] = len(dagSpecs)

    maxpost = getattr(self.config.TaskWorker, 'maxPost', 20)
    if maxpost == -1:
        maxpost = info['jobcount']
    elif maxpost == 0:
        maxpost = int(max(20, info['jobcount'] * .1))
    info['maxpost'] = maxpost

    if info.get('faillimit') is None:
        info['faillimit'] = -1
        #if info['jobcount'] > 200:
        #    info['faillimit'] = 100
        #else:
        #    info['faillimit'] = -1
    elif info.get('faillimit') < 0:
        info['faillimit'] = -1

    # Info for ML:
    target_se = ''
    max_len_target_se = 900
    for site in map(str, availablesites):
        if len(target_se) > max_len_target_se:
            target_se += ',Many_More'
            break
        if len(target_se):
            target_se += ','
        target_se += site
    ml_info = info.setdefault('apmon', [])
    for idx in range(1, info['jobcount'] + 1):
        taskid = kwargs['task']['tm_taskname']
        jinfo = {'broker': os.environ.get('HOSTNAME', ''),
                 'bossId': str(idx),
                 'TargetSE': target_se,
                 'localId': '',
                 'StatusValue': 'pending',
                }
        insertJobIdSid(jinfo, idx, taskid, 0)
        ml_info.append(jinfo)

    # When running in standalone mode, we want to record the number of jobs in the task.
    if ('CRAB_ReqName' in kwargs['task']) and ('CRAB_UserDN' in kwargs['task']):
        const = 'TaskType =?= "ROOT" && CRAB_ReqName =?= "%s" && CRAB_UserDN =?= "%s"' % (task_name, userdn)
        cmd = "condor_qedit -const '%s' CRAB_JobCount %d" % (const, len(dagSpecs))
        self.logger.debug("+ %s" % cmd)
        status, output = commands.getstatusoutput(cmd)
        if status:
            self.logger.error(output)
            self.logger.error("Failed to record the number of jobs.")
            return 1

    return info, splitterResult

def createSubdag(self, splitterResult, **kwargs):
    startjobid = kwargs.get('startjobid', 0)
    subjob = kwargs.get('subjob', None)
    stage = kwargs.get('stage', 'conventional')
    self.logger.debug('starting createSubdag, kwargs are:')
    self.logger.debug(str(kwargs))
    dagSpecs = []
    subdags = []

    if hasattr(self.config.TaskWorker, 'stageoutPolicy'):
        kwargs['task']['stageoutpolicy'] = ",".join(self.config.TaskWorker.stageoutPolicy)
    else:
        kwargs['task']['stageoutpolicy'] = "local,remote"

    ## In the future this parameter may be set by the user in the CRAB configuration
    ## file and we would take it from the Task DB.
    kwargs['task']['numautomjobretries'] = getattr(self.config.TaskWorker, 'numAutomJobRetries', 2)

    kwargs['task']['max_runtime'] = kwargs['task']['tm_split_args'].get('seconds_per_job', -1)
    if kwargs['task']['tm_split_algo'] == 'Automatic' and stage == 'conventional':
        kwargs['task']['max_runtime'] = getattr(self.config.TaskWorker, 'splittingPilotRuntime', 15 * 60)
        kwargs['task']['completion_jobs'] = getattr(self.config.TaskWorker, 'completionJobs', False)
        outfiles = []
        stage = 'probe'
    if stage == 'process' and not kwargs['task']['completion_jobs']:
        kwargs['task']['max_runtime'] = -1
    if stage == 'probe':
        parent = None
        startjobid = -1
    else:
        parent = startjobid

    info = self.makeJobSubmit(kwargs['task'])

    outfiles = kwargs['task']['tm_outfiles'] + kwargs['task']['tm_tfile_outfiles'] + kwargs['task']['tm_edm_outfiles']

    os.chmod("CMSRunAnalysis.sh", 0o755)

    # This config setting acts as a global blacklist
    global_blacklist = set(self.getBlacklistedSites())
    self.logger.debug("CRAB site blacklist: %s" % (list(global_blacklist)))
    # This is needed for Site Metrics
    # It should not block any site for Site Metrics and, if needed, for other activities
    # self.config.TaskWorker.ActivitiesToRunEverywhere = ['hctest', 'hcdev']
    # The other case where the blacklist is ignored is if the user set this explicitly
    # in their configuration.
    if self.isGlobalBlacklistIgnored(kwargs) or \
            (hasattr(self.config.TaskWorker, 'ActivitiesToRunEverywhere') and
             kwargs['task']['tm_activity'] in self.config.TaskWorker.ActivitiesToRunEverywhere):
        global_blacklist = set()
        self.logger.debug("Ignoring the CRAB site blacklist.")

    sitead = classad.ClassAd()
    siteinfo = {'group_sites': {}, 'group_datasites': {}}

    blocksWithNoLocations = set()
    blocksWithBannedLocations = set()
    allblocks = set()

    siteWhitelist = set(kwargs['task']['tm_site_whitelist'])
    siteBlacklist = set(kwargs['task']['tm_site_blacklist'])
    self.logger.debug("Site whitelist: %s" % (list(siteWhitelist)))
    self.logger.debug("Site blacklist: %s" % (list(siteBlacklist)))

    if siteWhitelist & global_blacklist:
        msg = "The following sites from the user site whitelist are blacklisted by the CRAB server: %s." % (list(siteWhitelist & global_blacklist))
        msg += " Since the CRAB server blacklist has precedence, these sites are not considered in the user whitelist."
        self.uploadWarning(msg, kwargs['task']['user_proxy'], kwargs['task']['tm_taskname'])
        self.logger.warning(msg)

    if siteBlacklist & siteWhitelist:
        msg = "The following sites appear in both the user site blacklist and whitelist: %s." % (list(siteBlacklist & siteWhitelist))
        msg += " Since the whitelist has precedence, these sites are not considered in the blacklist."
        self.uploadWarning(msg, kwargs['task']['user_proxy'], kwargs['task']['tm_taskname'])
        self.logger.warning(msg)

    ignoreLocality = kwargs['task']['tm_ignore_locality'] == 'T'
    self.logger.debug("Ignore locality: %s" % (ignoreLocality))

    for jobgroup in splitterResult[0]:
        self.logger.debug(dir(jobgroup))
        self.logger.debug(type(jobgroup))
        jobs = jobgroup.getJobs()

        jgblocks = set()  # job group blocks
        for job in jobs:
            for inputfile in job['input_files']:
                jgblocks.add(inputfile['block'])
                allblocks.add(inputfile['block'])
        self.logger.debug("Blocks: %s" % list(jgblocks))

        if not jobs:
            locations = set()
        else:
            locations = set(jobs[0]['input_files'][0]['locations'])
        self.logger.debug("Locations: %s" % (list(locations)))

        ## Discard the jgblocks that have no locations. This can happen when a block is
        ## still open in PhEDEx. Newly created datasets from T0 (at least) have a large
        ## chance of having some block which is closed in DBS but not in PhEDEx.
        ## Open blocks in PhEDEx can have a location; it is WMCore that is returning no
        ## location.
        ## This is how a block is constructed during data taking:
        ## 1) an open block in T0 is injected into PhEDEx;
        ## 2) files are being added to the block in T0;
        ## 3) data are transferred by PhEDEx if a subscription is present;
        ## 4) once the block is finished:
        ##   a) the block is inserted into DBS as a closed block (before this, DBS has
        ##      no knowledge about the block);
        ##   b) the block is closed in PhEDEx.
        if not locations and not ignoreLocality:
            blocksWithNoLocations = blocksWithNoLocations.union(jgblocks)
            continue

        if ignoreLocality:
            sbj = SiteDB.SiteDBJSON({"key": self.config.TaskWorker.cmskey,
                                     "cert": self.config.TaskWorker.cmscert})
            try:
                possiblesites = set(sbj.getAllCMSNames())
            except Exception as ex:
                msg = "The CRAB3 server backend could not contact SiteDB to get the list of all CMS sites."
                msg += " This could be a temporary SiteDB glitch."
                msg += " Please try to submit a new task (resubmit will not work)"
                msg += " and contact the experts if the error persists."
                msg += "\nError reason: %s" % (str(ex)) #TODO add the SiteDB url so the user can check themselves!
                raise TaskWorker.WorkerExceptions.TaskWorkerException(msg)
        else:
            possiblesites = locations
        ## At this point 'possiblesites' should never be empty.
        self.logger.debug("Possible sites: %s" % (list(possiblesites)))

        ## Apply the global site blacklist.
        availablesites = possiblesites - global_blacklist

        ## See https://github.com/dmwm/CRABServer/issues/5241
        ## for a discussion about blocksWithBannedLocations.
        if not availablesites:
            blocksWithBannedLocations = blocksWithBannedLocations.union(jgblocks)
            continue

        # NOTE: Users can still shoot themselves in the foot with the resubmit blacklist.
        # However, this is the last chance we have to warn the users about an impossible task at submit time.
        available = set(availablesites)
        if siteWhitelist:
            available &= siteWhitelist
            if not available:
                blocksWithBannedLocations = blocksWithBannedLocations.union(jgblocks)
                continue
        available -= (siteBlacklist - siteWhitelist)
        if not available:
            blocksWithBannedLocations = blocksWithBannedLocations.union(jgblocks)
            continue

        availablesites = [str(i) for i in availablesites]
        datasites = jobs[0]['input_files'][0]['locations']
        self.logger.info("Resulting available sites: %s" % (list(availablesites)))

        if siteWhitelist or siteBlacklist:
            msg = "The site whitelist and blacklist will be applied by the pre-job."
            msg += " This is expected to result in DESIRED_SITES = %s" % (list(available))
            self.logger.debug(msg)

        jobgroupDagSpecs, startjobid = self.makeDagSpecs(kwargs['task'], sitead, siteinfo,
                                                         jobgroup, list(jgblocks)[0], availablesites,
                                                         datasites, outfiles, startjobid,
                                                         subjob=subjob, stage=stage)
        dagSpecs += jobgroupDagSpecs

    def getBlacklistMsg():
        tmp = ""
        if len(global_blacklist) != 0:
            tmp += " Global CRAB3 blacklist is %s.\n" % global_blacklist
        if len(siteBlacklist) != 0:
            tmp += " User blacklist is %s.\n" % siteBlacklist
        if len(siteWhitelist) != 0:
            tmp += " User whitelist is %s.\n" % siteWhitelist
        return tmp

    if not dagSpecs:
        msg = "No jobs created for task %s." % (kwargs['task']['tm_taskname'])
        if blocksWithNoLocations or blocksWithBannedLocations:
            msg = "The CRAB server backend refuses to send jobs to the Grid scheduler. "
            msg += "No locations found for dataset '%s' " % (kwargs['task']['tm_input_dataset'])
            msg += "(or at least for the part of the dataset that passed the lumi-mask and/or run-range selection).\n"
            if blocksWithBannedLocations:
                msg += " Found %s (out of %s) blocks present only at blacklisted sites." % (len(blocksWithBannedLocations), len(allblocks))
                msg += getBlacklistMsg()
        raise TaskWorker.WorkerExceptions.NoAvailableSite(msg)

    msg = "Some blocks from dataset '%s' were skipped" % (kwargs['task']['tm_input_dataset'])
    if blocksWithNoLocations:
        msgBlocklist = sorted(list(blocksWithNoLocations))[:10] + ['...']
        msg += " because they have no locations.\n List is (first 10 elements only): %s.\n" % msgBlocklist
    if blocksWithBannedLocations:
        msg += " because they are only present at blacklisted sites.\n List is: %s.\n" % (sorted(list(blocksWithBannedLocations)))
        msg += getBlacklistMsg()
    if blocksWithNoLocations or blocksWithBannedLocations:
        msg += (" Dataset processing will be incomplete because %s (out of %s) blocks"
                " have no locations or are only present at blacklisted site(s)." %
                (len(blocksWithNoLocations) + len(blocksWithBannedLocations), len(allblocks)))
        self.uploadWarning(msg, kwargs['task']['user_proxy'], kwargs['task']['tm_taskname'])
        self.logger.warning(msg)

    ## Write down the DAG as needed by DAGMan.
    dag = DAG_HEADER.format(nodestate='' if not parent else '.{0}'.format(parent),
                            resthost=kwargs['task']['resthost'],
                            resturiwfdb=kwargs['task']['resturinoapi'] + '/workflowdb')
    if stage == 'probe':
        # We want only one probe job.
        dagSpecs = dagSpecs[:1]
    for dagSpec in dagSpecs:
        dag += DAG_FRAGMENT.format(**dagSpec)
        if stage == 'probe' or (stage == 'process' and kwargs['task']['completion_jobs']):
            dag += SUBDAG_FRAGMENT.format(**dagSpec)
            subdag = "RunJobs{0}.subdag".format(dagSpec['count'])
            with open(subdag, "w") as fd:
                fd.write("")
            subdags.append(subdag)

    ## Create a tarball with all the job lumi files.
    with getLock('splitting_data'):
        self.logger.debug("Acquired lock on run and lumi tarball")
        try:
            tempDir = tempfile.mkdtemp()
            tempDir2 = tempfile.mkdtemp()
            try:
                tfd = tarfile.open('run_and_lumis.tar.gz', 'r:gz')
                tfd.extractall(tempDir)
                tfd.close()
            except (tarfile.ReadError, IOError):
                self.logger.debug("First iteration: creating run and lumi tarball from scratch")
            try:
                tfd2 = tarfile.open('input_files.tar.gz', 'r:gz')
                tfd2.extractall(tempDir2)
                tfd2.close()
            except (tarfile.ReadError, IOError):
                self.logger.debug("First iteration: creating input files tarball from scratch")
            tfd = tarfile.open('run_and_lumis.tar.gz', 'w:gz')
            tfd2 = tarfile.open('input_files.tar.gz', 'w:gz')
            for dagSpec in dagSpecs:
                job_lumis_file = os.path.join(tempDir, 'job_lumis_' + str(dagSpec['count']) + '.json')
                with open(job_lumis_file, "w") as fd:
                    fd.write(str(dagSpec['runAndLumiMask']))
                ## Also create a tarball with the dataset input files.
                ## Each .txt file in the tarball contains a list of dataset files to be used for the job.
                job_input_file_list = os.path.join(tempDir2, 'job_input_file_list_' + str(dagSpec['count']) + '.txt')
                with open(job_input_file_list, "w") as fd2:
                    fd2.write(str(dagSpec['inputFiles']))
        finally:
            tfd.add(tempDir, arcname='')
            tfd.close()
            shutil.rmtree(tempDir)
            tfd2.add(tempDir2, arcname='')
            tfd2.close()
            shutil.rmtree(tempDir2)

    if stage in ('probe', 'conventional'):
        name = "RunJobs.dag"
        ## Cache data discovery
        with open("datadiscovery.pkl", "wb") as fd:
            pickle.dump(splitterResult[1], fd)
        ## Cache task information
        with open("taskinformation.pkl", "wb") as fd:
            pickle.dump(kwargs['task'], fd)
    else:
        name = "RunJobs{0}.subdag".format(parent)
    if stage != 'tail':
        ## Cache site information
        with open("site.ad", "w") as fd:
            fd.write(str(sitead))
        with open("site.ad.json", "w") as fd:
            json.dump(siteinfo, fd)

    ## Save the DAG into a file.
    with open(name, "w") as fd:
        fd.write(dag)

    task_name = kwargs['task'].get('CRAB_ReqName', kwargs['task'].get('tm_taskname', ''))
    userdn = kwargs['task'].get('CRAB_UserDN', kwargs['task'].get('tm_user_dn', ''))

    info["jobcount"] = len(dagSpecs)

    maxpost = getattr(self.config.TaskWorker, 'maxPost', 20)
    if maxpost == -1:
        maxpost = info['jobcount']
    elif maxpost == 0:
        maxpost = int(max(20, info['jobcount'] * .1))
    info['maxpost'] = maxpost

    if info.get('faillimit') is None:
        info['faillimit'] = -1
        #if info['jobcount'] > 200:
        #    info['faillimit'] = 100
        #else:
        #    info['faillimit'] = -1
    elif info.get('faillimit') < 0:
        info['faillimit'] = -1

    # Info for ML:
    target_se = ''
    max_len_target_se = 900
    for site in map(str, availablesites):
        if len(target_se) > max_len_target_se:
            target_se += ',Many_More'
            break
        if len(target_se):
            target_se += ','
        target_se += site
    ml_info = info.setdefault('apmon', [])
    shift = 0 if stage == 'probe' else 1
    for idx in range(shift, info['jobcount'] + shift):
        taskid = kwargs['task']['tm_taskname']
        jinfo = {'broker': os.environ.get('HOSTNAME', ''),
                 'bossId': str(idx),
                 'TargetSE': target_se,
                 'localId': '',
                 'StatusValue': 'pending',
                }
        insertJobIdSid(jinfo, idx, taskid, 0)
        ml_info.append(jinfo)

    # When running in standalone mode, we want to record the number of jobs in the task.
    if ('CRAB_ReqName' in kwargs['task']) and ('CRAB_UserDN' in kwargs['task']):
        const = 'TaskType =?= "ROOT" && CRAB_ReqName =?= "%s" && CRAB_UserDN =?= "%s"' % (task_name, userdn)
        cmd = "condor_qedit -const '%s' CRAB_JobCount %d" % (const, len(dagSpecs))
        self.logger.debug("+ %s" % cmd)
        status, output = commands.getstatusoutput(cmd)
        if status:
            self.logger.error(output)
            self.logger.error("Failed to record the number of jobs.")
            return 1

    return info, splitterResult, subdags

def killTransfers(self, apmon):
    self.logger.info("About to kill transfers from workflow %s." % self.workflow)
    asourl = self.task.get('tm_asourl', None)
    # Default asodb to 'asynctransfer'; for old tasks this field is empty!
    # Probably tm_asodb is always there and the get is not necessary, but let's not assume this.
    asodb = self.task.get('tm_asodb', 'asynctransfer') or 'asynctransfer'
    if not asourl:
        self.logger.info("ASO URL not set; will not kill transfers")
        return False

    try:
        hostname = socket.getfqdn()
    except Exception:
        hostname = ''

    server = CMSCouch.CouchServer(dburl=asourl, ckey=self.proxy, cert=self.proxy)
    try:
        db = server.connectDatabase(asodb)
    except Exception:
        msg = "Error while connecting to asynctransfer CouchDB"
        self.logger.exception(msg)
        raise TaskWorkerException(msg)
    self.queryKill = {'reduce': False, 'key': self.workflow, 'include_docs': True}
    try:
        filesKill = db.loadView('AsyncTransfer', 'forKill', self.queryKill)['rows']
    except Exception:
        msg = "Error while querying the asynctransfer CouchDB"
        self.logger.exception(msg)
        raise TaskWorkerException(msg)
    if len(filesKill) == 0:
        self.logger.warning('No files to kill found')
    for idt in filesKill:
        now = str(datetime.datetime.now())
        docid = idt['value']
        data = {'end_time': now,
                'state': 'killed',
                'last_update': time.time(),
                'retry': now,
               }
        updateUri = "/%s/_design/AsyncTransfer/_update/updateJobs/%s?%s" % (db.name, docid, urllib.urlencode(data))
        jobid = idt.get('jobid')
        jobretry = idt.get('job_retry_count')
        if not self.task['kill_all']:
            if idt.get("jobid") not in self.task['kill_ids']:
                continue
        self.logger.info("Killing transfer %s (job ID %s; job retry %s)." % (docid, str(jobid), str(jobretry)))
        if jobid is not None and jobretry is not None:
            jobid = str(jobid)
            jobretry = str(jobretry)
            jinfo = {'broker': hostname,
                     'bossId': jobid,
                     'StatusValue': 'killed',
                    }
            insertJobIdSid(jinfo, jobid, self.workflow, jobretry)
            self.logger.info("Sending kill info to Dashboard: %s" % str(jinfo))
            apmon.sendToML(jinfo)
        try:
            db.makeRequest(uri=updateUri, type="PUT", decode=False)
        except Exception as ex:
            msg = "Error updating document in CouchDB: "
            msg += str(ex)
            msg += str(traceback.format_exc())
            raise TaskWorkerException(msg)
    return True

def killTransfers(self, apmon):
    self.logger.info("About to kill transfers from workflow %s." % self.workflow)
    ASOURL = self.task.get('tm_asourl', None)
    if not ASOURL:
        self.logger.info("ASO URL not set; will not kill transfers")
        return False

    try:
        hostname = socket.getfqdn()
    except Exception:
        hostname = ''

    server = CMSCouch.CouchServer(dburl=ASOURL, ckey=self.proxy, cert=self.proxy)
    try:
        db = server.connectDatabase('asynctransfer')
    except Exception:
        msg = "Error while connecting to asynctransfer CouchDB"
        self.logger.exception(msg)
        raise TaskWorkerException(msg)
    self.queryKill = {'reduce': False, 'key': self.workflow, 'include_docs': True}
    try:
        filesKill = db.loadView('AsyncTransfer', 'forKill', self.queryKill)['rows']
    except Exception:
        msg = "Error while querying the asynctransfer CouchDB"
        self.logger.exception(msg)
        raise TaskWorkerException(msg)
    if len(filesKill) == 0:
        self.logger.warning('No files to kill found')
    for idt in filesKill:
        now = str(datetime.datetime.now())
        docid = idt['value']
        data = {'end_time': now,
                'state': 'killed',
                'last_update': time.time(),
                'retry': now,
               }
        updateUri = "/%s/_design/AsyncTransfer/_update/updateJobs/%s?%s" % (db.name, docid, urllib.urlencode(data))
        jobid = idt.get('jobid')
        jobretry = idt.get('job_retry_count')
        if not self.task['kill_all']:
            if idt.get("jobid") not in self.task['kill_ids']:
                continue
        self.logger.info("Killing transfer %s (job ID %s; job retry %s)." % (docid, str(jobid), str(jobretry)))
        if jobid is not None and jobretry is not None:
            jobid = str(jobid)
            jobretry = str(jobretry)
            jinfo = {'broker': hostname,
                     'bossId': jobid,
                     'StatusValue': 'killed',
                    }
            insertJobIdSid(jinfo, jobid, self.workflow, jobretry)
            self.logger.info("Sending kill info to Dashboard: %s" % str(jinfo))
            apmon.sendToML(jinfo)
        try:
            db.makeRequest(uri=updateUri, type="PUT", decode=False)
        except Exception as ex:
            msg = "Error updating document in CouchDB: "
            msg += str(ex)
            msg += str(traceback.format_exc())
            raise TaskWorkerException(msg)
    return True

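# --- Illustrative sketch (not part of the original code) ---------------------
# How the CouchDB update-handler URI in killTransfers() is assembled, using
# only the standard library. The database name, document ID, and field values
# are examples; the actual request goes through WMCore's CMSCouch client.
import datetime
import time
try:
    from urllib import urlencode        # Python 2, as in the code above
except ImportError:
    from urllib.parse import urlencode  # Python 3

now = str(datetime.datetime.now())
data = {'end_time': now,
        'state': 'killed',
        'last_update': time.time(),
        'retry': now,
       }
doc_id = 'abc123'  # hypothetical transfer document ID
update_uri = "/%s/_design/AsyncTransfer/_update/updateJobs/%s?%s" % ('asynctransfer', doc_id, urlencode(data))
print(update_uri)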