Example #1
    def executeInternal(self, *args):
        """The executeInternal method return 4 if the "completion" threshold is not reached, 0 otherwise"""
        self.stage = args[0]
        self.completion = int(args[1])
        self.prefix = args[2]

        self.setupLog()

        self.statusCacheInfo = {} #Will be filled with the status from the status cache

        self.readJobStatus()
        completed = set(self.completedJobs(stage=self.stage))
        if len(completed) < self.completion:
            return 4

        self.readProcessedJobs()
        unprocessed = completed - self.processedJobs
        estimates = copy.copy(unprocessed)
        self.logger.info("jobs remaining to process: %s", ", ".join(sorted(unprocessed)))
        if self.stage == 'tail' and len(estimates-set(self.failedJobs)) == 0:
            estimates = set(self.completedJobs(stage='processing', processFailed=False))
        self.logger.info("jobs remaining to process: %s", ", ".join(sorted(unprocessed)))

        # The TaskWorker saves some files that we now read back
        with open('datadiscovery.pkl', 'rb') as fd:
            dataset = pickle.load(fd) #Output from the discovery process
        with open('taskinformation.pkl', 'rb') as fd:
            task = pickle.load(fd) #A dictionary containing information about the task as in the Oracle DB
        with open('taskworkerconfig.pkl', 'rb') as fd:
            config = pickle.load(fd) #Task worker configuration

        # need to use user proxy as credential for talking with cmsweb
        config.TaskWorker.cmscert = os.environ.get('X509_USER_PROXY')
        config.TaskWorker.cmskey  = os.environ.get('X509_USER_PROXY')
        config.TaskWorker.envForCMSWEB = newX509env(X509_USER_CERT=config.TaskWorker.cmscert,
                                                         X509_USER_KEY=config.TaskWorker.cmskey)

        # need the global black list
        config.TaskWorker.scratchDir = './scratchdir'
        if not os.path.exists(config.TaskWorker.scratchDir):
            os.makedirs(config.TaskWorker.scratchDir)
        from TaskWorker.Actions.Recurring.BanDestinationSites import CRAB3BanDestinationSites
        banSites = CRAB3BanDestinationSites(config, 'dummy', 'dummy', self.logger)
        with config.TaskWorker.envForCMSWEB:
            banSites.execute()

        # Read the automatic_splitting/throughputs/0-N files where the PJ
        # saved the EventThroughput
        # (report['steps']['cmsRun']['performance']['cpu']['EventThroughput'])
        # and the average size of the output per event
        sumEventsThr = 0
        sumEventsSize = 0
        count = 0
        for jid in estimates:
            if jid in self.failedJobs:
                continue
            fn = "automatic_splitting/throughputs/{0}".format(jid)
            with open(fn) as fd:
                throughput, eventsize = json.load(fd)
                sumEventsThr += throughput
                sumEventsSize += eventsize
                count += 1
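        # NOTE: the averages below assume at least one non-failed job wrote a throughput report (count > 0)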
        eventsThr = sumEventsThr / count
        eventsSize = sumEventsSize / count

        self.logger.info("average throughput for %s jobs: %s evt/s", count, eventsThr)
        self.logger.info("average eventsize for %s jobs: %s bytes", count, eventsSize)

        maxSize = getattr(config.TaskWorker, 'automaticOutputSizeMaximum', 5 * 1000**3)
        maxEvents = (maxSize / eventsSize) if eventsSize > 0 else 0

        runtime = task['tm_split_args'].get('minutes_per_job', -1)
        if self.stage == "processing":
            # Build in a 33% error margin in the runtime to not create too
            # many tails. This essentially moves the peak to lower
            # runtimes and cuts off less of the job distribution tail.
            target = int(0.75 * runtime)
        elif self.stage == 'tail':
            target = int(max(
                getattr(config.TaskWorker, 'automaticTailRuntimeMinimumMins', 45),
                getattr(config.TaskWorker, 'automaticTailRuntimeFraction', 0.2) * runtime
            ))
        # `target` is in minutes, `eventsThr` is in events/second!
        events = int(target * eventsThr * 60)
        if events > maxEvents and maxEvents > 0:
            self.logger.info("reduced the target event count from %s to %s to obey output size", events, maxEvents)
            events = int(maxEvents)
        splitTask = dict(task)
        splitTask['tm_split_algo'] = 'EventAwareLumiBased'
        splitTask['tm_split_args']['events_per_job'] = events

        if self.stage == 'tail' and not self.adjustLumisForCompletion(splitTask, unprocessed):
            self.logger.info("nothing to process for completion")
            self.saveProcessedJobs(unprocessed)
            return 0

        # Disable retries for processing: every lumi is attempted once in
        # processing and up to three more times in the tails, i.e. four times
        # in total. That should be enough "retries"
        #
        # See note in DagmanCreator about getting this from the Task DB
        if self.stage == "processing":
            config.TaskWorker.numAutomJobRetries = 0

        try:
            splitter = Splitter(config, server=None, resturi='')
            split_result = splitter.execute(dataset, task=splitTask)
            self.logger.info("Splitting results:")
            for g in split_result.result[0]:
                msg = "Created jobgroup with length {0}".format(len(g.getJobs()))
                self.logger.info(msg)
        except TaskWorkerException as e:
            retmsg = "Splitting failed with:\n{0}".format(e)
            self.logger.error(retmsg)
#            self.set_dashboard_state('FAILED')
            return 1
        try:
            parent = self.prefix if self.stage == 'tail' else None
            creator = DagmanCreator(config, server=None, resturi='')
            with config.TaskWorker.envForCMSWEB:
                creator.createSubdag(split_result.result, task=task, parent=parent, stage=self.stage)
            self.submitSubdag('RunJobs{0}.subdag'.format(self.prefix), getattr(config.TaskWorker, 'maxPost', 20), self.stage)
        except TaskWorkerException as e:
            retmsg = "DAG creation failed with:\n{0}".format(e)
            self.logger.error(retmsg)
#            self.set_dashboard_state('FAILED')
            return 1
        self.saveProcessedJobs(unprocessed)
        return 0
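
A minimal sketch of the sizing arithmetic used above, with hypothetical numbers; `size_events_per_job` and its parameters are illustrative names, not part of the TaskWorker code. The target runtime is in minutes and the measured throughput in events/second, so the product is scaled by 60 and then capped by the configured output-size limit:

def size_events_per_job(target_minutes, events_per_sec, bytes_per_event,
                        max_output_bytes=5 * 1000**3):
    # events that fit into the target wall-clock time
    events = int(target_minutes * 60 * events_per_sec)
    # cap so the expected output stays below the size limit (default 5 GB)
    max_events = int(max_output_bytes / bytes_per_event) if bytes_per_event > 0 else 0
    if 0 < max_events < events:
        events = max_events
    return events

# e.g. a 150-minute target at 4 evt/s and 2 MB/event:
# size_events_per_job(150, 4, 2e6) -> 2500 (capped from 36000 by the 5 GB limit)
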
Example #2
    def executeInternal(self, *args):
        """The executeInternal method return 4 if the "completion" threshold is not reached, 0 otherwise"""
        self.stage = args[0]
        self.completion = int(args[1])
        self.prefix = args[2]

        self.setupLog()

        self.statusCacheInfo = {} #Will be filled with the status from the status cache

        self.readJobStatus()
        completed = set(self.completedJobs(stage=self.stage))
        if len(completed) < self.completion:
            return 4

        self.readProcessedJobs()
        unprocessed = completed - self.processedJobs
        estimates = copy.copy(unprocessed)
        self.logger.info("jobs remaining to process: {0}".format(", ".join(sorted(unprocessed))))
        if self.stage == 'tail' and len(estimates-set(self.failedJobs)) == 0:
            estimates = set(self.completedJobs(stage='processing'))
        self.logger.info("jobs remaining to process: %s", ", ".join(sorted(unprocessed)))

        # The TaskWorker saves some files that we now read back
        with open('datadiscovery.pkl', 'rb') as fd:
            dataset = pickle.load(fd) #Output from the discovery process
        with open('taskinformation.pkl', 'rb') as fd:
            task = pickle.load(fd) #A dictionary containing information about the task as in the Oracle DB
        with open('taskworkerconfig.pkl', 'rb') as fd:
            config = pickle.load(fd) #Task worker configuration

        # Read the automatic_splitting/throughputs/0-N files where the PJ
        # saved the EventThroughput (report['steps']['cmsRun']['performance']['cpu']['EventThroughput'])
        sumEventsThr = 0
        count = 0
        for jid in estimates:
            if jid in self.failedJobs:
                continue
            fn = "automatic_splitting/throughputs/{0}".format(jid)
            with open(fn) as fd:
                sumEventsThr += float(fd.read())
                count += 1
        eventsThr = sumEventsThr / count
        self.logger.info("average throughput for %s jobs: %s", count, eventsThr)
        runtime = task['tm_split_args'].get('seconds_per_job', -1)
        if self.stage == "processing":
            # Build in a 33% error margin in the runtime to not create too
            # many tails. This essentially moves the peak to lower
            # runtimes and cuts off less of the job distribution tail.
            target = int(0.75 * runtime)
        elif self.stage == 'tail':
            target = int(max(
                getattr(config.TaskWorker, 'automaticTailRuntimeMinimum', 45 * 60),
                getattr(config.TaskWorker, 'automaticTailRuntimeFraction', 0.2) * runtime
            ))
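        # `target` is in seconds and `eventsThr` in events/second, so no unit conversion is needed here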
        events = int(target * eventsThr)
        splitTask = dict(task)
        splitTask['tm_split_algo'] = 'EventAwareLumiBased'
        splitTask['tm_split_args']['events_per_job'] = events

        if self.stage == 'tail' and not self.adjustLumisForCompletion(splitTask, unprocessed):
            self.logger.info("nothing to process for completion")
            self.saveProcessedJobs(unprocessed)
            return 0

        # Disable retries for processing: every lumi is attempted once in
        # processing and up to three more times in the tails, i.e. four times
        # in total. That should be enough "retries"
        #
        # See note in DagmanCreator about getting this from the Task DB
        if self.stage == "processing":
            config.TaskWorker.numAutomJobRetries = 0

        try:
            config.TaskWorker.scratchDir = './scratchdir' # XXX
            splitter = Splitter(config, server=None, resturi='')
            split_result = splitter.execute(dataset, task=splitTask)
            self.logger.info("Splitting results:")
            for g in split_result.result[0]:
                msg = "Created jobgroup with length {0}".format(len(g.getJobs()))
                self.logger.info(msg)
        except TaskWorkerException as e:
            retmsg = "Splitting failed with:\n{0}".format(e)
            self.logger.error(retmsg)
#            self.set_dashboard_state('FAILED')
            return 1
        try:
            parent = self.prefix if self.stage == 'tail' else None
            creator = DagmanCreator(config, server=None, resturi='')
            creator.createSubdag(split_result.result, task=task, parent=parent, stage=self.stage)
            self.submitSubdag('RunJobs{0}.subdag'.format(self.prefix), getattr(config.TaskWorker, 'maxPost', 20), self.stage)
        except TaskWorkerException as e:
            retmsg = "DAG creation failed with:\n{0}".format(e)
            self.logger.error(retmsg)
#            self.set_dashboard_state('FAILED')
            return 1
        self.saveProcessedJobs(unprocessed)
        return 0
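
A self-contained sketch of the per-job throughput averaging done above; `average_throughput` is a hypothetical helper, the directory layout follows the snippets, and the JSON pair format is the one read in Examples #1 and #4:

import json
import os

def average_throughput(job_ids, failed_jobs, basedir="automatic_splitting/throughputs"):
    """Average [EventThroughput, bytes-per-event] over the non-failed jobs."""
    sum_thr = sum_size = 0.0
    count = 0
    for jid in job_ids:
        if jid in failed_jobs:
            continue
        with open(os.path.join(basedir, str(jid))) as fd:
            throughput, eventsize = json.load(fd)
        sum_thr += throughput
        sum_size += eventsize
        count += 1
    if count == 0:
        raise RuntimeError("no throughput reports found for the estimate jobs")
    return sum_thr / count, sum_size / count
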
Example #3
def handleNewTask(resthost, resturi, config, task, procnum, *args, **kwargs):
    """Performs the injection of a new task

    :arg str resthost: the hostname where the rest interface is running
    :arg str resturi: the rest base url to contact
    :arg WMCore.Configuration config: input configuration
    :arg TaskWorker.DataObjects.Task task: the task to work on
    :arg int procnum: the process number taking care of the work
    :args and kwargs: extra parameters, currently not defined
    :return: the handler."""
    server = HTTPRequests(resthost,
                          config.TaskWorker.cmscert,
                          config.TaskWorker.cmskey,
                          retry=20,
                          logger=logging.getLogger(str(procnum)))
    handler = TaskHandler(task,
                          procnum,
                          server,
                          config,
                          'handleNewTask',
                          createTempDir=True)
    handler.addWork(
        MyProxyLogon(config=config,
                     server=server,
                     resturi=resturi,
                     procnum=procnum,
                     myproxylen=60 * 60 * 24))
    handler.addWork(
        StageoutCheck(config=config,
                      server=server,
                      resturi=resturi,
                      procnum=procnum))
    if task['tm_job_type'] == 'Analysis':
        if task.get('tm_user_files'):
            handler.addWork(
                UserDataDiscovery(config=config,
                                  server=server,
                                  resturi=resturi,
                                  procnum=procnum))
        else:
            handler.addWork(
                DBSDataDiscovery(config=config,
                                 server=server,
                                 resturi=resturi,
                                 procnum=procnum))
    elif task['tm_job_type'] == 'PrivateMC':
        handler.addWork(
            MakeFakeFileSet(config=config,
                            server=server,
                            resturi=resturi,
                            procnum=procnum))
    handler.addWork(
        Splitter(config=config,
                 server=server,
                 resturi=resturi,
                 procnum=procnum))
    handler.addWork(
        DagmanCreator(config=config,
                      server=server,
                      resturi=resturi,
                      procnum=procnum))
    if task['tm_dry_run'] == 'T':
        handler.addWork(
            DryRunUploader(config=config,
                           server=server,
                           resturi=resturi,
                           procnum=procnum))
    else:
        handler.addWork(
            DagmanSubmitter(config=config,
                            server=server,
                            resturi=resturi,
                            procnum=procnum))

    return handler.actionWork(args, kwargs)
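
Example #3 builds the injection pipeline by appending actions to a handler and then running them in order. A stripped-down sketch of that composition pattern follows; `MiniHandler` and the `execute` signature are illustrative, not the real TaskHandler/action API:

class MiniHandler:
    """Collects work units and runs them sequentially, passing each
    action's result on to the next (a simplified stand-in for TaskHandler)."""

    def __init__(self, task):
        self.task = task
        self.actions = []

    def addWork(self, action):
        self.actions.append(action)

    def actionWork(self, *args, **kwargs):
        result = None
        for action in self.actions:
            # each action sees the previous result plus the task being handled
            result = action.execute(result, task=self.task)
        return result
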
Example #4
    def executeInternal(self, *args):
        """The executeInternal method return 4 if the "completion" threshold is not reached, 0 otherwise"""
        self.stage = args[0]
        self.completion = int(args[1])
        self.prefix = args[2]

        self.setupLog()

        self.statusCacheInfo = {
        }  #Will be filled with the status from the status cache

        self.readJobStatus()
        completed = set(self.completedJobs(stage=self.stage))
        if len(completed) < self.completion:
            return 4

        self.readProcessedJobs()
        unprocessed = completed - self.processedJobs
        estimates = copy.copy(unprocessed)
        self.logger.info("jobs remaining to process: %s",
                         ", ".join(sorted(unprocessed)))
        if self.stage == 'tail' and len(estimates - set(self.failedJobs)) == 0:
            estimates = set(
                self.completedJobs(stage='processing', processFailed=False))
        self.logger.info("jobs remaining to process: %s",
                         ", ".join(sorted(unprocessed)))

        # The TaskWorker saves some files that we now read back
        with open('datadiscovery.pkl', 'rb') as fd:
            dataset = pickle.load(fd)  #Output from the discovery process
        with open('taskinformation.pkl', 'rb') as fd:
            task = pickle.load(
                fd
            )  #A dictionary containing information about the task as in the Oracle DB
        with open('taskworkerconfig.pkl', 'rb') as fd:
            config = pickle.load(fd)  #Task worker configuration

        # need to use user proxy as credential for talking with cmsweb
        config.TaskWorker.cmscert = os.environ.get('X509_USER_PROXY')
        config.TaskWorker.cmskey = os.environ.get('X509_USER_PROXY')

        # need the global black list
        config.TaskWorker.scratchDir = './scratchdir'
        if not os.path.exists(config.TaskWorker.scratchDir):
            os.makedirs(config.TaskWorker.scratchDir)
        from TaskWorker.Actions.Recurring.BanDestinationSites import CRAB3BanDestinationSites
        banSites = CRAB3BanDestinationSites(config, 'dummy', 'dummy',
                                            self.logger)
        banSites.execute()

        # Read the automatic_splitting/throughputs/0-N files where the PJ
        # saved the EventThroughput
        # (report['steps']['cmsRun']['performance']['cpu']['EventThroughput'])
        # and the average size of the output per event
        sumEventsThr = 0
        sumEventsSize = 0
        count = 0
        for jid in estimates:
            if jid in self.failedJobs:
                continue
            fn = "automatic_splitting/throughputs/{0}".format(jid)
            with open(fn) as fd:
                throughput, eventsize = json.load(fd)
                sumEventsThr += throughput
                sumEventsSize += eventsize
                count += 1
        eventsThr = sumEventsThr / count
        eventsSize = sumEventsSize / count

        self.logger.info("average throughput for %s jobs: %s", count,
                         eventsThr)
        self.logger.info("average eventsize for %s jobs: %s", count,
                         eventsSize)

        maxSize = getattr(config.TaskWorker, 'automaticOutputSizeMaximum',
                          5 * 1000**3)
        maxEvents = (maxSize / eventsSize) if eventsSize > 0 else 0

        runtime = task['tm_split_args'].get('minutes_per_job', -1)
        if self.stage == "processing":
            # Build in a 33% error margin in the runtime to not create too
            # many tails. This essentially moves the peak to lower
            # runtimes and cuts off less of the job distribution tail.
            target = int(0.75 * runtime)
        elif self.stage == 'tail':
            target = int(
                max(
                    getattr(config.TaskWorker,
                            'automaticTailRuntimeMinimumMins', 45),
                    getattr(config.TaskWorker, 'automaticTailRuntimeFraction',
                            0.2) * runtime))
        # `target` is in minutes, `eventsThr` is in events/second!
        events = int(target * eventsThr * 60)
        if events > maxEvents and maxEvents > 0:
            self.logger.info(
                "reduced the target event count from %s to %s to obey output size",
                events, maxEvents)
            events = int(maxEvents)
        splitTask = dict(task)
        splitTask['tm_split_algo'] = 'EventAwareLumiBased'
        splitTask['tm_split_args']['events_per_job'] = events

        if self.stage == 'tail' and not self.adjustLumisForCompletion(
                splitTask, unprocessed):
            self.logger.info("nothing to process for completion")
            self.saveProcessedJobs(unprocessed)
            return 0

        # Disable retries for processing: every lumi is attempted once in
        # processing and up to three more times in the tails, i.e. four times
        # in total. That should be enough "retries"
        #
        # See note in DagmanCreator about getting this from the Task DB
        if self.stage == "processing":
            config.TaskWorker.numAutomJobRetries = 0

        try:
            splitter = Splitter(config, server=None, resturi='')
            split_result = splitter.execute(dataset, task=splitTask)
            self.logger.info("Splitting results:")
            for g in split_result.result[0]:
                msg = "Created jobgroup with length {0}".format(
                    len(g.getJobs()))
                self.logger.info(msg)
        except TaskWorkerException as e:
            retmsg = "Splitting failed with:\n{0}".format(e)
            self.logger.error(retmsg)
            #            self.set_dashboard_state('FAILED')
            return 1
        try:
            parent = self.prefix if self.stage == 'tail' else None
            creator = DagmanCreator(config, server=None, resturi='')
            creator.createSubdag(split_result.result,
                                 task=task,
                                 parent=parent,
                                 stage=self.stage)
            self.submitSubdag('RunJobs{0}.subdag'.format(self.prefix),
                              getattr(config.TaskWorker, 'maxPost', 20),
                              self.stage)
        except TaskWorkerException as e:
            retmsg = "DAG creation failed with:\n{0}".format(e)
            self.logger.error(retmsg)
            #            self.set_dashboard_state('FAILED')
            return 1
        self.saveProcessedJobs(unprocessed)
        return 0
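
For the tail stage, all three variants take the larger of a configured runtime floor and a fraction of the per-job runtime. A quick sketch with the defaults from Examples #1 and #4, expressed in minutes (Example #2 uses the same defaults in seconds); the function name is illustrative:

def tail_target_minutes(minutes_per_job, minimum_mins=45, fraction=0.2):
    # defaults mirror automaticTailRuntimeMinimumMins / automaticTailRuntimeFraction
    return int(max(minimum_mins, fraction * minutes_per_job))

# e.g. a 480-minute processing job gives a 96-minute tail target,
# while anything under 225 minutes falls back to the 45-minute floor.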