Example #1
def processWorker(inputs, results, resthost, resturi):
    """Wait for an reference to appear in the input queue, call the referenced object
       and write the output in the output queue.

       :arg Queue inputs: the queue where the inputs are shared by the master
       :arg Queue results: the queue where this method writes the output
       :return: zero by default, though the return value is not really needed."""
    logger = logging.getLogger()
    procName = multiprocessing.current_process().name
    while True:
        try:
            workid, work, task, inputargs = inputs.get()
        except (EOFError, IOError):
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logger.error(crashMessage)
            break
        if work == 'STOP':
            break

        outputs = None
        t0 = time.time()
        logger.debug("%s: Starting %s on %s" %(procName, str(work), task['tm_taskname']))
        try:
            msg = None
            outputs = work(resthost, resturi, WORKER_CONFIG, task, inputargs)
        except WorkerHandlerException as we:
            outputs = Result(task=task, err=str(we))
            msg = str(we)
        except Exception as exc:
            outputs = Result(task=task, err=str(exc))
            msg = "%s: I just had a failure for %s" % (procName, str(exc))
            msg += "\n\tworkid=" + str(workid)
            msg += "\n\ttask=" + str(task['tm_taskname'])
            msg += "\n" + str(traceback.format_exc())
Example #2
 def execute(self, *args, **kwargs):
     results = []
     for jgroup in args[0]:
         possiblesites = jgroup.jobs[0]['input_files'][0]['locations']
         self.logger.debug("Possible sites == " + str(possiblesites))
         if len(possiblesites) == 0:
             msg = "DLS retourned no sites for the block"
             self.logger.error(msg)
             results.append(
                 Result(task=kwargs['task'],
                        result=(jgroup, None, []),
                        err=msg))
             continue
         #use resubmit white/black lists if we have them
         siteWhitelist = kwargs['task']['tm_site_whitelist'] if not kwargs[
             'task']['resubmit_site_whitelist'] else kwargs['task'][
                 'resubmit_site_whitelist']
         siteBlacklist = kwargs['task']['tm_site_blacklist'] if not kwargs[
             'task']['resubmit_site_blacklist'] else kwargs['task'][
                 'resubmit_site_blacklist']
         self.logger.debug("white list == %s" % set(siteWhitelist))
         self.logger.debug("black list == %s" % set(siteBlacklist))
         availablesites = list(
             set(possiblesites)
             & set(siteWhitelist) if siteWhitelist else set(possiblesites) -
             set(siteBlacklist))
         self.logger.info('Available sites == %s' % str(availablesites))
         fixedsites = set(self.config.Sites.available)
         availablesites = list(set(availablesites) & fixedsites)
         if len(availablesites) == 0:
             msg = "No site available before brokering, will skip injection. Check White/Back lists"
             self.logger.error(msg)
             results.append(
                 Result(task=kwargs['task'],
                        result=(jgroup, None, []),
                        err=msg))
             continue
         self.logger.info(
             "Asking best site to PanDA between %s. Using %s as pandaserver."
             % (str(availablesites), self.pandaurls['baseURLSSL']))
         selectedsite = runBrokerage(
             self.pandaurls['baseURLSSL'],
             proxy=kwargs['task']['user_proxy'],
             sites=self.translateSiteName(availablesites))[-1]
         self.logger.info("Choosed site after brokering " +
                          str(selectedsite))
         if not selectedsite:
             msg = "No site available after brokering, will skip injection"
             self.logger.error(msg)
             results.append(
                 Result(task=kwargs['task'],
                        result=(jgroup, None, []),
                        err=msg))
             continue
         else:
             results.append(
                 Result(task=kwargs['task'],
                        result=(jgroup, selectedsite, availablesites)))
     return results
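The availablesites expression above leans on the conditional expression binding more loosely than the set operators & and -; a spelled-out equivalent (a sketch, the helper name is not from the source) reads:

def filterSites(possiblesites, siteWhitelist, siteBlacklist):
    """Explicit form of the whitelist/blacklist filtering above: a whitelist overrides the blacklist."""
    if siteWhitelist:
        return list(set(possiblesites) & set(siteWhitelist))
    return list(set(possiblesites) - set(siteBlacklist))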
Example #3
 def execute(self, resthost, resturi, config, task, procnum):
     try:
         self.logger.info("Executing %s" % task)
         self._execute(resthost, resturi, config, task)
         return Result(task=task['tm_taskname'], result="OK")
     except Exception as ex:
         self.logger.error("Error while runnig recurring action.")
         self.logger.exception(ex)
         return Result(task=task['tm_taskname'], result="KO")
 def execute(self, resthost, dbinstance, config, task, procnum):
     try:
         self.logger.info("Executing %s", task)
         self._execute(config, task)
         return Result(task=task['tm_taskname'], result="OK")
     except Exception as ex:
         self.logger.error("Error while runnig recurring action.")
         self.logger.exception(ex)
         return Result(task=task['tm_taskname'], err="RecurringAction FAILED")
Example #5
 def execute(self, *args, **kwargs):
     self.logger.info("Getting already existing specs ")
     status, pandaspecs = getFullJobStatus(
         self.backendurls['baseURLSSL'],
         ids=kwargs['task']['resubmit_ids'],
         proxy=kwargs['task']['user_proxy'])
     return Result(task=kwargs['task'], result=pandaspecs)
Example #6
    def executeInternal(self, *args, **kw):
        tempDir = args[0][0]
        inputFiles = args[0][3]
        splitterResult = args[0][4]

        cwd = os.getcwd()
        try:
            os.chdir(tempDir)
            splittingSummary = SplittingSummary(kw['task']['tm_split_algo'])
            for jobgroup in splitterResult:
                jobs = jobgroup.getJobs()
                splittingSummary.addJobs(jobs)
            splittingSummary.dump('splitting-summary.json')
            inputFiles.append('splitting-summary.json')

            self.packSandbox(inputFiles)

            self.logger.info('Uploading dry run tarball to the user file cache')
            ufc = UserFileCache(dict={'cert': kw['task']['user_proxy'], 'key': kw['task']['user_proxy'], 'endpoint': kw['task']['tm_cache_url']})
            result = ufc.uploadLog('dry-run-sandbox.tar.gz')
            os.remove('dry-run-sandbox.tar.gz')
            if 'hashkey' not in result:
                raise TaskWorkerException('Failed to upload dry-run-sandbox.tar.gz to the user file cache: ' + str(result))
            else:
                self.logger.info('Uploaded dry run tarball to the user file cache: ' + str(result))
                update = {'workflow': kw['task']['tm_taskname'], 'subresource': 'state', 'status': 'UPLOADED'}
                self.logger.debug('Updating task status: %s' % str(update))
                self.server.post(self.resturi, data=urllib.urlencode(update))

        finally:
            os.chdir(cwd)

        return Result(task=kw['task'], result=args[0])
Example #7
    def execute(self, *args, **kwargs):

        wmwork = Workflow(name=kwargs['task']['tm_taskname'])

        wmsubs = Subscription(fileset=args[0], workflow=wmwork,
                               split_algo=kwargs['task']['tm_split_algo'],
                               type=self.jobtypeMapper[kwargs['task']['tm_job_type']])
        splitter = SplitterFactory()
        jobfactory = splitter(subscription=wmsubs)
        splitparam = kwargs['task']['tm_split_args']
        splitparam['algorithm'] = kwargs['task']['tm_split_algo']
        factory = jobfactory(**splitparam)
        if len(factory) == 0:
            # No jobs could be created with the given splitting arguments and
            # input dataset information: NO IDEA WHY.
            # NB: we assume that if splitting can't happen, the task is failed
            msg = "Splitting %s on %s with %s does not generate any job" %(kwargs['task']['tm_taskname'],
                                                                           kwargs['task']['tm_input_dataset'],
                                                                           kwargs['task']['tm_split_algo'])
            self.logger.error("Setting %s as failed" % str(kwargs['task']['tm_taskname']))
            configreq = {'workflow': kwargs['task']['tm_taskname'],
                         'status': "FAILED",
                         'subresource': 'failure',
                         'failure': b64encode(msg)}
            self.server.post(self.resturl, data = urllib.urlencode(configreq))
            raise StopHandler(msg)
        return Result(task=kwargs['task'], result=factory)
Example #8
    def execute(self, *args, **kwargs):  #pylint: disable=unused-argument

        # since https://github.com/dmwm/CRABServer/issues/5633 totalunits can be a float
        # but that would confuse WMCore, therefore cast to int
        totalevents = int(kwargs['task']['tm_totalunits'])
        firstEvent = 1
        lastEvent = totalevents
        firstLumi = 1
        lastLumi = 10

        # Set a default of 100 events per lumi.  This is set as a task
        # property, as the splitting considers it independently of the file
        # information provided by the fake dataset.
        if not kwargs['task']['tm_events_per_lumi']:
            kwargs['task']['tm_events_per_lumi'] = 100

        #MC comes with only one MCFakeFile
        singleMCFileset = Fileset(name="MCFakeFileSet")
        newFile = File("MCFakeFile", size=1000, events=totalevents)
        newFile.setLocation(self.getListOfSites())
        newFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
        newFile["block"] = 'MCFakeBlock'
        newFile["first_event"] = firstEvent
        newFile["last_event"] = lastEvent
        singleMCFileset.addFile(newFile)

        return Result(task=kwargs['task'], result=singleMCFileset)
Example #9
 def execute(self, *args, **kwargs):
     result = None
     proxycfg = {'vo': kwargs['task']['tm_user_vo'],
                 'logger': self.logger,
                 'myProxySvr': self.config.Services.MyProxy,
                 'proxyValidity' : '144:0',
                 'min_time_left' : 36000, ## do we need this ? or should we use self.myproxylen? 
                 'userDN' : kwargs['task']['tm_user_dn'],
                 'group' : kwargs['task']['tm_user_group'] if kwargs['task']['tm_user_group'] else '',
                 'role' : kwargs['task']['tm_user_role'] if kwargs['task']['tm_user_role'] else '',
                 'server_key': self.config.MyProxy.serverhostkey,
                 'server_cert': self.config.MyProxy.serverhostcert,
                 'serverDN': self.config.MyProxy.serverdn,
                 'uisource': getattr(self.config.MyProxy, 'uisource', ''),
                 'credServerPath': self.config.MyProxy.credpath,
                 'myproxyAccount' : self.server['host'],
                 'cleanEnvironment' : getattr(self.config.MyProxy, 'cleanEnvironment', False)
                }
     proxy = Proxy(proxycfg)
     userproxy = proxy.getProxyFilename(serverRenewer=True)
     proxy.logonRenewMyProxy()
     timeleft = proxy.getTimeLeft(userproxy)
     if timeleft is None or timeleft <= 0:
         msg = "Impossible to retrieve proxy from %s for %s." % (proxycfg['myProxySvr'], proxycfg['userDN'])
         raise TaskWorkerException(msg)
     else:
         kwargs['task']['user_proxy'] = userproxy
         result = Result(task=kwargs['task'], result='OK')
     return result
Example #10
    def execute(self, *args, **kwargs):

        totalevents = kwargs['task']['tm_totalunits']
        firstEvent = 1
        lastEvent = totalevents
        firstLumi = 1
        lastLumi = 10

        # Set a default of 100 events per lumi.  This is set as a task
        # property, as the splitting considers it independently of the file
        # information provided by the fake dataset.
        if not kwargs['task']['tm_events_per_lumi']:
            kwargs['task']['tm_events_per_lumi'] = 100

        #MC comes with only one MCFakeFile
        singleMCFileset = Fileset(name = "MCFakeFileSet")
        newFile = File("MCFakeFile", size = 1000, events = totalevents)
        if hasattr(self.config.Sites, 'available'):
            newFile.setLocation(self.config.Sites.available)
        else:
            sbj = SiteDBJSON({"key":self.config.TaskWorker.cmskey,
                              "cert":self.config.TaskWorker.cmscert})
            newFile.setLocation(sbj.getAllCMSNames())
        newFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
        newFile["block"] = 'MCFackBlock'
        newFile["first_event"] = firstEvent
        newFile["last_event"] = lastEvent
        singleMCFileset.addFile(newFile)

        return Result(task=kwargs['task'], result=singleMCFileset)
Example #11
class Splitter(TaskAction):
    """Performing the split operation depending on the
       received input and arguments"""
    def execute(self, *args, **kwargs):
        wmwork = Workflow(name=kwargs['task']['tm_taskname'])

        wmsubs = Subscription(
            fileset=args[0],
            workflow=wmwork,
            split_algo=kwargs['task']['tm_split_algo'],
            type=self.jobtypeMapper[kwargs['task']['tm_job_type']])
        splitter = SplitterFactory()
        jobfactory = splitter(subscription=wmsubs)
        splitparam = kwargs['task']['tm_split_args']
        splitparam['algorithm'] = kwargs['task']['tm_split_algo']
        if kwargs['task']['tm_job_type'] == 'Analysis':
            if kwargs['task']['tm_split_algo'] == 'FileBased':
                splitparam['total_files'] = kwargs['task']['tm_totalunits']
            elif kwargs['task']['tm_split_algo'] == 'LumiBased':
                splitparam['total_lumis'] = kwargs['task']['tm_totalunits']
        elif kwargs['task']['tm_job_type'] == 'PrivateMC':
            if 'tm_events_per_lumi' in kwargs['task'] and kwargs['task'][
                    'tm_events_per_lumi']:
                splitparam['events_per_lumi'] = kwargs['task'][
                    'tm_events_per_lumi']
            if 'tm_generator' in kwargs['task'] and kwargs['task'][
                    'tm_generator'] == 'lhe':
                splitparam['lheInputFiles'] = True
        splitparam['applyLumiCorrection'] = True
        factory = jobfactory(**splitparam)
        if len(factory) == 0:
            raise TaskWorkerException("The CRAB3 server backend could not submit any job to the Grid scheduler:\n"+\
                        "splitting task %s on dataset %s with %s method does not generate any job")
        #printing duplicated lumis if any
        lumiChecker = getattr(jobfactory, 'lumiChecker', None)
        if lumiChecker and lumiChecker.splitLumiFiles:
            self.logger.warning(
                "The input dataset contains the following duplicated lumis %s"
                % lumiChecker.splitLumiFiles.keys())
            try:
                configreq = {
                    'subresource':
                    'addwarning',
                    'workflow':
                    kwargs['task']['tm_taskname'],
                    'warning':
                    b64encode(
                        'The CRAB3 server backend detected lumis split across files in the input dataset.'
                        ' Will apply the necessary corrections in the splitting algorithms'
                    )
                }
                self.server.post(self.restURInoAPI + '/task',
                                 data=urllib.urlencode(configreq))
            except Exception as e:
                self.logger.error(e.headers)
                self.logger.warning(
                    "Cannot add warning to REST after finding duplicates")

        return Result(task=kwargs['task'], result=factory)
Example #12
    def executeInternal(self, *args, **kw):
        inputFiles = args[0][2]
        splitterResult = args[0][3][0]

        cwd = os.getcwd()
        try:
            os.chdir(kw['tempDir'])
            splittingSummary = SplittingSummary(kw['task']['tm_split_algo'])
            for jobgroup in splitterResult:
                jobs = jobgroup.getJobs()
                splittingSummary.addJobs(jobs)
            splittingSummary.dump('splitting-summary.json')
            inputFiles.append('splitting-summary.json')

            self.packSandbox(inputFiles)

            self.logger.info(
                'Uploading dry run tarball to the user file cache')
            if 'S3' in kw['task']['tm_cache_url'].upper():
                uploadToS3(crabserver=self.crabserver,
                           filepath='dry-run-sandbox.tar.gz',
                           objecttype='runtimefiles',
                           taskname=kw['task']['tm_taskname'],
                           logger=self.logger)
                result = {
                    'hashkey': 'ok'
                }  # a dummy one to keep same semantics as when using UserFileCache
                os.remove('dry-run-sandbox.tar.gz')
            else:
                ufc = UserFileCache(
                    mydict={
                        'cert': kw['task']['user_proxy'],
                        'key': kw['task']['user_proxy'],
                        'endpoint': kw['task']['tm_cache_url']
                    })
                result = ufc.uploadLog('dry-run-sandbox.tar.gz')
                os.remove('dry-run-sandbox.tar.gz')
            if 'hashkey' not in result:
                raise TaskWorkerException(
                    'Failed to upload dry-run-sandbox.tar.gz to the user file cache: '
                    + str(result))
            self.logger.info(
                'Uploaded dry run tarball to the user file cache: %s',
                str(result))
            update = {
                'workflow': kw['task']['tm_taskname'],
                'subresource': 'state',
                'status': 'UPLOADED'
            }
            self.logger.debug('Updating task status: %s', str(update))
            self.crabserver.post(api='workflowdb',
                                 data=urllib.urlencode(update))

        finally:
            os.chdir(cwd)

        return Result(task=kw['task'], result=args[0])
Example #13
    def execute(self, *args, **kwargs):
        """ This Action does something useful in case the user did not specify a lumi mask. In this case the report command is meaningless as
            it will never report whether there are unanalyzed lumis. So, we build a lumimask starting from the information coming from the DBS discovery
            and we push it to the crab REST interface.
        """
        files = args[0]

        if not kwargs['task']['tm_split_args']['lumis'] and not kwargs['task'][
                'tm_split_args']['runs']:
            self.logger.info(
                "Reconstructing lumimask as the user did not specify it")
            lumilists = {}
            self.runs = []
            self.lumis = []

            #Take all the files and create one dict containing all the lumi to analyze
            for f in files:
                for run in f['runs']:
                    if run.run not in lumilists:
                        lumilists[run.run] = run.lumis
                    else:
                        lumilists[run.run] += run.lumis

            self.logger.debug("Lumilist reconstructed: %s" % lumilists)

            #Take the dict containing the lumilist (format: {run1 : [lumi1, lumi2 ...], run2 : [lumi1, lumi2 ...] ...}),
            #group the lumis in the same range (1,2,3,4 => [1,4]) and prepare the runs and lumis as expected by the REST. Example:
            #Input:
            #lumilist = {2L: [1L, 2L, 3L, 8L, 9L, 4L, 5L, 20L, 21L, 22L], 3L: [11L, 12L, 13L], 4L: [1L, 2L, 5L, 6L, 7L, 100L]}
            #Output:
            #runs = ['2', '3', '4']
            #lumis = ['1,5,8,9,20,22', '11,13', '1,2,5,7,100,100']
            for run in lumilists:
                self.runs.append(str(run))
                self.lumis.append(','.join([
                    (lambda currLumi=consLumis.next(), numConsLumi=sum(
                        1 for _ in consLumis): "%s,%s" %
                     (currLumi, currLumi + numConsLumi))()
                    for _, consLumis in groupby(
                        sorted(lumilists[run]),
                        lambda x, c=count(): c.next() - x)
                ]))

            configreq = {
                'workflow': kwargs['task']['tm_taskname'],
                'subresource': 'lumimask',
            }
            self.server.post(self.resturl,
                             data=urllib.urlencode(configreq) +
                             '&runs='.join([''] + self.runs) +
                             '&lumis='.join([''] + self.lumis))

        result = Result(task=kwargs['task'], result=files)
        return result
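The nested lambda/groupby expression above compacts each run's sorted lumi list into "start,end" pairs, as illustrated in the comment. A more readable equivalent (a sketch, not the code actually used) is:

from itertools import count, groupby

def compactLumis(lumis):
    """Collapse a lumi list into consecutive ranges rendered as 'start,end,start,end,...' (sketch)."""
    counter = count()
    ranges = []
    # consecutive lumis share the same (lumi - position) key once the list is sorted
    for _, group in groupby(sorted(lumis), key=lambda lumi: lumi - next(counter)):
        block = list(group)
        ranges.append("%s,%s" % (block[0], block[-1]))
    return ','.join(ranges)

# compactLumis([1, 2, 3, 8, 9, 4, 5, 20, 21, 22]) -> '1,5,8,9,20,22'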
Example #14
 def execute(self, *args, **kwargs):
     result = None
     proxycfg = {
         'vo':
         kwargs['task']['tm_user_vo'],
         'logger':
         self.logger,
         'myProxySvr':
         self.config.Services.MyProxy,
         'proxyValidity':
         '24:0',
         'min_time_left':
         36000,  ## do we need this ? or should we use self.myproxylen?
         'userDN':
         kwargs['task']['tm_user_dn'],
         'group':
         kwargs['task']['tm_user_group']
         if kwargs['task']['tm_user_group'] else '',
         'role':
         kwargs['task']['tm_user_role']
         if kwargs['task']['tm_user_role'] else '',
         'server_key':
         self.config.MyProxy.serverhostkey,
         'server_cert':
         self.config.MyProxy.serverhostcert,
         'serverDN':
         self.config.MyProxy.serverdn,
         'uisource':
         self.config.MyProxy.uisource,
         'credServerPath':
         self.config.MyProxy.credpath,
     }
     proxy = Proxy(proxycfg)
     userproxy = proxy.getProxyFilename(serverRenewer=True)
     proxy.logonRenewMyProxy()
     timeleft = proxy.getTimeLeft(userproxy)
     if timeleft is None or timeleft <= 0:
         msg = "Impossible to retrieve proxy from %s for %s." % (
             proxycfg['myProxySvr'], proxycfg['userDN'])
         self.logger.error("Setting %s as failed" %
                           str(kwargs['task']['tm_taskname']))
         configreq = {
             'workflow': kwargs['task']['tm_taskname'],
             'status': "FAILED",
             'subresource': 'failure',
             'failure': b64encode(msg)
         }
         self.logger.error(str(configreq))
         self.server.post(self.resturl, data=urllib.urlencode(configreq))
         raise StopHandler(msg)
     else:
         kwargs['task']['user_proxy'] = userproxy
         result = Result(task=kwargs['task'], result='OK')
     return result
Example #15
    def execute(self, *args, **kwargs):
        self.logger.info(
            "Data discovery and splitting for %s using user-provided files" %
            kwargs['task']['tm_taskname'])

        userfiles = kwargs['task']['tm_arguments'].get('userfiles')
        splitting = kwargs['task']['tm_split_algo']
        total_units = kwargs['task']['tm_totalunits']
        if not userfiles or splitting != 'FileBased':
            if not userfiles:
                msg = "No files specified to process for task %s." % kwargs[
                    'task']['tm_taskname']
            if splitting != 'FileBased':
                msg = "Data.splitting must be set to 'FileBased' when using a custom set of files."
            self.logger.error("Setting %s as failed: %s" %
                              (kwargs['task']['tm_taskname'], msg))
            configreq = {
                'workflow': kwargs['task']['tm_taskname'],
                'status': "FAILED",
                'subresource': 'failure',
                'failure': b64encode(msg)
            }
            self.server.post(self.resturi, data=urllib.urlencode(configreq))
            raise StopHandler(msg)

        if hasattr(self.config.Sites, 'available'):
            locations = self.config.Sites.available
        else:
            sbj = SiteDBJSON({
                "key": self.config.TaskWorker.cmskey,
                "cert": self.config.TaskWorker.cmscert
            })
            locations = sbj.getAllCMSNames()

        userFileset = Fileset(name=kwargs['task']['tm_taskname'])
        self.logger.info("There are %d files specified by the user." %
                         len(userfiles))
        if total_units > 0:
            self.logger.info("Will run over the first %d files." % total_units)
        file_counter = 0
        for userfile, idx in zip(userfiles, range(len(userfiles))):
            newFile = File(userfile, size=1000, events=1)
            newFile.setLocation(locations)
            newFile.addRun(Run(1, idx))
            newFile["block"] = 'UserFilesFakeBlock'
            newFile["first_event"] = 1
            newFile["last_event"] = 2
            userFileset.addFile(newFile)
            file_counter += 1
            if total_units > 0 and file_counter >= total_units:
                break

        return Result(task=kwargs['task'], result=userFileset)
Example #16
class DataDiscovery(TaskAction):
    """I am the abstract class for the data discovery.
       Taking care of generalizing different data discovery
       possibilities. Implementing only a common method to
       return a properly formatted output."""

    def formatOutput(self, task, requestname, datasetfiles, locations):
        """Receives as input the result of the data location
           discovery operations and fills up the WMCore objects."""
        self.logger.debug(" Formatting data discovery output ")
        # TEMPORARY
        secmsmap = {}
        sbj = SiteDBJSON({"key":self.config.MyProxy.serverhostkey,
                          "cert":self.config.MyProxy.serverhostcert})

        wmfiles = []
        lumicounter = evecounter = 0
        for lfn, infos in datasetfiles.iteritems():
            wmfile = File(lfn=lfn, events=infos['NumberOfEvents'], size=infos['Size'], checksums=infos['Checksums'])
            wmfile['block'] = infos['BlockName']
            wmfile['locations'] = []
            if locations.has_key(infos['BlockName']):
                for se in locations[infos['BlockName']]:
                    if se not in secmsmap:
                        self.logger.debug("Translating SE %s" %se)
                        try:
                            secmsmap[se] = sbj.seToCMSName(se)
                        except KeyError as ke:
                            self.logger.error("Impossible translating %s to a CMS name through SiteDB" %se)
                            secmsmap[se] = ''
                    if se in secmsmap:
                        if type(secmsmap[se]) == list:
                            wmfile['locations'].extend(secmsmap[se])
                        else:
                            wmfile['locations'].append(secmsmap[se])
                wmfile['workflow'] = requestname
                evecounter += infos['NumberOfEvents']
                for run, lumis in infos['Lumis'].iteritems():
                    #self.logger.debug(' - adding run %d and lumis %s' %(run, lumis))
                    wmfile.addRun(Run(run, *lumis))
                    lumicounter += len(lumis)
                wmfiles.append(wmfile)

        self.logger.debug('Tot events found: %d' %evecounter)
        self.logger.debug('Tot lumis found: %d' %lumicounter)
        self.logger.debug('Tot files found: %d' %len(wmfiles))

        return Result(task=task, result=Fileset(name='FilesToSplit', files = set(wmfiles)))
Example #17
 def actionWork(self, *args, **kwargs):
     """Performing the set of actions"""
     nextinput = args
     for work in self.getWorks():
         self.logger.debug("Starting %s on %s" % (str(work), self._task['tm_taskname']))
         t0 = time.time()
         try:
             output = work.execute(nextinput, task=self._task)
         except StopHandler as sh:
             msg = "Controlled stop of handler for %s on %s " % (self._task, str(sh))
             self.logger.error(msg)
             nextinput = Result(task=self._task, result='StopHandler exception received, controlled stop')
             break #exit normally. Worker will not notice there was an error
         except TaskWorkerException as twe:
             self.logger.debug(str(traceback.format_exc())) #print the stacktrace only in debug mode
             raise WorkerHandlerException(str(twe)) #TaskWorker error, do not add traceback to the error propagated to the REST
Example #18
    def execute(self, *args, **kwargs):
        self.logger.info(
            "Data discovery and splitting for %s using user-provided files" %
            kwargs['task']['tm_taskname'])

        userfiles = kwargs['task']['tm_user_files']
        splitting = kwargs['task']['tm_split_algo']
        total_units = kwargs['task']['tm_totalunits']
        if not userfiles or splitting != 'FileBased':
            if not userfiles:
                msg = "No files specified to process for task %s." % kwargs[
                    'task']['tm_taskname']
            if splitting != 'FileBased':
                msg = "Data.splitting must be set to 'FileBased' when using a custom set of files."
            raise TaskWorkerException(msg)

        if hasattr(self.config.Sites, 'available'):
            locations = self.config.Sites.available
        else:
            with self.config.TaskWorker.envForCMSWEB:
                configDict = {
                    "cacheduration": 1,
                    "pycurl": True
                }  # cache duration is in hours
                resourceCatalog = CRIC(logger=self.logger,
                                       configDict=configDict)
                locations = resourceCatalog.getAllPSNs()

        userFileset = Fileset(name=kwargs['task']['tm_taskname'])
        self.logger.info("There are %d files specified by the user." %
                         len(userfiles))
        if total_units > 0:
            self.logger.info("Will run over the first %d files." % total_units)
        file_counter = 0
        for userfile, idx in zip(userfiles, range(len(userfiles))):
            newFile = File(userfile, size=1000, events=1)
            newFile.setLocation(locations)
            newFile.addRun(Run(1, idx))
            newFile["block"] = 'UserFilesFakeBlock'
            newFile["first_event"] = 1
            newFile["last_event"] = 2
            userFileset.addFile(newFile)
            file_counter += 1
            if total_units > 0 and file_counter >= total_units:
                break

        return Result(task=kwargs['task'], result=userFileset)
Example #19
    def execute(self, *args, **kwargs):

        totalevents = kwargs['task']['tm_totalunits']
        firstEvent = 1
        lastEvent = totalevents
        firstLumi = 1
        lastLumi = 10

        #MC comes with only one MCFakeFile
        singleMCFileset = Fileset(name = "MCFakeFileSet")
        newFile = File("MCFakeFile", size = 1000, events = totalevents)
        newFile.setLocation(self.config.Sites.available)
        newFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
        newFile["block"] = 'MCFackBlock'
        newFile["first_event"] = firstEvent
        newFile["last_event"] = lastEvent
        singleMCFileset.addFile(newFile)

        return Result(task=kwargs['task'], result=singleMCFileset)
Example #20
 def actionWork(self, *args, **kwargs):
     """Performing the set of actions"""
     nextinput = args
     for work in self.getWorks():
         self.logger.debug("Starting %s on %s" %
                           (str(work), self._task['tm_taskname']))
         t0 = time.time()
         try:
             output = work.execute(nextinput, task=self._task)
         except StopHandler as sh:
             msg = "Controlled stop of handler for %s on %s " % (self._task,
                                                                 str(sh))
             self.logger.error(msg)
             nextinput = Result(
                 task=self._task,
                 result='StopHandler exception received, controlled stop')
             break
         except Exception as exc:
             msg = "Problem handling %s because of %s failure, traceback follows\n" % (
                 self._task['tm_taskname'], str(exc))
             msg += str(traceback.format_exc())
             self.logger.error(msg)
             raise WorkerHandlerException(msg)
Example #21
    def execute(self, *args, **kwargs):
        result = None
        proxycfg = {
            'vo':
            kwargs['task']['tm_user_vo'],
            'logger':
            self.logger,
            'myProxySvr':
            self.config.Services.MyProxy,
            'proxyValidity':
            '144:0',
            'min_time_left':
            36000,  ## do we need this ? or should we use self.myproxylen?
            'userName':
            kwargs['task']['tm_username'] + '_CRAB',
            'userDN':
            kwargs['task']['tm_user_dn'],
            'group':
            kwargs['task']['tm_user_group']
            if kwargs['task']['tm_user_group'] else '',
            'role':
            kwargs['task']['tm_user_role']
            if kwargs['task']['tm_user_role'] else '',
            'server_key':
            self.config.MyProxy.serverhostkey,
            'server_cert':
            self.config.MyProxy.serverhostcert,
            'serverDN':
            'dummy',  # this is only used inside WMCore/Proxy.py functions not used by CRAB
            'uisource':
            getattr(self.config.MyProxy, 'uisource', ''),
            'credServerPath':
            self.config.MyProxy.credpath,
            'cleanEnvironment':
            getattr(self.config.MyProxy, 'cleanEnvironment', False)
        }
        try:
            self.logger.info(
                "try first to retrieve credential with login name %s",
                proxycfg['userName'])
            (userproxy, usergroups) = self.tryProxyLogon(proxycfg=proxycfg)
        except TaskWorkerException:
            self.logger.error(
                "proxy retrieval from %s failed with login name %s.",
                proxycfg['myProxySvr'], proxycfg['userName'])
            self.logger.error("will try with old-style DN hash")
            del proxycfg['userName']
            try:
                (userproxy, usergroups) = self.tryProxyLogon(proxycfg=proxycfg)
            except TaskWorkerException as ex:
                self.logger.error(
                    "proxy retrieval from %s failed with DN hash as credential name.",
                    proxycfg['myProxySvr'])
                raise TaskWorkerException(str(ex))
        #  minimal sanity check. Submission will fail if there's no group
        if not usergroups:
            raise TaskWorkerException(
                'Could not retrieve VOMS groups list from %s' % userproxy)
        kwargs['task']['user_proxy'] = userproxy
        kwargs['task']['user_groups'] = usergroups
        self.logger.debug("Valid proxy for %s now in %s", proxycfg['userDN'],
                          userproxy)
        result = Result(task=kwargs['task'], result='OK')

        return result
Example #22
    def execute(self, *args, **kwargs):
        wmwork = Workflow(name=kwargs['task']['tm_taskname'])

        maxJobs = getattr(self.config.TaskWorker, 'maxJobsPerTask', 10000)

        data = args[0]
        splitparam = kwargs['task']['tm_split_args']
        splitparam['algorithm'] = kwargs['task']['tm_split_algo']
        if kwargs['task']['tm_job_type'] == 'Analysis':
            totalUnits = kwargs['task']['tm_totalunits']
            if kwargs['task']['tm_split_algo'] == 'FileBased':
                if totalUnits < 1.0:
                    totalUnits = int(totalUnits * len(data.getFiles()) + 0.5)
                splitparam['total_files'] = totalUnits
            elif kwargs['task']['tm_split_algo'] == 'LumiBased':
                if totalUnits < 1.0:
                    totalUnits = int(totalUnits * sum(len(run.lumis) for f in data.getFiles() for run in f['runs']) + 0.5)
                splitparam['total_lumis'] = totalUnits
            elif kwargs['task']['tm_split_algo'] == 'EventAwareLumiBased':
                if totalUnits < 1.0:
                    totalUnits = int(totalUnits * sum(f['events'] for f in data.getFiles()) + 0.5)
                splitparam['total_events'] = totalUnits
            elif kwargs['task']['tm_split_algo'] == 'Automatic':
                # REST backwards compatibility fix
                if 'seconds_per_job' in kwargs['task']['tm_split_args']:
                    kwargs['task']['tm_split_args']['minutes_per_job'] = kwargs['task']['tm_split_args'].pop('seconds_per_job')
                splitparam['algorithm'] = 'FileBased'
                splitparam['total_files'] = len(data.getFiles())
                numProbes = getattr(self.config.TaskWorker, 'numAutomaticProbes', 5)
                splitparam['files_per_job'] = (len(data.getFiles()) + numProbes - 1) // numProbes
        elif kwargs['task']['tm_job_type'] == 'PrivateMC':
            if 'tm_events_per_lumi' in kwargs['task'] and kwargs['task']['tm_events_per_lumi']:
                splitparam['events_per_lumi'] = kwargs['task']['tm_events_per_lumi']
            if 'tm_generator' in kwargs['task'] and kwargs['task']['tm_generator'] == 'lhe':
                splitparam['lheInputFiles'] = True
        splitparam['applyLumiCorrection'] = True

        wmsubs = Subscription(fileset=data, workflow=wmwork,
                               split_algo=splitparam['algorithm'],
                               type=self.jobtypeMapper[kwargs['task']['tm_job_type']])
        try:
            splitter = SplitterFactory()
            jobfactory = splitter(subscription=wmsubs)
            factory = jobfactory(**splitparam)
            numJobs = sum([len(jobgroup.getJobs()) for jobgroup in factory])
        except RuntimeError:
            msg = "The splitting on your task generated more than {0} jobs (the maximum).".format(maxJobs)
            raise TaskWorkerException(msg)
        if numJobs == 0:
            msg  = "The CRAB3 server backend could not submit any job to the Grid scheduler:"
            msg += " splitting task %s" % (kwargs['task']['tm_taskname'])
            if kwargs['task']['tm_input_dataset']:
                msg += " on dataset %s" % (kwargs['task']['tm_input_dataset'])
            msg += " with %s method does not generate any job. See\n" % (kwargs['task']['tm_split_algo'])
            msg += "https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3FAQ#crab_submit_fails_with_Splitting"
            raise TaskWorkerException(msg)
        elif numJobs > maxJobs:
            raise TaskWorkerException("The splitting on your task generated %s jobs. The maximum number of jobs in each task is %s" %
                                        (numJobs, maxJobs))

        minRuntime = getattr(self.config.TaskWorker, 'minAutomaticRuntimeMins', 180)
        if kwargs['task']['tm_split_algo'] == 'Automatic' and \
                kwargs['task']['tm_split_args']['minutes_per_job'] < minRuntime:
            msg = "Minimum runtime requirement for automatic splitting is {0} minutes.".format(minRuntime)
            raise TaskWorkerException(msg)

        #printing duplicated lumis if any
        lumiChecker = getattr(jobfactory, 'lumiChecker', None)
        if lumiChecker and lumiChecker.splitLumiFiles:
            self.logger.warning("The input dataset contains the following duplicated lumis %s", lumiChecker.splitLumiFiles.keys())
            msg = "The CRAB3 server backend detected lumis split across files in the input dataset."
            msg += " Will apply the necessary corrections in the splitting algorithm. You can ignore this message."
            self.uploadWarning(msg, kwargs['task']['user_proxy'], kwargs['task']['tm_taskname'])

        return Result(task = kwargs['task'], result = (factory, args[0]))
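The totalUnits handling above treats a value below 1.0 as a fraction of the available files, lumis or events, and anything else as an absolute count. A minimal sketch of that convention (the helper name and rounding detail are assumptions):

def resolveTotalUnits(totalUnits, available):
    """Map tm_totalunits to an absolute count: fractions scale the available units, rounded half up (sketch)."""
    if totalUnits < 1.0:
        return int(totalUnits * available + 0.5)
    return int(totalUnits)

# resolveTotalUnits(0.25, 1000) -> 250
# resolveTotalUnits(200, 1000)  -> 200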
Example #23
def processWorkerLoop(inputs, results, resthost, dbInstance, procnum, logger, logsDir):
    procName = "Process-%s" % procnum
    while True:
        try:
            ## Get (and remove) an item from the input queue. If the queue is empty, wait
            ## until an item is available. Item content is:
            ##  workid : an integer assigned by the queue module
            ##  work   : a function handler to the needed action e.g. function handleNewTask
            ##  task   : a task dictionary
            ##  failstatus : the status to assign to the task if work fails (e.g. 'SUBMITFAILED')
            workid, work, task, failstatus, inputargs = inputs.get()
            if work == 'STOP':
                break
            taskhandler = addTaskLogHandler(logger, task['tm_username'], task['tm_taskname'], logsDir)
        except (EOFError, IOError):
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logger.error(crashMessage)
            break

        outputs = None
        t0 = time.time()
        #log entry below is used for log parsing; changing it might require updating the logstash configuration
        logger.debug("%s: Starting %s on %s", procName, str(work), task['tm_taskname'])
        try:
            msg = None
            outputs = work(resthost, dbInstance, WORKER_CONFIG, task, procnum, inputargs)
        except TapeDatasetException as tde:
            outputs = Result(task=task, err=str(tde))
        except WorkerHandlerException as we:
            outputs = Result(task=task, err=str(we))
            msg = str(we)
        except Exception as exc: #pylint: disable=broad-except
            outputs = Result(task=task, err=str(exc))
            msg = "%s: I just had a failure for %s" % (procName, str(exc))
            msg += "\n\tworkid=" + str(workid)
            msg += "\n\ttask=" + str(task['tm_taskname'])
            msg += "\n" + str(traceback.format_exc())
        finally:
            if msg:
                crabserver = CRABRest(resthost, WORKER_CONFIG.TaskWorker.cmscert, WORKER_CONFIG.TaskWorker.cmskey,
                                      retry=20, logger=logger, userAgent='CRABTaskWorker')
                crabserver.setDbInstance(dbInstance)
                failTask(task['tm_taskname'], crabserver, msg, logger, failstatus)
        t1 = time.time()
        workType = task.get('tm_task_command', 'RECURRING')
        #log entry below is used for log parsing; changing it might require updating the logstash configuration
        logger.debug("%s: %s work on %s completed in %d seconds: %s", procName, workType, task['tm_taskname'], t1-t0, outputs)

        try:
            out, _, _ = executeCommand("ps u -p %s | awk '{sum=sum+$6}; END {print sum/1024}'" % os.getpid())
            msg = "RSS after finishing %s: %s MB" % (task['tm_taskname'], out.strip())
            logger.debug(msg)
        except Exception:
            logger.exception("Problem getting worker RSS:")

        removeTaskLogHandler(logger, taskhandler)

        results.put({
                     'workid': workid,
                     'out' : outputs
                    })
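The comments at the top of processWorkerLoop spell out the 5-tuple carried by each queue item. A hedged sketch of the producing and consuming side (the helper names and the default failstatus are illustrative, not from the source):

def submitWork(inputs, workid, handler, task, failstatus='SUBMITFAILED', inputargs=None):
    """Enqueue one item in the exact shape processWorkerLoop unpacks (sketch only)."""
    inputs.put((workid, handler, task, failstatus, inputargs or []))

def drainResults(results):
    """Collect whatever the workers have pushed so far, keyed by workid (sketch only)."""
    finished = {}
    while not results.empty():
        item = results.get()
        finished[item['workid']] = item['out']
    return finished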
Example #24
def processWorkerLoop(inputs, results, resthost, resturi, procnum, logger):
    procName = "Process-%s" % procnum
    while True:
        try:
            ## Get (and remove) an item from the input queue. If the queue is empty, wait
            ## until an item is available.
            workid, work, task, failstatus, inputargs = inputs.get()
            if work == 'STOP':
                break
            taskhandler = addTaskLogHandler(logger, task['tm_username'],
                                            task['tm_taskname'])
        except (EOFError, IOError):
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logger.error(crashMessage)
            break

        outputs = None
        t0 = time.time()
        logger.debug("%s: Starting %s on %s", procName, str(work),
                     task['tm_taskname'])
        try:
            msg = None
            outputs = work(resthost, resturi, WORKER_CONFIG, task, procnum,
                           inputargs)
        except TapeDatasetException as tde:
            outputs = Result(task=task, err=str(tde))
        except WorkerHandlerException as we:
            outputs = Result(task=task, err=str(we))
            msg = str(we)
        except Exception as exc:  #pylint: disable=broad-except
            outputs = Result(task=task, err=str(exc))
            msg = "%s: I just had a failure for %s" % (procName, str(exc))
            msg += "\n\tworkid=" + str(workid)
            msg += "\n\ttask=" + str(task['tm_taskname'])
            msg += "\n" + str(traceback.format_exc())
        finally:
            if msg:
                try:
                    logger.info("Uploading error message to REST: %s", msg)
                    server = HTTPRequests(resthost,
                                          WORKER_CONFIG.TaskWorker.cmscert,
                                          WORKER_CONFIG.TaskWorker.cmskey,
                                          retry=20,
                                          logger=logger)
                    truncMsg = truncateError(msg)
                    configreq = {
                        'workflow': task['tm_taskname'],
                        'status': failstatus,
                        'subresource': 'failure',
                        #limit the message to 7500 chars, which means no more than 10000 once encoded. That's the limit in the REST
                        'failure': b64encode(truncMsg)
                    }
                    server.post(resturi, data=urllib.urlencode(configreq))
                    logger.info(
                        "Error message successfully uploaded to the REST")
                except HTTPException as hte:
                    logger.warning(
                        "Cannot upload failure message to the REST for workflow %s. HTTP headers follows:",
                        task['tm_taskname'])
                    logger.error(hte.headers)
                except Exception as exc:  #pylint: disable=broad-except
                    logger.warning(
                        "Cannot upload failure message to the REST for workflow %s.\nReason: %s",
                        task['tm_taskname'], exc)
                    logger.exception('Traceback follows:')
        t1 = time.time()
        logger.debug("%s: ...work on %s completed in %d seconds: %s", procName,
                     task['tm_taskname'], t1 - t0, outputs)

        try:
            out, _, _ = executeCommand(
                "ps u -p %s | awk '{sum=sum+$6}; END {print sum/1024}'" %
                os.getpid())
            msg = "RSS after finishing %s: %s MB" % (task['tm_taskname'],
                                                     out.strip())
            logger.debug(msg)
        except:
            logger.exception("Problem getting worker RSS:")

        removeTaskLogHandler(logger, taskhandler)

        results.put({'workid': workid, 'out': outputs})
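truncateError is only called here and its implementation is not shown. Given the comment about the character limit on the REST side, a plausible sketch (an assumption; the real helper may differ) would be:

def truncateError(msg, limit=7500):
    """Trim an error message so it stays within the REST size limit mentioned above (sketch only)."""
    suffix = '\n[message truncated]'
    if len(msg) <= limit:
        return msg
    return msg[:limit - len(suffix)] + suffix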
Example #25
    def formatOutput(self, task, requestname, datasetfiles, locations,
                     tempDir):
        """
        Receives as input the result of the data location
        discovery operations and fills up the WMCore objects.
        """
        self.logger.debug(" Formatting data discovery output ")
        # TEMPORARY
        pnn_psn_map = {}
        sbj = SiteDBJSON({
            "key": self.config.TaskWorker.cmskey,
            "cert": self.config.TaskWorker.cmscert
        })

        wmfiles = []
        event_counter = 0
        lumi_counter = 0
        uniquelumis = set()
        datasetLumis = {}
        ## Loop over the sorted list of files.
        for lfn, infos in datasetfiles.iteritems():
            ## Skip the file if the block has not been found or has no locations.
            if not infos['BlockName'] in locations or not locations[
                    infos['BlockName']]:
                self.logger.warning(
                    "Skipping %s because its block (%s) has no locations" %
                    (lfn, infos['BlockName']))
                continue
            ## Skip the file if it is not in VALID state.
            if not infos.get('ValidFile', True):
                self.logger.warning("Skipping invalid file %s" % lfn)
                continue

            if task['tm_use_parent'] == 1 and len(infos['Parents']) == 0:
                raise TaskWorkerException(
                    "The CRAB3 server backend refuses to submit jobs to the Grid scheduler\n"
                    +
                    "because you specified useParents=True but some your files have no"
                    + "parents.\nExample: " + lfn)
            ## Create a WMCore File object.
            try:
                size = infos['FileSize']
                checksums = {
                    'Checksum': infos['Checksum'],
                    'Adler32': infos['Adler32'],
                    'Md5': infos['Md5']
                }
            except:
                #This is so that the task worker does not crash if an old version of WMCore is used (the interface of an API suddenly changed).
                # We may want to remove the try/except and the following two lines eventually, but keeping them for the moment so other devels won't be affected
                #See this WMCore commit: https://github.com/dmwm/WMCore/commit/2afc01ae571390f5fa009dd258be757adac89c28#diff-374b7a6640288184175057234e393e1cL204
                size = infos['Size']
                checksums = infos['Checksums']
            wmfile = File(lfn=lfn,
                          events=infos['NumberOfEvents'],
                          size=size,
                          checksums=checksums,
                          parents=infos['Parents'])
            wmfile['block'] = infos['BlockName']
            wmfile['locations'] = []
            for pnn in locations[infos['BlockName']]:
                if pnn and pnn not in pnn_psn_map:
                    self.logger.debug("Translating PNN %s" % pnn)
                    try:
                        pnn_psn_map[pnn] = sbj.PNNtoPSN(pnn)
                    except KeyError:
                        self.logger.error(
                            "Impossible translating %s to a CMS name through SiteDB"
                            % pnn)
                        pnn_psn_map[pnn] = ''
                    except httplib.HTTPException as ex:
                        self.logger.error("Couldn't map SE to site: %s" % pnn)
                        print("Couldn't map SE to site: %s" % pnn)
                        print("got problem: %s" % ex)
                        print("got another problem: %s" % ex.__dict__)
                if pnn and pnn in pnn_psn_map:
                    if isinstance(pnn_psn_map[pnn], list):
                        wmfile['locations'].extend(pnn_psn_map[pnn])
                    else:
                        wmfile['locations'].append(pnn_psn_map[pnn])
            wmfile['workflow'] = requestname
            event_counter += infos['NumberOfEvents']
            for run, lumis in infos['Lumis'].iteritems():
                datasetLumis.setdefault(run, []).extend(lumis)
                wmfile.addRun(Run(run, *lumis))
                for lumi in lumis:
                    uniquelumis.add((run, lumi))
                lumi_counter += len(lumis)
            wmfiles.append(wmfile)

        uniquelumis = len(uniquelumis)
        self.logger.debug('Tot events found: %d' % event_counter)
        self.logger.debug('Tot lumis found: %d' % uniquelumis)
        self.logger.debug('Duplicate lumis found: %d' %
                          (lumi_counter - uniquelumis))
        self.logger.debug('Tot files found: %d' % len(wmfiles))

        self.logger.debug(
            "Starting to create compact lumilists for input dataset")
        datasetLumiList = LumiList(runsAndLumis=datasetLumis)
        datasetLumis = datasetLumiList.getCompactList()
        datasetDuplicateLumis = datasetLumiList.getDuplicates().getCompactList(
        )
        self.logger.debug(
            "Finished to create compact lumilists for input dataset")
        with open(os.path.join(tempDir, "input_dataset_lumis.json"),
                  "w") as fd:
            json.dump(datasetLumis, fd)
        with open(os.path.join(tempDir, "input_dataset_duplicate_lumis.json"),
                  "w") as fd:
            json.dump(datasetDuplicateLumis, fd)

        return Result(task=task,
                      result=Fileset(name='FilesToSplit', files=set(wmfiles)))
Example #26
def processWorkerLoop(inputs, results, resthost, resturi, procnum, logger,
                      logsDir):
    procName = "Process-%s" % procnum
    while True:
        try:
            ## Get (and remove) an item from the input queue. If the queue is empty, wait
            ## until an item is available.
            workid, work, task, failstatus, inputargs = inputs.get()
            if work == 'STOP':
                break
            taskhandler = addTaskLogHandler(logger, task['tm_username'],
                                            task['tm_taskname'], logsDir)
        except (EOFError, IOError):
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logger.error(crashMessage)
            break

        outputs = None
        t0 = time.time()
        logger.debug("%s: Starting %s on %s", procName, str(work),
                     task['tm_taskname'])
        try:
            msg = None
            outputs = work(resthost, resturi, WORKER_CONFIG, task, procnum,
                           inputargs)
        except TapeDatasetException as tde:
            outputs = Result(task=task, err=str(tde))
        except WorkerHandlerException as we:
            outputs = Result(task=task, err=str(we))
            msg = str(we)
        except Exception as exc:  #pylint: disable=broad-except
            outputs = Result(task=task, err=str(exc))
            msg = "%s: I just had a failure for %s" % (procName, str(exc))
            msg += "\n\tworkid=" + str(workid)
            msg += "\n\ttask=" + str(task['tm_taskname'])
            msg += "\n" + str(traceback.format_exc())
        finally:
            if msg:
                server = HTTPRequests(resthost,
                                      WORKER_CONFIG.TaskWorker.cmscert,
                                      WORKER_CONFIG.TaskWorker.cmskey,
                                      retry=20,
                                      logger=logger)
                failTask(task['tm_taskname'], server, resturi, msg, logger,
                         failstatus)
        t1 = time.time()
        logger.debug("%s: ...work on %s completed in %d seconds: %s", procName,
                     task['tm_taskname'], t1 - t0, outputs)

        try:
            out, _, _ = executeCommand(
                "ps u -p %s | awk '{sum=sum+$6}; END {print sum/1024}'" %
                os.getpid())
            msg = "RSS after finishing %s: %s MB" % (task['tm_taskname'],
                                                     out.strip())
            logger.debug(msg)
        except:
            logger.exception("Problem getting worker RSS:")

        removeTaskLogHandler(logger, taskhandler)

        results.put({'workid': workid, 'out': outputs})
Example #27
    def formatOutput(self, task, requestname, datasetfiles, locations,
                     tempDir):
        """
        Receives as input the result of the data location
        discovery operations and fills up the WMCore objects.
        """
        self.logger.debug(" Formatting data discovery output ")

        wmfiles = []
        event_counter = 0
        lumi_counter = 0
        uniquelumis = set()
        datasetLumis = {}
        blocksWithNoLocations = set()
        ## Loop over the sorted list of files.
        configDict = {
            "cacheduration": 1,
            "pycurl": True
        }  # cache duration is in hours
        with tempSetLogLevel(logger=self.logger, level=logging.ERROR):
            resourceCatalog = CRIC(logger=self.logger, configDict=configDict)
        # can't afford one message from CRIC per file, unless critical!
        with tempSetLogLevel(logger=self.logger, level=logging.ERROR):
            for lfn, infos in datasetfiles.iteritems():
                ## Skip the file if it is not in VALID state.
                if not infos.get('ValidFile', True):
                    self.logger.warning("Skipping invalid file %s", lfn)
                    continue
                ## Skip the file if the block has not been found or has no locations.
                if not infos['BlockName'] in locations or not locations[
                        infos['BlockName']]:
                    self.logger.warning(
                        "Skipping %s because its block (%s) has no locations",
                        lfn, infos['BlockName'])
                    blocksWithNoLocations.add(infos['BlockName'])
                    continue
                if task['tm_use_parent'] == 1 and len(infos['Parents']) == 0:
                    self.logger.warning(
                        "Skipping %s because it has no parents", lfn)
                    continue
                ## Create a WMCore File object.
                size = infos['FileSize']
                checksums = {
                    'Checksum': infos['Checksum'],
                    'Adler32': infos['Adler32'],
                    'Md5': infos['Md5']
                }
                wmfile = File(lfn=lfn,
                              events=infos['NumberOfEvents'],
                              size=size,
                              checksums=checksums,
                              parents=infos['Parents'])
                wmfile['block'] = infos['BlockName']
                try:
                    wmfile['locations'] = resourceCatalog.PNNstoPSNs(
                        locations[wmfile['block']])
                except Exception as ex:
                    self.logger.error(
                        "Impossible translating %s to a CMS name through CMS Resource Catalog",
                        locations[wmfile['block']])
                    self.logger.error("got this exception:\n %s", ex)
                    raise
                wmfile['workflow'] = requestname
                event_counter += infos['NumberOfEvents']
                for run, lumis in infos['Lumis'].iteritems():
                    datasetLumis.setdefault(run, []).extend(lumis)
                    wmfile.addRun(Run(run, *lumis))
                    for lumi in lumis:
                        uniquelumis.add((run, lumi))
                    lumi_counter += len(lumis)
                wmfiles.append(wmfile)

        if blocksWithNoLocations:
            msg = "%d blocks will be skipped because are not completely replicated on DISK: %s" % (
                len(blocksWithNoLocations), list(blocksWithNoLocations))
            self.logger.warning(msg)
            self.uploadWarning(msg, task['user_proxy'], task['tm_taskname'])

        uniquelumis = len(uniquelumis)
        self.logger.debug('Tot events found: %d', event_counter)
        self.logger.debug('Tot lumis found: %d', uniquelumis)
        self.logger.debug('Duplicate lumis found: %d',
                          (lumi_counter - uniquelumis))
        self.logger.debug('Tot files found: %d', len(wmfiles))

        self.logger.debug(
            "Starting to create compact lumilists for input dataset")
        datasetLumiList = LumiList(runsAndLumis=datasetLumis)
        datasetLumis = datasetLumiList.getCompactList()
        datasetDuplicateLumis = datasetLumiList.getDuplicates().getCompactList()
        self.logger.debug(
            "Finished creating compact lumilists for input dataset")
        with open(os.path.join(tempDir, "input_dataset_lumis.json"),
                  "w") as fd:
            json.dump(datasetLumis, fd)
        with open(os.path.join(tempDir, "input_dataset_duplicate_lumis.json"),
                  "w") as fd:
            json.dump(datasetDuplicateLumis, fd)

        return Result(task=task,
                      result=Fileset(name='FilesToSplit', files=set(wmfiles)))
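
For reference, the two main inputs consumed by the loop above look roughly like the structures below. The field names are the ones the code actually reads; the concrete dataset, block, file and site names are invented for illustration.

# Hypothetical input shapes for formatOutput(); all values are made up.
datasetfiles = {
    '/store/data/ExampleRun/ExamplePD/MINIAOD/v1/file1.root': {
        'ValidFile': True,
        'BlockName': '/ExamplePD/ExampleRun-v1/MINIAOD#0000-aaaa',
        'Parents': [],
        'FileSize': 2147483648,
        'Checksum': '123456789',
        'Adler32': 'deadbeef',
        'Md5': 'NOTSET',
        'NumberOfEvents': 50000,
        'Lumis': {297050: [1, 2, 3], 297051: [10, 11]},  # run number -> list of lumi sections
    },
}
# block name -> list of storage node names (PNNs) hosting a complete replica
locations = {
    '/ExamplePD/ExampleRun-v1/MINIAOD#0000-aaaa': ['T2_CH_CERN', 'T1_US_FNAL_Disk'],
}
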
Ejemplo n.º 28
0
def processWorker(inputs, results, resthost, resturi, procnum):
    """Wait for an reference to appear in the input queue, call the referenced object
       and write the output in the output queue.

       :arg Queue inputs: the queue where the inputs are shared by the master
       :arg Queue results: the queue where this method writes the output
       :return: default returning zero, but not really needed."""
    logger = setProcessLogger(str(procnum))
    logger.info("Process %s is starting. PID %s", procnum, os.getpid())
    procName = "Process-%s" % procnum
    while True:
        try:
            ## Get (and remove) an item from the input queue. If the queue is empty, wait
            ## until an item is available.
            workid, work, task, failstatus, inputargs = inputs.get()
        except (EOFError, IOError):
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logger.error(crashMessage)
            break
        if work == 'STOP':
            break

        outputs = None
        t0 = time.time()
        logger.debug("%s: Starting %s on %s" %
                     (procName, str(work), task['tm_taskname']))
        try:
            msg = None
            outputs = work(resthost, resturi, WORKER_CONFIG, task, procnum,
                           inputargs)
        except WorkerHandlerException as we:
            outputs = Result(task=task, err=str(we))
            msg = str(we)
        except Exception as exc:
            outputs = Result(task=task, err=str(exc))
            msg = "%s: I just had a failure for %s" % (procName, str(exc))
            msg += "\n\tworkid=" + str(workid)
            msg += "\n\ttask=" + str(task['tm_taskname'])
            msg += "\n" + str(traceback.format_exc())
        finally:
            if msg:
                try:
                    logger.info("Uploading error message to REST: %s" % msg)
                    server = HTTPRequests(resthost,
                                          WORKER_CONFIG.TaskWorker.cmscert,
                                          WORKER_CONFIG.TaskWorker.cmskey,
                                          retry=2)
                    truncMsg = truncateError(msg)
                    configreq = {
                        'workflow': task['tm_taskname'],
                        'status': failstatus,
                        'subresource': 'failure',
                        #limit the message to 7500 chars, which means no more than 10000 once encoded. That's the limit in the REST
                        'failure': b64encode(truncMsg)
                    }
                    server.post(resturi, data=urllib.urlencode(configreq))
                    logger.info(
                        "Error message successfully uploaded to the REST")
                except HTTPException as hte:
                    logger.warning(
                        "Cannot upload failure message to the REST for workflow %s. HTTP headers follow:",
                        task['tm_taskname'])
                    logger.error(hte.headers)
                except Exception as exc:
                    logger.warning(
                        "Cannot upload failure message to the REST for workflow %s.\nReason: %s",
                        task['tm_taskname'], exc)
                    logger.exception('Traceback follows:')
        t1 = time.time()
        logger.debug("%s: ...work on %s completed in %d seconds: %s" %
                     (procName, task['tm_taskname'], t1 - t0, outputs))

        results.put({'workid': workid, 'out': outputs})
    logger.debug("Slave %s exiting." % procnum)
    return 0
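
A minimal sketch of how a master process might drive this worker, assuming the same queue protocol (a `(workid, work, task, failstatus, inputargs)` tuple per item plus the `'STOP'` sentinel). The handler, the task dictionary and the REST host/URI below are placeholders, and the module-level pieces the worker relies on (`setProcessLogger`, `WORKER_CONFIG`, `HTTPRequests`, `Result`, ...) must of course be importable for the real function to run.

import multiprocessing

def dummyWork(resthost, resturi, config, task, procnum, inputargs):
    # Stand-in for a real TaskWorker action handler.
    return {'status': 'OK', 'task': task['tm_taskname']}

if __name__ == '__main__':
    inputs = multiprocessing.Queue()
    results = multiprocessing.Queue()
    slave = multiprocessing.Process(
        target=processWorker,
        args=(inputs, results, 'cmsweb.cern.ch', '/crabserver/prod/workflowdb', 0))
    slave.start()
    task = {'tm_taskname': '170101_000000:user_example'}
    inputs.put((1, dummyWork, task, 'SUBMITFAILED', {}))  # one unit of work
    inputs.put((2, 'STOP', None, None, None))             # sentinel: worker leaves its loop
    print(results.get()['out'])
    slave.join()
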
Ejemplo n.º 29
0
    def formatOutput(self, task, requestname, datasetfiles, locations,
                     tempDir):
        """
        Receives as input the result of the data location
        discovery operations and fills up the WMCore objects.
        """
        self.logger.debug(" Formatting data discovery output ")

        wmfiles = []
        event_counter = 0
        lumi_counter = 0
        uniquelumis = set()
        datasetLumis = {}
        blocksWithNoLocations = set()
        ## Loop over the sorted list of files.
        configDict = {
            "cacheduration": 1,
            "pycurl": True
        }  # cache duration is in hours
        with tempSetLogLevel(logger=self.logger, level=logging.ERROR):
            resourceCatalog = CRIC(logger=self.logger, configDict=configDict)
        # can't afford one message from CRIC per file, unless critical!
        with tempSetLogLevel(logger=self.logger, level=logging.ERROR):
            for lfn, infos in datasetfiles.iteritems():
                ## Skip the file if it is not in VALID state.
                if not infos.get('ValidFile', True):
                    self.logger.warning("Skipping invalid file %s", lfn)
                    continue
                ## Skip the file if the block has not been found or has no locations.
                if not infos['BlockName'] in locations or not locations[
                        infos['BlockName']]:
                    self.logger.warning(
                        "Skipping %s because its block (%s) has no locations",
                        lfn, infos['BlockName'])
                    blocksWithNoLocations.add(infos['BlockName'])
                    continue

                if task['tm_use_parent'] == 1 and len(infos['Parents']) == 0:
                    raise TaskWorkerException(
                        "The CRAB3 server backend refuses to submit jobs to the Grid scheduler\n"
                        "because you specified useParents=True but some of your files have no "
                        "parents.\nExample: " + lfn)
                ## Create a WMCore File object.
                try:
                    size = infos['FileSize']
                    checksums = {
                        'Checksum': infos['Checksum'],
                        'Adler32': infos['Adler32'],
                        'Md5': infos['Md5']
                    }
                except KeyError:
                    # Fallback so the task worker does not crash if an old version of WMCore is used
                    # (the interface of this API changed at some point). We may want to remove the
                    # try/except and the following two lines eventually, but keep them for the moment
                    # so other developers are not affected.
                    # See this WMCore commit: https://github.com/dmwm/WMCore/commit/2afc01ae571390f5fa009dd258be757adac89c28#diff-374b7a6640288184175057234e393e1cL204
                    size = infos['Size']
                    checksums = infos['Checksums']
                wmfile = File(lfn=lfn,
                              events=infos['NumberOfEvents'],
                              size=size,
                              checksums=checksums,
                              parents=infos['Parents'])
                wmfile['block'] = infos['BlockName']
                try:
                    wmfile['locations'] = resourceCatalog.PNNstoPSNs(
                        locations[wmfile['block']])
                except Exception as ex:
                    self.logger.error(
                        "Impossible translating %s to a CMS name through CMS Resource Catalog",
                        locations[wmfile['block']])
                    self.logger.error("got this exception:\n %s", ex)
                    raise
                wmfile['workflow'] = requestname
                event_counter += infos['NumberOfEvents']
                for run, lumis in infos['Lumis'].iteritems():
                    datasetLumis.setdefault(run, []).extend(lumis)
                    wmfile.addRun(Run(run, *lumis))
                    for lumi in lumis:
                        uniquelumis.add((run, lumi))
                    lumi_counter += len(lumis)
                wmfiles.append(wmfile)

        if blocksWithNoLocations:
            msg = "%d blocks will be skipped because are not completely replicated on DISK: %s" % (
                len(blocksWithNoLocations), list(blocksWithNoLocations))
            self.logger.warning(msg)
            self.uploadWarning(msg, task['user_proxy'], task['tm_taskname'])

        uniquelumis = len(uniquelumis)
        self.logger.debug('Tot events found: %d', event_counter)
        self.logger.debug('Tot lumis found: %d', uniquelumis)
        self.logger.debug('Duplicate lumis found: %d',
                          (lumi_counter - uniquelumis))
        self.logger.debug('Tot files found: %d', len(wmfiles))

        self.logger.debug(
            "Starting to create compact lumilists for input dataset")
        datasetLumiList = LumiList(runsAndLumis=datasetLumis)
        datasetLumis = datasetLumiList.getCompactList()
        datasetDuplicateLumis = datasetLumiList.getDuplicates().getCompactList()
        self.logger.debug(
            "Finished creating compact lumilists for input dataset")
        with open(os.path.join(tempDir, "input_dataset_lumis.json"),
                  "w") as fd:
            json.dump(datasetLumis, fd)
        with open(os.path.join(tempDir, "input_dataset_duplicate_lumis.json"),
                  "w") as fd:
            json.dump(datasetDuplicateLumis, fd)

        return Result(task=task,
                      result=Fileset(name='FilesToSplit', files=set(wmfiles)))
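
The compact-lumilist step at the end can be exercised on its own. Below is a small sketch using the same `LumiList` calls the code relies on; the import path is the usual WMCore one and is stated here as an assumption, the run/lumi numbers are invented, and the outputs shown in comments indicate the expected shape rather than verified values.

from WMCore.DataStructs.LumiList import LumiList  # assumed import path

runsAndLumis = {
    1: [1, 2, 3, 2],   # lumi 2 listed twice, e.g. because it was split across two files
    2: [10, 11, 12],
}
lumiList = LumiList(runsAndLumis=runsAndLumis)
print(lumiList.getCompactList())                  # expected shape: {'1': [[1, 3]], '2': [[10, 12]]}
print(lumiList.getDuplicates().getCompactList())  # expected shape: {'1': [[2, 2]]}
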
Ejemplo n.º 30
0
    def execute(self, *args, **kwargs):
        wmwork = Workflow(name=kwargs['task']['tm_taskname'])

        wmsubs = Subscription(
            fileset=args[0],
            workflow=wmwork,
            split_algo=kwargs['task']['tm_split_algo'],
            type=self.jobtypeMapper[kwargs['task']['tm_job_type']])
        splitter = SplitterFactory()
        jobfactory = splitter(subscription=wmsubs)
        splitparam = kwargs['task']['tm_split_args']
        splitparam['algorithm'] = kwargs['task']['tm_split_algo']
        if kwargs['task']['tm_job_type'] == 'Analysis':
            if kwargs['task']['tm_split_algo'] == 'FileBased':
                splitparam['total_files'] = kwargs['task']['tm_totalunits']
            elif kwargs['task']['tm_split_algo'] == 'LumiBased':
                splitparam['total_lumis'] = kwargs['task']['tm_totalunits']
            elif kwargs['task']['tm_split_algo'] == 'EventAwareLumiBased':
                splitparam['total_events'] = kwargs['task']['tm_totalunits']
        elif kwargs['task']['tm_job_type'] == 'PrivateMC':
            if 'tm_events_per_lumi' in kwargs['task'] and kwargs['task'][
                    'tm_events_per_lumi']:
                splitparam['events_per_lumi'] = kwargs['task'][
                    'tm_events_per_lumi']
            if 'tm_generator' in kwargs['task'] and kwargs['task'][
                    'tm_generator'] == 'lhe':
                splitparam['lheInputFiles'] = True
        splitparam['applyLumiCorrection'] = True
        factory = jobfactory(**splitparam)
        numJobs = sum([len(jobgroup.getJobs()) for jobgroup in factory])
        maxJobs = getattr(self.config.TaskWorker, 'maxJobsPerTask', 10000)
        if numJobs == 0:
            msg = "The CRAB3 server backend could not submit any job to the Grid scheduler:"
            msg += " Splitting task %s" % (kwargs['task']['tm_taskname'])
            if kwargs['task']['tm_input_dataset']:
                msg += " on dataset %s" % (kwargs['task']['tm_input_dataset'])
            msg += " with %s method does not generate any job" % (
                kwargs['task']['tm_split_algo'])
            raise TaskWorkerException(msg)
        elif numJobs > maxJobs:
            raise TaskWorkerException(
                "The splitting on your task generated %s jobs. The maximum number of jobs in each task is %s"
                % (numJobs, maxJobs))
        #printing duplicated lumis if any
        lumiChecker = getattr(jobfactory, 'lumiChecker', None)
        if lumiChecker and lumiChecker.splitLumiFiles:
            self.logger.warning(
                "The input dataset contains the following duplicated lumis %s",
                lumiChecker.splitLumiFiles.keys())
            #TODO use self.uploadWarning
            try:
                userServer = HTTPRequests(self.server['host'],
                                          kwargs['task']['user_proxy'],
                                          kwargs['task']['user_proxy'])
                configreq = {
                    'subresource':
                    'addwarning',
                    'workflow':
                    kwargs['task']['tm_taskname'],
                    'warning':
                    b64encode(
                        'The CRAB3 server backend detected lumis split across files in the input dataset.'
                        ' Will apply the necessary corrections in the splitting algorithms. You can ignore this message.'
                    )
                }
                userServer.post(self.restURInoAPI + '/task',
                                data=urllib.urlencode(configreq))
            except HTTPException as hte:
                self.logger.error(hte.headers)
                self.logger.warning(
                    "Cannot add warning to REST after finding duplicates")

        return Result(task=kwargs['task'], result=factory)
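
To make the parameter plumbing above concrete, this is roughly what `splitparam` ends up holding for a hypothetical LumiBased Analysis task before it is handed to `jobfactory(**splitparam)`. Every task value below is invented, and `lumis_per_job` is assumed to be the knob carried in `tm_split_args` for this algorithm.

# Hypothetical task settings, for illustration only.
task = {
    'tm_taskname': '170101_000000:user_example',
    'tm_job_type': 'Analysis',
    'tm_split_algo': 'LumiBased',
    'tm_split_args': {'lumis_per_job': 50},
    'tm_totalunits': 1000,
}

# The original code aliases tm_split_args and mutates it in place; a copy is used here for clarity.
splitparam = dict(task['tm_split_args'])
splitparam['algorithm'] = task['tm_split_algo']
if task['tm_job_type'] == 'Analysis' and task['tm_split_algo'] == 'LumiBased':
    splitparam['total_lumis'] = task['tm_totalunits']
splitparam['applyLumiCorrection'] = True
# -> {'lumis_per_job': 50, 'algorithm': 'LumiBased', 'total_lumis': 1000, 'applyLumiCorrection': True}
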