Example #1
0
    def testAddOutput(self):
        """
        _testAddOutput_

        Tests the addOutput functionality of the DataStructs Workflow.

        Two (output, merged output) fileset pairs are registered under the
        "out1" identifier and a single fileset, with no merged fileset
        argument, under "out2".  The test then checks the number and content
        of the entries stored in the workflow's outputMap.
        """
        filesetA = Fileset(name="filesetA")
        filesetB = Fileset(name="filesetB")
        filesetC = Fileset(name="filesetC")

        testWorkflow = Workflow(spec="test", owner="mnorman")
        testWorkflow.addOutput("out1", filesetA, filesetB)
        testWorkflow.addOutput("out1", filesetB, filesetA)
        testWorkflow.addOutput("out2", filesetC)

        self.assertEqual(len(testWorkflow.outputMap["out1"]), 2,
                         "Error: There should be two mappings for out1.")
        # Only one addOutput call was made for "out2", so exactly one mapping
        # is expected; the failure message now says so.
        self.assertEqual(len(testWorkflow.outputMap["out2"]), 1,
                         "Error: There should be one mapping for out2.")

        self.assertIn({"output_fileset": filesetA,
                       "merged_output_fileset": filesetB},
                      testWorkflow.outputMap["out1"],
                      "Error: Fileset A should be in the output map.")
        self.assertIn({"output_fileset": filesetB,
                       "merged_output_fileset": filesetA},
                      testWorkflow.outputMap["out1"],
                      "Error: Fileset B should be in the output map.")

        out2Mapping = testWorkflow.outputMap["out2"][0]
        self.assertEqual(filesetC, out2Mapping["output_fileset"],
                         "Error: Fileset C should be in the output map.")
        self.assertIsNone(out2Mapping["merged_output_fileset"],
                          "Error: The merged output should be None.")
        return
Example #2
0
    def __init__(
        self,
        spec=None,
        owner="unknown",
        dn="unknown",
        group="unknown",
        owner_vogroup="DEFAULT",
        owner_vorole="DEFAULT",
        name=None,
        task=None,
        wfType=None,
        id=-1,
    ):
        """
        Initialise both base classes and record the workflow id.

        Every argument except ``id`` is forwarded by keyword to
        ``WMWorkflow.__init__``.  When ``self.dn`` still equals "unknown"
        after that call, it is replaced by ``owner``.  ``id`` is stored on
        ``self.id``.
        """
        WMBSBase.__init__(self)

        # Keyword arguments handed unchanged to the WMWorkflow constructor.
        workflowArgs = dict(
            spec=spec,
            owner=owner,
            dn=dn,
            group=group,
            owner_vogroup=owner_vogroup,
            owner_vorole=owner_vorole,
            name=name,
            task=task,
            wfType=wfType,
        )
        WMWorkflow.__init__(self, **workflowArgs)

        # Fall back to the owner when no real DN is available.
        if self.dn == "unknown":
            self.dn = owner

        self.id = id
Example #3
0
    def __init__(self,
                 spec=None,
                 owner="unknown",
                 dn="unknown",
                 group="unknown",
                 owner_vogroup="DEFAULT",
                 owner_vorole="DEFAULT",
                 name=None,
                 task=None,
                 wfType=None,
                 id=-1,
                 alternativeFilesetClose=False,
                 priority=None):
        """
        Initialise both base classes and store the WMBS-specific attributes.

        All arguments except ``id`` and ``alternativeFilesetClose`` are
        forwarded by keyword to ``WMWorkflow.__init__``.  When ``self.dn``
        still equals "unknown" after that call, it is replaced by ``owner``.
        ``id`` and ``alternativeFilesetClose`` are stored on the instance
        under the same names.
        """
        WMBSBase.__init__(self)

        # Keyword arguments handed unchanged to the WMWorkflow constructor.
        workflowArgs = dict(
            spec=spec,
            owner=owner,
            dn=dn,
            group=group,
            owner_vogroup=owner_vogroup,
            owner_vorole=owner_vorole,
            name=name,
            task=task,
            wfType=wfType,
            priority=priority,
        )
        WMWorkflow.__init__(self, **workflowArgs)

        # Fall back to the owner when no real DN is available.
        if self.dn == "unknown":
            self.dn = owner

        self.id = id
        self.alternativeFilesetClose = alternativeFilesetClose
Example #4
0
 def makeWorkflow(self):
     """
     _makeWorkflow_

     Build a DataStructsWorkflow, set its ``task`` attribute to the value
     returned by ``self.getPathName()`` and return it.

     """
     wmbsWorkflow = DataStructsWorkflow()
     taskPath = self.getPathName()
     wmbsWorkflow.task = taskPath
     return wmbsWorkflow
Example #5
0
    def makeWorkflow(self):
        """
        _makeWorkflow_

        Build a DataStructsWorkflow, set its ``task`` attribute to the value
        returned by ``self.getPathName()`` and return it.

        """
        wmbsWorkflow = DataStructsWorkflow()
        taskPath = self.getPathName()
        wmbsWorkflow.task = taskPath
        return wmbsWorkflow
Example #6
0
    def __init__(self, spec = None, owner = None, dn = None, group = None,
                 owner_vogroup = '', owner_vorole = '',
                 name = None, task = None, wfType = None, id = -1):
        """
        Initialise both base classes and record the workflow id.

        Every argument except ``id`` is forwarded by keyword to
        ``WMWorkflow.__init__``.  When ``self.dn`` is falsy after that call,
        it is replaced by ``owner``.  ``id`` is stored on ``self.id``.
        """
        WMBSBase.__init__(self)

        # Keyword arguments handed unchanged to the WMWorkflow constructor.
        workflowArgs = {
            "spec": spec,
            "owner": owner,
            "dn": dn,
            "group": group,
            "owner_vogroup": owner_vogroup,
            "owner_vorole": owner_vorole,
            "name": name,
            "task": task,
            "wfType": wfType,
        }
        WMWorkflow.__init__(self, **workflowArgs)

        # Fall back to the owner when no DN is set.
        if not self.dn:
            self.dn = owner

        self.id = id
Example #7
0
    def setUp(self):
        """
        _setUp_

        Build two filesets and wrap each one in a "FileBased" Processing
        subscription; both subscriptions share one Workflow.

        - multipleFileFileset ("TestFileset1"): ten UUID-named files, each
          passed to setLocation with 'blenheim' and then 'malpaquet'.
        - singleFileFileset ("TestFileset2"): one file, "/some/file/name",
          passed to setLocation with 'blenheim'.
        """
        self.multipleFileFileset = Fileset(name="TestFileset1")
        for _ in range(10):
            multiFile = File(makeUUID(), size=1000, events=100)
            for site in ('blenheim', 'malpaquet'):
                multiFile.setLocation(site)
            self.multipleFileFileset.addFile(multiFile)

        self.singleFileFileset = Fileset(name="TestFileset2")
        loneFile = File("/some/file/name", size=1000, events=100)
        loneFile.setLocation('blenheim')
        self.singleFileFileset.addFile(loneFile)

        # Settings common to both subscriptions.
        sharedArgs = {"workflow": Workflow(),
                      "split_algo": "FileBased",
                      "type": "Processing"}
        self.multipleFileSubscription = Subscription(
            fileset=self.multipleFileFileset, **sharedArgs)
        self.singleFileSubscription = Subscription(
            fileset=self.singleFileFileset, **sharedArgs)
Example #8
0
    def __init__(self, spec = None, owner = "unknown", dn = "unknown",
                 group = "unknown", owner_vogroup = "DEFAULT",
                 owner_vorole = "DEFAULT", name = None, task = None,
                 wfType = None, id = -1, alternativeFilesetClose = False):
        """
        Initialise both base classes and store the WMBS-specific attributes.

        All arguments except ``id`` and ``alternativeFilesetClose`` are
        forwarded by keyword to ``WMWorkflow.__init__``.  When ``self.dn``
        still equals "unknown" after that call, it is replaced by ``owner``.
        ``id`` and ``alternativeFilesetClose`` are stored on the instance
        under the same names.
        """
        WMBSBase.__init__(self)

        # Keyword arguments handed unchanged to the WMWorkflow constructor.
        workflowArgs = {
            "spec": spec,
            "owner": owner,
            "dn": dn,
            "group": group,
            "owner_vogroup": owner_vogroup,
            "owner_vorole": owner_vorole,
            "name": name,
            "task": task,
            "wfType": wfType,
        }
        WMWorkflow.__init__(self, **workflowArgs)

        # Fall back to the owner when no real DN is available.
        if self.dn == "unknown":
            self.dn = owner

        self.id = id
        self.alternativeFilesetClose = alternativeFilesetClose
Example #9
0
    def execute(self, *args, **kwargs):
        """
        Run the job splitter for a task.

        ``args[0]`` is used as the fileset of the Subscription and
        ``kwargs['task']`` supplies the ``tm_taskname``, ``tm_split_algo``,
        ``tm_job_type`` and ``tm_split_args`` entries.  The ``tm_split_args``
        mapping is updated in place with an 'algorithm' key before being
        expanded into the job factory call.

        Returns a Result built from the task and the splitter output.  When
        the splitter output has length zero, a FAILED status is posted to
        ``self.resturl`` and StopHandler is raised instead.
        """
        task = kwargs['task']
        taskName = task['tm_taskname']
        splitAlgo = task['tm_split_algo']

        wmwork = Workflow(name=taskName)
        wmsubs = Subscription(fileset=args[0],
                              workflow=wmwork,
                              split_algo=splitAlgo,
                              type=self.jobtypeMapper[task['tm_job_type']])
        jobfactory = SplitterFactory()(subscription=wmsubs)

        splitparam = task['tm_split_args']
        splitparam['algorithm'] = splitAlgo
        factory = jobfactory(**splitparam)

        if len(factory) != 0:
            return Result(task=task, result=factory)

        # No job could be created from the splitting arguments and the input
        # dataset information: the task is marked as failed.
        msg = "Splitting %s on %s with %s does not generate any job" % (
            taskName, task['tm_input_dataset'], splitAlgo)
        self.logger.error("Setting %s as failed" % str(taskName))
        configreq = {'workflow': taskName,
                     'status': "FAILED",
                     'subresource': 'failure',
                     'failure': b64encode(msg)}
        self.server.post(self.resturl, data = urllib.urlencode(configreq))
        raise StopHandler(msg)
Example #10
0
    def generateFakeMCFile(self,
                           numEvents=100,
                           firstEvent=1,
                           lastEvent=100,
                           firstLumi=1,
                           lastLumi=10,
                           existingSub=None):
        """
        Create the single fake MC file and attach it to a subscription.

        The file "MCFakeFileTest" gets ``numEvents`` events, location 'se01',
        a Run(1, ...) built from the lumi range and the ``first_event`` /
        ``last_event`` entries.  When ``existingSub`` is None a new
        "EventBased" Production subscription over a new fileset is returned;
        otherwise the file is added to ``existingSub['fileset']`` and
        ``existingSub`` is returned.
        """
        # MC comes with only one MCFakeFile
        fakeFile = File("MCFakeFileTest", size=1000, events=numEvents)
        fakeFile.setLocation('se01')

        # The range includes lastLumi only when firstLumi == lastLumi; in all
        # other cases lastLumi itself is excluded.
        # NOTE(review): looks asymmetric, kept exactly as the original logic.
        lumiStop = lastLumi + 1 if firstLumi == lastLumi else lastLumi
        fakeFile.addRun(Run(1, *range(firstLumi, lumiStop)))

        fakeFile["first_event"] = firstEvent
        fakeFile["last_event"] = lastEvent

        if existingSub is not None:
            existingSub['fileset'].addFile(fakeFile)
            return existingSub

        mcFileset = Fileset(name="MCTestFileset")
        mcFileset.addFile(fakeFile)
        mcWorkflow = Workflow()
        return Subscription(fileset=mcFileset,
                            workflow=mcWorkflow,
                            split_algo="EventBased",
                            type="Production")
Example #11
0
    def processDataset(self):
        """
        _processDataset_

        Import the Dataset contents and create a set of jobs from it.

        The files of ``self.inputDataset()`` are listed through the DBS
        reader, wrapped in File objects that carry their block name and the
        locations reported for that block, and split with the 'FileBased'
        algorithm using ``self.splitSize`` as files_per_job.

        Returns a list with one JobDefinition per job produced by the
        splitter.
        """

        #  //
        # // Now create the job definitions
        #//
        logging.debug("SplitSize = %s", self.splitSize)
        logging.debug("AllowedSites = %s", self.allowedSites)
        thefiles = Fileset(name='FilesToSplit')
        reader = DBSReader(self.dbsUrl)
        fileList = reader.dbs.listFiles(
            analysisDataset=self.inputDataset(),
            retriveList=['retrive_block', 'retrive_run'])

        # Block name -> locations, so each block is queried only once.
        blocks = {}

        for f in fileList:
            block = f['Block']['Name']
            # `in` replaces dict.has_key, which does not exist in Python 3.
            if block not in blocks:
                blocks[block] = reader.listFileBlockLocation(block)
            blockLocations = blocks[block]
            f['Block']['StorageElementList'].extend(blockLocations)
            wmbsFile = File(f['LogicalFileName'])
            for location in blockLocations:
                wmbsFile['locations'].add(location)
            wmbsFile['block'] = block
            thefiles.addFile(wmbsFile)

        work = Workflow()
        subs = Subscription(fileset=thefiles,
                            workflow=work,
                            split_algo='FileBased',
                            type="Processing")
        splitter = SplitterFactory()
        jobfactory = splitter(subs)

        jobs = jobfactory(files_per_job=self.splitSize)

        jobDefs = []
        for job in jobs.jobs:
            jobDef = JobDefinition()
            jobDef['LFNS'].extend(job.listLFNs())
            # Each job processes its files completely: skip nothing, no
            # event limit.
            jobDef['SkipEvents'] = 0
            jobDef['MaxEvents'] = -1
            for jobFile in job.listFiles():
                jobDef['SENames'].extend(jobFile['locations'])
            jobDefs.append(jobDef)

        return jobDefs
Example #12
0
    def execute(self, *args, **kwargs):
        """
        Run the job splitter for a task and report split lumis.

        ``args[0]`` is used as the fileset of the Subscription and
        ``kwargs['task']`` supplies the ``tm_*`` entries read below.  The
        ``tm_split_args`` mapping is updated in place with the algorithm name,
        job-type specific options and ``applyLumiCorrection`` before being
        expanded into the job factory call.

        Raises TaskWorkerException when the splitter output has length zero.
        When the job factory exposes a ``lumiChecker`` with a non-empty
        ``splitLumiFiles``, a warning is logged and posted to the REST
        interface on a best-effort basis.
        """
        task = kwargs['task']
        wmwork = Workflow(name=task['tm_taskname'])

        wmsubs = Subscription(
            fileset=args[0],
            workflow=wmwork,
            split_algo=task['tm_split_algo'],
            type=self.jobtypeMapper[task['tm_job_type']])
        splitter = SplitterFactory()
        jobfactory = splitter(subscription=wmsubs)
        splitparam = task['tm_split_args']
        splitparam['algorithm'] = task['tm_split_algo']
        if task['tm_job_type'] == 'Analysis':
            if task['tm_split_algo'] == 'FileBased':
                splitparam['total_files'] = task['tm_totalunits']
            elif task['tm_split_algo'] == 'LumiBased':
                splitparam['total_lumis'] = task['tm_totalunits']
        elif task['tm_job_type'] == 'PrivateMC':
            if 'tm_events_per_lumi' in task and task['tm_events_per_lumi']:
                splitparam['events_per_lumi'] = task['tm_events_per_lumi']
            if 'tm_generator' in task and task['tm_generator'] == 'lhe':
                splitparam['lheInputFiles'] = True
        splitparam['applyLumiCorrection'] = True
        factory = jobfactory(**splitparam)
        if len(factory) == 0:
            # The placeholders of this message were previously never filled
            # in; the dataset entry is looked up defensively so a missing key
            # cannot mask the real error.
            inputDataset = None
            if 'tm_input_dataset' in task:
                inputDataset = task['tm_input_dataset']
            raise TaskWorkerException(
                "The CRAB3 server backend could not submit any job to the Grid scheduler:\n"
                "splitting task %s on dataset %s with %s method does not generate any job"
                % (task['tm_taskname'], inputDataset, task['tm_split_algo']))
        #printing duplicated lumis if any
        lumiChecker = getattr(jobfactory, 'lumiChecker', None)
        if lumiChecker and lumiChecker.splitLumiFiles:
            self.logger.warning(
                "The input dataset contains the following duplicated lumis %s",
                lumiChecker.splitLumiFiles.keys())
            try:
                configreq = {
                    'subresource': 'addwarning',
                    'workflow': task['tm_taskname'],
                    'warning': b64encode(
                        'The CRAB3 server backend detected lumis split across files in the input dataset.'
                        ' Will apply the necessary corrections in the splitting algorithms'
                    )
                }
                self.server.post(self.restURInoAPI + '/task',
                                 data=urllib.urlencode(configreq))
            except Exception as e:
                # Best effort: failing to store the warning must not fail the
                # task.  Not every exception carries `headers`, so fall back
                # to the exception itself.
                self.logger.error(getattr(e, 'headers', e))
                self.logger.warning(
                    "Cannot add warning to REST after finding duplicates")
        # NOTE(review): the visible code ends here, so the method returns None
        # on success - confirm whether a result object should be returned.
Example #13
0
    def setUp(self):
        """
        _setUp_

        Store a default-constructed Workflow on ``self.testWorkflow``.
        """
        self.testWorkflow = Workflow()
Example #14
0
    def setUp(self):
        """
        _setUp_

        Build four filesets and one "FixedDelay" Processing subscription per
        fileset, all sharing a single Workflow:

        - multipleFileFileset ("TestFileset1"): ten files, file ``i`` in run
          ``i`` with lumi ``45 + i``.
        - singleFileFileset ("TestFileset2"): one file in run 1, lumi 45.
        - multipleFileLumiset ("TestFileset3"): ten files in run 1, file
          ``i`` with lumi ``45 + i // 3``, so groups of three consecutive
          files share a lumi.
        - singleLumiFileset ("TestFileset4"): ten files, all in run 1,
          lumi 45.
        """
        self.multipleFileFileset = Fileset(name="TestFileset1")
        for i in range(10):
            newFile = File(makeUUID(), size=1000, events=100)
            newFile.addRun(Run(i, *[45 + i]))
            self.multipleFileFileset.addFile(newFile)

        self.singleFileFileset = Fileset(name="TestFileset2")
        newFile = File("/some/file/name", size=1000, events=100)
        newFile.addRun(Run(1, *[45]))
        self.singleFileFileset.addFile(newFile)

        self.multipleFileLumiset = Fileset(name="TestFileset3")
        for i in range(10):
            newFile = File(makeUUID(), size=1000, events=100)
            # Floor division keeps the lumi number an integer on Python 3 as
            # well; plain `/` would produce values such as 45.333.
            newFile.addRun(Run(1, *[45 + i // 3]))
            self.multipleFileLumiset.addFile(newFile)

        self.singleLumiFileset = Fileset(name="TestFileset4")
        for i in range(10):
            newFile = File(makeUUID(), size=1000, events=100)
            newFile.addRun(Run(1, *[45]))
            self.singleLumiFileset.addFile(newFile)

        testWorkflow = Workflow()
        self.multipleFileSubscription = Subscription(
            fileset=self.multipleFileFileset,
            workflow=testWorkflow,
            split_algo="FixedDelay",
            type="Processing")
        self.singleFileSubscription = Subscription(
            fileset=self.singleFileFileset,
            workflow=testWorkflow,
            split_algo="FixedDelay",
            type="Processing")
        self.multipleLumiSubscription = Subscription(
            fileset=self.multipleFileLumiset,
            workflow=testWorkflow,
            split_algo="FixedDelay",
            type="Processing")
        self.singleLumiSubscription = Subscription(
            fileset=self.singleLumiFileset,
            workflow=testWorkflow,
            split_algo="FixedDelay",
            type="Processing")

        return
Example #15
0
    def setUp(self):
        """
        _setUp_

        Build three filesets and one Processing subscription per fileset, all
        sharing a single Workflow:

        - multipleFileFileset ("TestFileset1"): ten UUID-named files created
          with locations {"somese.cern.ch"}; "SizeBased" subscription.
        - singleFileFileset ("TestFileset2"): one file, "/some/file/name",
          created with the same locations; "SizeBased" subscription.
        - multipleSiteFileset ("TestFileset3"): five files created with
          locations {"somese.cern.ch"} and then passed to setLocation with
          the same site, plus five files passed to setLocation with the list
          ["somese.cern.ch", "otherse.cern.ch"]; "EventBased" subscription.
        """
        primarySite = "somese.cern.ch"
        secondarySite = "otherse.cern.ch"

        self.multipleFileFileset = Fileset(name="TestFileset1")
        for _ in range(10):
            self.multipleFileFileset.addFile(
                File(makeUUID(), size=1000, events=100,
                     locations={primarySite}))

        self.singleFileFileset = Fileset(name="TestFileset2")
        self.singleFileFileset.addFile(
            File("/some/file/name", size=1000, events=100,
                 locations={primarySite}))

        self.multipleSiteFileset = Fileset(name="TestFileset3")
        for _ in range(5):
            oneSiteFile = File(makeUUID(), size=1000, events=100,
                               locations={primarySite})
            oneSiteFile.setLocation(primarySite)
            self.multipleSiteFileset.addFile(oneSiteFile)
        for _ in range(5):
            twoSiteFile = File(makeUUID(), size=1000, events=100)
            twoSiteFile.setLocation([primarySite, secondarySite])
            self.multipleSiteFileset.addFile(twoSiteFile)

        sharedWorkflow = Workflow()
        # (attribute name, fileset, splitting algorithm), in creation order.
        subscriptionPlan = (
            ("multipleFileSubscription", self.multipleFileFileset, "SizeBased"),
            ("singleFileSubscription", self.singleFileFileset, "SizeBased"),
            ("multipleSiteSubscription", self.multipleSiteFileset, "EventBased"),
        )
        for attrName, fileset, algo in subscriptionPlan:
            setattr(self, attrName,
                    Subscription(fileset=fileset,
                                 workflow=sharedWorkflow,
                                 split_algo=algo,
                                 type="Processing"))
Example #16
0
    def testProductionRunNumber(self):
        """
        _testProductionRunNumber_

        Verify that jobs created by production subscriptions have the correct
        run number in their job mask.  Also verify that non-production
        subscriptions don't have modified run numbers.
        """
        testWorkflow = Workflow(spec="spec.pkl",
                                owner="Steve",
                                name="TestWorkflow",
                                task="TestTask")

        testFileset = Fileset(name="TestFileset")
        testFileset.addFile(File(lfn="someLFN"))
        testFileset.commit()

        def splitAs(subscriptionType):
            """Run a JobFactory over a FileBased subscription of that type."""
            subscription = Subscription(fileset=testFileset,
                                        workflow=testWorkflow,
                                        split_algo="FileBased",
                                        type=subscriptionType)
            return JobFactory(subscription=subscription)()

        # Production jobs: FirstRun and LastRun are both expected to be 1.
        productionGroups = splitAs("Production")
        self.assertTrue(len(productionGroups) > 0)
        for jobGroup in productionGroups:
            self.assertTrue(len(jobGroup.jobs) > 0)
            for job in jobGroup.jobs:
                mask = job["mask"]
                self.assertEqual(mask["FirstRun"], 1,
                                 "Error: First run is wrong.")
                self.assertEqual(mask["LastRun"], 1,
                                 "Error: Last run is wrong.")

        # Processing jobs: the run entries of the mask are expected to be None.
        for jobGroup in splitAs("Processing"):
            for job in jobGroup.jobs:
                mask = job["mask"]
                self.assertEqual(mask["FirstRun"], None,
                                 "Error: First run is wrong.")
                self.assertEqual(mask["LastRun"], None,
                                 "Error: Last run is wrong.")
    def setUp(self):
        """
        _setUp_

        Store a default-constructed Workflow on ``self.testWorkflow`` and a
        dictionary of performance parameters on ``self.performanceParams``.
        """
        self.testWorkflow = Workflow()
        self.performanceParams = dict(timePerEvent=12,
                                      memoryRequirement=2300,
                                      sizePerEvent=400)
Example #18
0
    def setUp(self):
        """
        _setUp_

        Build three filesets and one "EventBased" Processing subscription per
        fileset, all sharing a single Workflow:

        - multipleFileFileset ("TestFileset1"): ten UUID-named files with
          100 events each, setLocation('se01').
        - singleFileFileset ("TestFileset2"): "/some/file/name" with 100
          events, setLocation('se02').
        - emptyFileFileset ("TestFileset3"): "/some/file/name" with 0
          events, setLocation('se03').

        Also stores ``eventsPerJob`` and ``performanceParams`` on the
        instance.
        """
        self.multipleFileFileset = Fileset(name="TestFileset1")
        for _ in range(10):
            generated = File(makeUUID(), size=1000, events=100)
            generated.setLocation('se01')
            self.multipleFileFileset.addFile(generated)

        self.singleFileFileset = Fileset(name="TestFileset2")
        populated = File("/some/file/name", size=1000, events=100)
        populated.setLocation('se02')
        self.singleFileFileset.addFile(populated)

        self.emptyFileFileset = Fileset(name="TestFileset3")
        eventless = File("/some/file/name", size=1000, events=0)
        eventless.setLocation('se03')
        self.emptyFileFileset.addFile(eventless)

        sharedWorkflow = Workflow()
        # (attribute name, fileset), in creation order.
        subscriptionPlan = (
            ("multipleFileSubscription", self.multipleFileFileset),
            ("singleFileSubscription", self.singleFileFileset),
            ("emptyFileSubscription", self.emptyFileFileset),
        )
        for attrName, fileset in subscriptionPlan:
            setattr(self, attrName,
                    Subscription(fileset=fileset,
                                 workflow=sharedWorkflow,
                                 split_algo="EventBased",
                                 type="Processing"))

        self.eventsPerJob = 100
        self.performanceParams = dict(timePerEvent=None,
                                      memoryRequirement=2300,
                                      sizePerEvent=400)
Example #19
0
 def generateFakeMCFile(self, numEvents = 100, firstEvent = 1,
                        lastEvent = 100, firstLumi = 1, lastLumi = 10):
     """
     Build an "EventBased" Production subscription over one fake MC file.

     The file "MCFakeFileTest" gets ``numEvents`` events, location 'se01',
     a Run(1, ...) holding the lumis firstLumi..lastLumi (inclusive) and
     the ``first_event`` / ``last_event`` entries.  Returns the new
     Subscription.
     """
     #MC comes with only one MCFakeFile
     mcFileset = Fileset(name = "MCTestFileset")
     fakeFile = File("MCFakeFileTest", size = 1000, events = numEvents)
     fakeFile.setLocation('se01')
     lumiNumbers = range(firstLumi, lastLumi + 1)
     fakeFile.addRun(Run(1, *lumiNumbers))
     fakeFile["first_event"] = firstEvent
     fakeFile["last_event"] = lastEvent
     mcWorkflow = Workflow()
     mcFileset.addFile(fakeFile)
     return Subscription(fileset = mcFileset,
                         workflow = mcWorkflow,
                         split_algo = "EventBased",
                         type = "Production")
Example #20
0
    def setUp(self):
        """
        _setUp_

        Build two filesets and wrap each one in a "FileBased" Processing
        subscription; both subscriptions share one Workflow.

        - multipleFileFileset ("TestFileset1"): ten UUID-named files, each
          passed to setLocation with 'blenheim' and 'malpaquet'; file ``i``
          receives runs numbered ``i`` with lumis starting at ``i * 100``.
        - singleFileFileset ("TestFileset2"): "/some/file/name" at
          'blenheim' with run 13 and lumis 50-59 and 70-79.

        Also stores ``performanceParams`` on the instance.
        """
        self.multipleFileFileset = Fileset(name="TestFileset1")
        for i in range(10):
            multiFile = File(makeUUID(), size=1000, events=100)
            for site in ('blenheim', 'malpaquet'):
                multiFile.setLocation(site)
            lumis = []
            for offset in range(20):
                lumis.append((i * 100) + offset)
                # NOTE(review): addRun is called on every pass with the
                # growing lumi list, i.e. twenty times per file - confirm
                # this is intended rather than a single call after the loop.
                multiFile.addRun(Run(i, *lumis))
            self.multipleFileFileset.addFile(multiFile)

        self.singleFileFileset = Fileset(name="TestFileset2")
        loneFile = File("/some/file/name", size=1000, events=100)
        loneFile.setLocation('blenheim')
        loneLumis = list(range(50, 60)) + list(range(70, 80))
        loneFile.addRun(Run(13, *loneLumis))
        self.singleFileFileset.addFile(loneFile)

        # Settings common to both subscriptions.
        sharedArgs = {"workflow": Workflow(),
                      "split_algo": "FileBased",
                      "type": "Processing"}
        self.multipleFileSubscription = Subscription(
            fileset=self.multipleFileFileset, **sharedArgs)
        self.singleFileSubscription = Subscription(
            fileset=self.singleFileFileset, **sharedArgs)

        self.performanceParams = dict(timePerEvent=12,
                                      memoryRequirement=2300,
                                      sizePerEvent=400)
Example #21
0
    def setUp(self):
        """
        Initial Setup for Subscription Testcase

        Creates a dummy file, a set holding it, a fileset named
        'SubscriptionTestFileset' built from that set, a default-constructed
        Workflow and a Subscription joining the fileset and the workflow.
        Each object is stored on the instance.

        """
        self.dummyFile = File('/tmp/dummyfile', 9999, 0, 0, 0)
        self.dummySet = {self.dummyFile}
        self.dummyFileSet = Fileset(files=self.dummySet,
                                    name='SubscriptionTestFileset')
        self.dummyWorkFlow = Workflow()
        self.dummySubscription = Subscription(workflow=self.dummyWorkFlow,
                                              fileset=self.dummyFileSet)
Example #22
0
    def testDefinition(self):
        """
        Check that the spec, owner and name given to the Workflow constructor
        are exposed through the attributes of the same name.

        """
        expected = {"spec": "test", "owner": "mnorman", "name": "testName"}

        testWorkflow = Workflow(**expected)

        for attribute in ("spec", "owner", "name"):
            self.assertEqual(getattr(testWorkflow, attribute),
                             expected[attribute])
    def setUp(self):
        """
        _setUp_

        Store a default-constructed Workflow and the performance parameters
        on the instance, switch the root logger to DEBUG and, when PY3 is
        true, alias assertItemsEqual to assertCountEqual.
        """
        self.testWorkflow = Workflow()
        self.performanceParams = dict(timePerEvent=12,
                                      memoryRequirement=2300,
                                      sizePerEvent=400)

        logging.basicConfig()
        rootLogger = logging.getLogger()
        rootLogger.setLevel(logging.DEBUG)

        # assertItemsEqual is not available under that name when PY3 is set.
        if PY3:
            self.assertItemsEqual = self.assertCountEqual
Example #24
0
    def setUp(self):
        """
        _setUp_

        Build three filesets and one "EventBased" Processing subscription per
        fileset, all sharing a single Workflow:

        - multipleFileFileset ("TestFileset1"): ten UUID-named files with
          100 events each, setLocation('se01').
        - singleFileFileset ("TestFileset2"): "/some/file/name" with 100
          events, setLocation('se02').
        - emptyFileFileset ("TestFileset3"): "/some/file/name" with 0
          events, setLocation('se03').
        """
        self.multipleFileFileset = Fileset(name="TestFileset1")
        for i in range(10):
            newFile = File(makeUUID(), size=1000, events=100)
            newFile.setLocation('se01')
            self.multipleFileFileset.addFile(newFile)

        self.singleFileFileset = Fileset(name="TestFileset2")
        newFile = File("/some/file/name", size=1000, events=100)
        newFile.setLocation('se02')
        self.singleFileFileset.addFile(newFile)

        self.emptyFileFileset = Fileset(name="TestFileset3")
        newFile = File("/some/file/name", size=1000, events=0)
        # This used to call setdefault('se03'), which does not set a
        # location; the file is given one the same way as the files above.
        newFile.setLocation('se03')
        self.emptyFileFileset.addFile(newFile)

        testWorkflow = Workflow()
        self.multipleFileSubscription = Subscription(
            fileset=self.multipleFileFileset,
            workflow=testWorkflow,
            split_algo="EventBased",
            type="Processing")
        self.singleFileSubscription = Subscription(
            fileset=self.singleFileFileset,
            workflow=testWorkflow,
            split_algo="EventBased",
            type="Processing")
        self.emptyFileSubscription = Subscription(
            fileset=self.emptyFileFileset,
            workflow=testWorkflow,
            split_algo="EventBased",
            type="Processing")

        return
Example #25
0
    def testMetaData(self):
        """
        _testMetaData_

        Make sure that the workflow name, task, owner and white and black lists
        make it into each job object.
        """
        testWorkflow = Workflow(spec="spec.pkl",
                                owner="Steve",
                                name="TestWorkflow",
                                task="TestTask")

        testFileset = Fileset(name="TestFileset")
        testFileset.addFile(File(lfn="someLFN"))
        testFileset.commit()

        testSubscription = Subscription(fileset=testFileset,
                                        workflow=testWorkflow,
                                        split_algo="FileBased")

        jobGroups = JobFactory(subscription=testSubscription)(
            siteWhitelist=["site1"], siteBlacklist=["site2"])
        self.assertTrue(len(jobGroups) > 0)

        # (job key, expected value, failure message), checked in this order.
        expectedMetaData = (
            ("task", "TestTask", "Error: Task is wrong."),
            ("workflow", "TestWorkflow", "Error: Workflow is wrong."),
            ("owner", "Steve", "Error: Owner is wrong."),
            ("siteWhitelist", ["site1"], "Error: Site white list is wrong."),
            ("siteBlacklist", ["site2"], "Error: Site black list is wrong."),
        )

        for jobGroup in jobGroups:
            self.assertTrue(len(jobGroup.jobs) > 0)
            for job in jobGroup.jobs:
                for key, expectedValue, message in expectedMetaData:
                    self.assertEqual(job[key], expectedValue, message)
Example #26
0
    def oneHundredFiles(self,
                        splittingAlgo="EventBased",
                        jobType="Processing"):
        """
        _oneHundredFiles_

        Generate a WMBS data stack representing 100 files for job splitter
        testing.

        A fileset of 100 files is subscribed with ``splittingAlgo`` and
        ``jobType`` and split with events_per_job=100.  The "RandomSeeder"
        and "RunAndLumiSeeder" generators are then added to ``self.manager``
        and the splitter output is returned.

        """
        fileset1 = Fileset(name='EventBasedFiles1')
        for index in range(100):
            lfn = "/store/MultipleFileSplit%s.root" % index
            # Positional arguments after the lfn: size, events, run, lumi.
            splitFile = File(lfn, 1000, 100, 10 + index, 12312)
            splitFile['locations'].add("BULLSHIT")
            fileset1.addFile(splitFile)

        subscription1 = Subscription(fileset=fileset1,
                                     workflow=Workflow(),
                                     split_algo=splittingAlgo,
                                     type=jobType)
        jobfactory = SplitterFactory()(subscription1)
        jobs = jobfactory(events_per_job=100)

        self.manager.addGenerator("RandomSeeder", **self.seedlistForRandom)
        self.manager.addGenerator("RunAndLumiSeeder")

        return jobs
Example #27
0
    def testAddOutput(self):
        """
        _testAddOutput_

        Tests the addOutput functionality of the DataStructs Workflow.

        Two (output, merged output) fileset pairs are registered under the
        "out1" identifier and a single fileset, with no merged fileset
        argument, under "out2".  The test then checks the number and content
        of the entries stored in the workflow's outputMap.
        """
        filesetA = Fileset(name="filesetA")
        filesetB = Fileset(name="filesetB")
        filesetC = Fileset(name="filesetC")

        testWorkflow = Workflow(spec="test", owner="mnorman")
        testWorkflow.addOutput("out1", filesetA, filesetB)
        testWorkflow.addOutput("out1", filesetB, filesetA)
        testWorkflow.addOutput("out2", filesetC)

        self.assertEqual(len(testWorkflow.outputMap["out1"]), 2,
                         "Error: There should be two mappings for out1.")
        # Only one addOutput call was made for "out2", so exactly one mapping
        # is expected; the failure message now says so.
        self.assertEqual(len(testWorkflow.outputMap["out2"]), 1,
                         "Error: There should be one mapping for out2.")

        self.assertIn({"output_fileset": filesetA,
                       "merged_output_fileset": filesetB},
                      testWorkflow.outputMap["out1"],
                      "Error: Fileset A should be in the output map.")
        self.assertIn({"output_fileset": filesetB,
                       "merged_output_fileset": filesetA},
                      testWorkflow.outputMap["out1"],
                      "Error: Fileset B should be in the output map.")

        out2Mapping = testWorkflow.outputMap["out2"][0]
        self.assertEqual(filesetC, out2Mapping["output_fileset"],
                         "Error: Fileset C should be in the output map.")
        self.assertIsNone(out2Mapping["merged_output_fileset"],
                          "Error: The merged output should be None.")
        return
Example #28
0
    def processDataset(self):
        """
        _processDataset_

        Import the Dataset contents and create a set of jobs from it

        Each non-empty block of the input dataset is loaded into a Fileset,
        split with the 'MergeBySize' algorithm, and every resulting job is
        converted into a JobDefinition.

        :return: list of JobDefinition objects, one per job.
        :raises SyntaxError: when the splitter returns no job groups for a
            block. The exception type is kept for backward compatibility;
            it now carries a message naming the block.
        """
        # Lazy logging arguments: no formatting cost when DEBUG is off, and
        # tuple-valued settings no longer break the "%" operator.
        logging.debug("MergeSize = %s", self.mergeSize)
        logging.debug("AllowedSites = %s", self.allowedSites)
        logging.debug("Connection to DBS at: %s", self.dbsUrl)

        reader = DBSReader(self.dbsUrl)
        blockList = reader.dbs.listBlocks(dataset=self.inputDataset())
        jobDefs = []

        for block in blockList:
            blockName = block['Name']
            logging.debug("Getting files for block %s", blockName)
            locations = reader.listFileBlockLocation(blockName)
            fileList = reader.dbs.listFiles(blockName=blockName)
            if not fileList:  # Skip empty blocks
                continue

            thefiles = Fileset(name='FilesToSplit')
            for f in fileList:
                f['Block']['StorageElementList'].extend(locations)
                wmbsFile = File(f['LogicalFileName'])
                for location in locations:
                    wmbsFile['locations'].add(location)
                wmbsFile['block'] = blockName
                wmbsFile['size'] = f['FileSize']
                thefiles.addFile(wmbsFile)

            work = Workflow()
            subs = Subscription(fileset=thefiles,
                                workflow=work,
                                split_algo='MergeBySize',
                                type="Merge")
            logging.debug("Info for Subscription %s", subs)
            splitter = SplitterFactory()
            jobfactory = splitter(subs)

            jobGroups = jobfactory(
                merge_size=self.mergeSize,  # min in Bytes
                all_files=True  # merge all files
            )
            if not jobGroups:
                raise SyntaxError(
                    "Job splitting produced no job groups for block %s" %
                    blockName)
            for jobGroup in jobGroups:
                for job in jobGroup.getJobs():
                    jobDef = JobDefinition()
                    jobDef['LFNS'].extend(job.getFiles(type='lfn'))
                    jobDef['SkipEvents'] = 0
                    jobDef['MaxEvents'] = -1
                    for jobFile in job.getFiles():
                        jobDef['SENames'].extend(list(jobFile['locations']))
                    jobDefs.append(jobDef)

        return jobDefs
Example #29
0
    def execute(self, *args, **kwargs):
        """
        Split the input of a task into jobs.

        ``args[0]`` is used as the fileset of the subscription and
        ``kwargs['task']`` is the task dictionary (``tm_*`` keys). The
        split arguments in ``task['tm_split_args']`` are updated in place
        before being passed to the job factory.

        :return: Result carrying the task and the tuple (factory, args[0]).
        :raises TaskWorkerException: when splitting raises RuntimeError,
            yields no job, yields more jobs than ``maxJobsPerTask``, or when
            an 'Automatic' task requests less than the minimum runtime.
        """
        task = kwargs['task']
        wmwork = Workflow(name=task['tm_taskname'])

        maxJobs = getattr(self.config.TaskWorker, 'maxJobsPerTask', 10000)

        data = args[0]
        splitparam = task['tm_split_args']
        algo = task['tm_split_algo']
        splitparam['algorithm'] = algo
        jobType = task['tm_job_type']

        if jobType == 'Analysis':
            totalUnits = task['tm_totalunits']

            def resolveUnits(countAll):
                # A request below 1.0 is a fraction of the total; the total
                # is computed only in that case.
                if totalUnits < 1.0:
                    return int(totalUnits * countAll() + 0.5)
                return totalUnits

            if algo == 'FileBased':
                splitparam['total_files'] = resolveUnits(
                    lambda: len(data.getFiles()))
            elif algo == 'LumiBased':
                splitparam['total_lumis'] = resolveUnits(
                    lambda: sum(len(run.lumis)
                                for f in data.getFiles()
                                for run in f['runs']))
            elif algo == 'EventAwareLumiBased':
                splitparam['total_events'] = resolveUnits(
                    lambda: sum(f['events'] for f in data.getFiles()))
            elif algo == 'Automatic':
                # REST backwards compatibility fix
                if 'seconds_per_job' in task['tm_split_args']:
                    task['tm_split_args']['minutes_per_job'] = \
                        task['tm_split_args'].pop('seconds_per_job')
                # Probe jobs are produced with a plain file-based split.
                splitparam['algorithm'] = 'FileBased'
                splitparam['total_files'] = len(data.getFiles())
                numProbes = getattr(self.config.TaskWorker,
                                    'numAutomaticProbes', 5)
                # Ceiling division of the file count by the probe count.
                splitparam['files_per_job'] = \
                    (len(data.getFiles()) + numProbes - 1) // numProbes
        elif jobType == 'PrivateMC':
            if 'tm_events_per_lumi' in task and task['tm_events_per_lumi']:
                splitparam['events_per_lumi'] = task['tm_events_per_lumi']
            if 'tm_generator' in task and task['tm_generator'] == 'lhe':
                splitparam['lheInputFiles'] = True
        splitparam['applyLumiCorrection'] = True

        wmsubs = Subscription(fileset=data,
                              workflow=wmwork,
                              split_algo=splitparam['algorithm'],
                              type=self.jobtypeMapper[task['tm_job_type']])
        try:
            jobfactory = SplitterFactory()(subscription=wmsubs)
            factory = jobfactory(**splitparam)
            numJobs = sum(len(jobgroup.getJobs()) for jobgroup in factory)
        except RuntimeError:
            raise TaskWorkerException(
                "The splitting on your task generated more than {0} jobs (the maximum).".format(maxJobs))

        if numJobs == 0:
            pieces = [
                "The CRAB3 server backend could not submit any job to the Grid scheduler:",
                " splitting task %s" % (task['tm_taskname']),
            ]
            if task['tm_input_dataset']:
                pieces.append(" on dataset %s" % (task['tm_input_dataset']))
            pieces.append(" with %s method does not generate any job. See\n" % (task['tm_split_algo']))
            pieces.append("https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3FAQ#crab_submit_fails_with_Splitting")
            raise TaskWorkerException("".join(pieces))
        if numJobs > maxJobs:
            raise TaskWorkerException("The splitting on your task generated %s jobs. The maximum number of jobs in each task is %s" %
                                      (numJobs, maxJobs))

        minRuntime = getattr(self.config.TaskWorker, 'minAutomaticRuntimeMins', 180)
        tooShort = (task['tm_split_algo'] == 'Automatic'
                    and task['tm_split_args']['minutes_per_job'] < minRuntime)
        if tooShort:
            raise TaskWorkerException(
                "Minimum runtime requirement for automatic splitting is {0} minutes.".format(minRuntime))

        # Report duplicated lumis, if the job factory recorded any.
        lumiChecker = getattr(jobfactory, 'lumiChecker', None)
        if lumiChecker and lumiChecker.splitLumiFiles:
            self.logger.warning("The input dataset contains the following duplicated lumis %s", lumiChecker.splitLumiFiles.keys())
            warning = ("The CRAB3 server backend detected lumis split across files in the input dataset."
                       " Will apply the necessary corrections in the splitting algorithm. You can ignore this message.")
            self.uploadWarning(warning, task['user_proxy'], task['tm_taskname'])

        return Result(task=task, result=(factory, data))
Example #30
0
    def jobSplittingByRun(self):
        """
        Split the task into jobs with the WMBS 'RunBased' splitting algorithm.

        Every file from ``self.args['pubdata']`` whose block can be resolved
        in ``self.args['blockSites']`` is added to a Fileset with its sites,
        block name and first run number. One argument list is built per job
        group, up to ``self.theNumberOfJobs`` job groups.

        :return: dict with the keys 'params', 'args', 'jobDestination' and
            'njobs'.
        """
        self.checkUserSettings()
        blockSites = self.args['blockSites']
        pubdata = self.args['pubdata']

        if self.selectNumberOfJobs == 0:
            # No explicit job count was selected: use a very large cap.
            self.theNumberOfJobs = 9999999

        thefiles = Fileset(name='FilesToSplit')
        for f in pubdata.getListFiles():
            block = f['Block']['Name']
            try:
                f['Block']['StorageElementList'].extend(blockSites[block])
            except Exception:
                # Best effort: files whose block cannot be resolved are
                # skipped. "Exception" rather than a bare except so that
                # KeyboardInterrupt / SystemExit still propagate.
                continue
            wmbsFile = File(f['LogicalFileName'])
            sites = blockSites[block]
            if not sites:
                msg = 'WARNING: No sites are hosting any part of data for block: %s\n' % block
                msg += 'Related jobs will not be submitted and this block of data can not be analyzed'
                common.logger.debug(msg)
            for site in sites:
                wmbsFile['locations'].add(site)
            wmbsFile['block'] = block
            # Only the first run listed for the file is attached.
            runNum = f['RunsList'][0]['RunNumber']
            wmbsFile.addRun(Run(runNumber=runNum))
            thefiles.addFile(wmbsFile)

        work = Workflow()
        subs = Subscription(fileset=thefiles,
                            workflow=work,
                            split_algo='RunBased',
                            type="Processing")
        splitter = SplitterFactory()
        jobfactory = splitter(subs)

        # Loop over all runs: one job group produces one set of arguments.
        list_of_lists = []
        jobDestination = []
        list_of_blocks = []
        count = 0
        for jobGroup in jobfactory():
            if count >= self.theNumberOfJobs:
                # Cap reached; later groups would be ignored anyway.
                break
            res = self.getJobInfo(jobGroup)
            fullString = ','.join(res['lfns'])
            list_of_blocks.append(res['block'])
            # NOTE(review): this joins the blocks of ALL jobs created so
            # far, not only this job's block - behaviour kept, please
            # confirm it is intended.
            blockString = ','.join(list_of_blocks)
            list_of_lists.append(
                [fullString, str(-1),
                 str(0), blockString])
            # need to check single file location
            jobDestination.append(res['locations'])
            count += 1

        # prepare dict output
        dictOut = {}
        dictOut['params'] = [
            'InputFiles', 'MaxEvents', 'SkipEvents', 'InputBlocks'
        ]
        dictOut['args'] = list_of_lists
        dictOut['jobDestination'] = jobDestination
        dictOut['njobs'] = count
        self.cacheBlocks(list_of_blocks, jobDestination)

        return dictOut
Example #31
0
    def jobSplittingByLumi(self):
        """
        Split task into jobs by Lumi section paying attention to which
        lumis should be run (according to the analysis dataset).
        This uses WMBS job splitting which does not split files over jobs
        so the job will have AT LEAST as many lumis as requested, perhaps
        more

        :return: dict with the keys 'params', 'args', 'jobDestination' and
            'njobs'; 'params' gains a 'ParentFiles' entry when
            ``CMSSW.use_parent`` is 1.
        """
        self.useParent = int(self.cfg_params.get('CMSSW.use_parent', 0))
        common.logger.debug('Splitting by Lumi')
        self.checkLumiSettings()

        blockSites = self.args['blockSites']
        pubdata = self.args['pubdata']

        lumisPerFile = pubdata.getLumis()
        self.parentFiles = pubdata.getParent()
        # Make the list of WMBS files for job splitter
        fileList = pubdata.getListFiles()
        wmFileList = []
        for jobFile in fileList:
            block = jobFile['Block']['Name']
            try:
                jobFile['Block']['StorageElementList'].extend(
                    blockSites[block])
            except Exception:
                # Best effort: files whose block cannot be resolved are
                # skipped. "Exception" rather than a bare except so that
                # KeyboardInterrupt / SystemExit still propagate.
                continue
            wmbsFile = File(jobFile['LogicalFileName'])
            sites = blockSites[block]
            if not sites:
                msg = 'WARNING: No sites are hosting any part of data for block: %s\n' % block
                msg += 'Related jobs will not be submitted and this block of data can not be analyzed'
                common.logger.debug(msg)
            for site in sites:
                wmbsFile['locations'].add(site)
            wmbsFile['block'] = block
            for lumi in lumisPerFile[jobFile['LogicalFileName']]:
                wmbsFile.addRun(Run(lumi[0], lumi[1]))
            wmFileList.append(wmbsFile)

        fileSet = set(wmFileList)
        thefiles = Fileset(name='FilesToSplit', files=fileSet)

        # Create the factory and workflow
        work = Workflow()
        subs = Subscription(fileset=thefiles,
                            workflow=work,
                            split_algo='LumiBased',
                            type="Processing")
        splitter = SplitterFactory()
        jobFactory = splitter(subs)

        list_of_lists = []
        jobDestination = []
        jobCount = 0
        lumisCreated = 0
        list_of_blocks = []
        if not self.limitJobLumis:
            # No lumis-per-job limit was set: derive it from the job count.
            if self.totalNLumis > 0:
                self.lumisPerJob = max(
                    self.totalNLumis // self.theNumberOfJobs, 1)
            else:
                self.lumisPerJob = pubdata.getMaxLumis(
                ) // self.theNumberOfJobs + 1
            common.logger.info('Each job will process about %s lumis.' %
                               self.lumisPerJob)

        for jobGroup in jobFactory(lumis_per_job=self.lumisPerJob):
            for job in jobGroup.jobs:
                # These breaks leave only the inner loop; the same test is
                # evaluated again for the first job of the next job group.
                if (self.limitNJobs and jobCount >= self.theNumberOfJobs):
                    common.logger.info('Requested number of jobs reached.')
                    break
                if (self.limitTotalLumis and lumisCreated >= self.totalNLumis):
                    common.logger.info('Requested number of lumis reached.')
                    break
                lumis = []
                lfns = []
                parentlfns = []  # filled only when self.useParent == 1

                locations = []
                blocks = []
                firstFile = True
                # Collect information from all the files
                for jobFile in job.getFiles():
                    doFile = False
                    if firstFile:  # Get locations from first file in the job
                        locations.extend(jobFile['locations'])
                        blocks.append(jobFile['block'])
                        firstFile = False
                    # Accumulate Lumis from all files
                    for lumiList in jobFile['runs']:
                        theRun = lumiList.run
                        for theLumi in list(lumiList):
                            if (not self.limitTotalLumis) or \
                               (lumisCreated < self.totalNLumis):
                                doFile = True
                                lumisCreated += 1
                                lumis.append((theRun, theLumi))
                    # A file is listed only if it contributed a lumi.
                    if doFile:
                        lfns.append(jobFile['lfn'])
                        if self.useParent == 1:
                            parentlfns.extend(self.parentFiles[jobFile['lfn']])
                fileString = ','.join(lfns)
                lumiLister = LumiList(lumis=lumis)
                lumiString = lumiLister.getCMSSWString()
                blockString = ','.join(blocks)
                if self.useParent == 1:
                    pfileString = ','.join(parentlfns)
                    common.logger.debug("Files: " + fileString +
                                        " with the following parents: " +
                                        pfileString)
                    list_of_lists.append([
                        fileString, pfileString,
                        str(-1),
                        str(0), lumiString, blockString
                    ])
                else:
                    list_of_lists.append(
                        [fileString,
                         str(-1),
                         str(0), lumiString, blockString])
                list_of_blocks.append(blocks)
                jobDestination.append(locations)
                jobCount += 1
                common.logger.debug(
                    'Job %s will run on %s files and %s lumis ' %
                    (jobCount, len(lfns), len(lumis)))

        common.logger.info('%s jobs created to run on %s lumis' %
                           (jobCount, lumisCreated))

        # Prepare dict output matching back to non-WMBS job creation
        dictOut = {}
        dictOut['params'] = [
            'InputFiles', 'MaxEvents', 'SkipEvents', 'Lumis', 'InputBlocks'
        ]
        if self.useParent == 1:
            dictOut['params'] = [
                'InputFiles', 'ParentFiles', 'MaxEvents', 'SkipEvents',
                'Lumis', 'InputBlocks'
            ]
        dictOut['args'] = list_of_lists
        dictOut['jobDestination'] = jobDestination
        dictOut['njobs'] = jobCount
        self.cacheBlocks(list_of_blocks, jobDestination)

        return dictOut
Example #32
0
    def execute(self, *args, **kwargs):
        """
        Split the input of a task into jobs.

        ``args[0]`` is used as the fileset of the subscription and
        ``kwargs['task']`` is the task dictionary (``tm_*`` keys). The
        split arguments in ``task['tm_split_args']`` are updated in place
        before being passed to the job factory.

        :return: Result carrying the task and the job factory output.
        :raises TaskWorkerException: when splitting yields no job or more
            jobs than ``maxJobsPerTask``.
        """
        task = kwargs['task']
        wmwork = Workflow(name=task['tm_taskname'])

        wmsubs = Subscription(fileset=args[0],
                              workflow=wmwork,
                              split_algo=task['tm_split_algo'],
                              type=self.jobtypeMapper[task['tm_job_type']])
        jobfactory = SplitterFactory()(subscription=wmsubs)

        splitparam = task['tm_split_args']
        algo = task['tm_split_algo']
        splitparam['algorithm'] = algo
        jobType = task['tm_job_type']
        if jobType == 'Analysis':
            # Name of the "total" split argument for each algorithm.
            totalKey = {
                'FileBased': 'total_files',
                'LumiBased': 'total_lumis',
                'EventAwareLumiBased': 'total_events',
            }.get(algo)
            if totalKey is not None:
                splitparam[totalKey] = task['tm_totalunits']
        elif jobType == 'PrivateMC':
            if 'tm_events_per_lumi' in task and task['tm_events_per_lumi']:
                splitparam['events_per_lumi'] = task['tm_events_per_lumi']
            if 'tm_generator' in task and task['tm_generator'] == 'lhe':
                splitparam['lheInputFiles'] = True
        splitparam['applyLumiCorrection'] = True

        factory = jobfactory(**splitparam)
        numJobs = sum(len(jobgroup.getJobs()) for jobgroup in factory)
        maxJobs = getattr(self.config.TaskWorker, 'maxJobsPerTask', 10000)

        if numJobs == 0:
            pieces = [
                "The CRAB3 server backend could not submit any job to the Grid scheduler:",
                " Splitting task %s" % (task['tm_taskname']),
            ]
            if task['tm_input_dataset']:
                pieces.append(" on dataset %s" % (task['tm_input_dataset']))
            pieces.append(" with %s method does not generate any job" % (
                task['tm_split_algo']))
            raise TaskWorkerException("".join(pieces))
        if numJobs > maxJobs:
            raise TaskWorkerException(
                "The splitting on your task generated %s jobs. The maximum number of jobs in each task is %s"
                % (numJobs, maxJobs))

        # Report duplicated lumis, if the job factory recorded any.
        lumiChecker = getattr(jobfactory, 'lumiChecker', None)
        if lumiChecker and lumiChecker.splitLumiFiles:
            self.logger.warning(
                "The input dataset contains the following duplicated lumis %s"
                % lumiChecker.splitLumiFiles.keys())
            #TODO use self.uploadWarning
            try:
                userServer = HTTPRequests(self.server['host'],
                                          task['user_proxy'],
                                          task['user_proxy'])
                encodedWarning = b64encode(
                    'The CRAB3 server backend detected lumis split across files in the input dataset.'
                    ' Will apply the necessary corrections in the splitting algorithms. You can ignore this message.'
                )
                configreq = {
                    'subresource': 'addwarning',
                    'workflow': task['tm_taskname'],
                    'warning': encodedWarning,
                }
                userServer.post(self.restURInoAPI + '/task',
                                data=urllib.urlencode(configreq))
            except HTTPException as hte:
                # Posting the warning is best effort; only log the failure.
                self.logger.error(hte.headers)
                self.logger.warning(
                    "Cannot add warning to REST after finding duplicates")

        return Result(task=task, result=factory)