Example #1
    def __init__(self,
                 fileset=None,
                 workflow=None,
                 whitelist=None,
                 blacklist=None,
                 split_algo="FileBased",
                 type="Processing"):
        if fileset is None:
            fileset = Fileset()
        if whitelist is None:
            whitelist = set()
        if blacklist is None:
            blacklist = set()

        self.setdefault('fileset', fileset)
        self.setdefault('workflow', workflow)
        self.setdefault('type', type)

        self.setdefault('split_algo', split_algo)
        self.setdefault('whitelist', whitelist)
        self.setdefault('blacklist', blacklist)

        self.available = Fileset(name=fileset.name, files=fileset.getFiles())

        self.acquired = Fileset(name='acquired')
        self.completed = Fileset(name='completed')
        self.failed = Fileset(name='failed')
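
A note on this constructor: Subscription is evidently dict-based (hence the setdefault calls), and it snapshots the input fileset into self.available while acquired, completed and failed start empty. A minimal usage sketch, assuming the standard WMCore.DataStructs import paths:

    from WMCore.DataStructs.File import File
    from WMCore.DataStructs.Fileset import Fileset
    from WMCore.DataStructs.Subscription import Subscription
    from WMCore.DataStructs.Workflow import Workflow

    fs = Fileset(name="ExampleFileset")
    fs.addFile(File("/store/example.root", size=1000, events=100))
    sub = Subscription(fileset=fs, workflow=Workflow(), split_algo="FileBased")

    # Settings are plain dict keys thanks to setdefault
    print(sub['split_algo'], sub['type'])   # FileBased Processing
    # The constructor copies the input files into the 'available' fileset
    print(len(sub.available.getFiles()))    # expected: 1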
Example #2
    def setUp(self):
        """
        _setUp_

        Create two subscriptions: One that contains a single file and one that
        contains multiple files.
        """
        self.multipleFileFileset = Fileset(name="TestFileset1")
        for i in range(10):
            newFile = File(makeUUID(), size=1000, events=100)
            newFile.setLocation('blenheim')
            newFile.setLocation('malpaquet')
            self.multipleFileFileset.addFile(newFile)

        self.singleFileFileset = Fileset(name="TestFileset2")
        newFile = File("/some/file/name", size=1000, events=100)
        newFile.setLocation('blenheim')
        self.singleFileFileset.addFile(newFile)

        testWorkflow = Workflow()
        self.multipleFileSubscription = Subscription(
            fileset=self.multipleFileFileset,
            workflow=testWorkflow,
            split_algo="FileBased",
            type="Processing")
        self.singleFileSubscription = Subscription(
            fileset=self.singleFileFileset,
            workflow=testWorkflow,
            split_algo="FileBased",
            type="Processing")

        #self.multipleFileSubscription.create()
        #self.singleFileSubscription.create()

        return
Example #3
    def setUp(self):
        """
        _setUp_

        Create four subscriptions: single-file, multiple-file, single-lumi and
        multiple-lumi filesets, all using the FixedDelay splitting algorithm.
        """
        self.multipleFileFileset = Fileset(name="TestFileset1")
        for i in range(10):
            newFile = File(makeUUID(), size=1000, events=100)
            newFile.addRun(Run(i, *[45 + i]))
            self.multipleFileFileset.addFile(newFile)

        self.singleFileFileset = Fileset(name="TestFileset2")
        newFile = File("/some/file/name", size=1000, events=100)
        newFile.addRun(Run(1, *[45]))
        self.singleFileFileset.addFile(newFile)

        self.multipleFileLumiset = Fileset(name="TestFileset3")
        for i in range(10):
            newFile = File(makeUUID(), size=1000, events=100)
            newFile.addRun(Run(1, *[45 + i // 3]))
            self.multipleFileLumiset.addFile(newFile)

        self.singleLumiFileset = Fileset(name="TestFileset4")
        for i in range(10):
            newFile = File(makeUUID(), size=1000, events=100)
            newFile.addRun(Run(1, *[45]))
            self.singleLumiFileset.addFile(newFile)

        testWorkflow = Workflow()
        self.multipleFileSubscription = Subscription(
            fileset=self.multipleFileFileset,
            workflow=testWorkflow,
            split_algo="FixedDelay",
            type="Processing")
        self.singleFileSubscription = Subscription(
            fileset=self.singleFileFileset,
            workflow=testWorkflow,
            split_algo="FixedDelay",
            type="Processing")
        self.multipleLumiSubscription = Subscription(
            fileset=self.multipleFileLumiset,
            workflow=testWorkflow,
            split_algo="FixedDelay",
            type="Processing")
        self.singleLumiSubscription = Subscription(
            fileset=self.singleLumiFileset,
            workflow=testWorkflow,
            split_algo="FixedDelay",
            type="Processing")

        return
Example #4
    def setUp(self):
        """
        _setUp_

        Create three subscriptions: one single-file fileset, one multiple-file
        fileset and one fileset whose files are spread over multiple sites.
        """
        self.multipleFileFileset = Fileset(name="TestFileset1")
        for i in range(10):
            newFile = File(makeUUID(),
                           size=1000,
                           events=100,
                           locations=set(["somese.cern.ch"]))
            self.multipleFileFileset.addFile(newFile)

        self.singleFileFileset = Fileset(name="TestFileset2")
        newFile = File("/some/file/name",
                       size=1000,
                       events=100,
                       locations=set(["somese.cern.ch"]))
        self.singleFileFileset.addFile(newFile)

        self.multipleSiteFileset = Fileset(name="TestFileset3")
        for i in range(5):
            newFile = File(makeUUID(),
                           size=1000,
                           events=100,
                           locations=set(["somese.cern.ch"]))
            newFile.setLocation("somese.cern.ch")
            self.multipleSiteFileset.addFile(newFile)
        for i in range(5):
            newFile = File(makeUUID(), size=1000, events=100)
            newFile.setLocation(["somese.cern.ch", "otherse.cern.ch"])
            self.multipleSiteFileset.addFile(newFile)

        testWorkflow = Workflow()
        self.multipleFileSubscription = Subscription(
            fileset=self.multipleFileFileset,
            workflow=testWorkflow,
            split_algo="SizeBased",
            type="Processing")
        self.singleFileSubscription = Subscription(
            fileset=self.singleFileFileset,
            workflow=testWorkflow,
            split_algo="SizeBased",
            type="Processing")
        self.multipleSiteSubscription = Subscription(
            fileset=self.multipleSiteFileset,
            workflow=testWorkflow,
            split_algo="EventBased",
            type="Processing")
        return
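
Worth noting in this fixture: locations can be attached either at construction time via the locations= argument or afterwards with setLocation, which accepts a single site name or a list. A small sketch of the two forms, using only calls that appear above:

    # at construction time
    f1 = File(makeUUID(), size=1000, events=100,
              locations=set(["somese.cern.ch"]))
    # after construction; setLocation takes a single site or a list of sites
    f2 = File(makeUUID(), size=1000, events=100)
    f2.setLocation(["somese.cern.ch", "otherse.cern.ch"])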
Example #5
    def setUp(self):
        """
        _setUp_

        Create three subscriptions: single-file, multiple-file, and one whose
        only file contains zero events.
        """
        self.multipleFileFileset = Fileset(name="TestFileset1")
        for i in range(10):
            newFile = File(makeUUID(), size=1000, events=100)
            newFile.setLocation('se01')
            self.multipleFileFileset.addFile(newFile)

        self.singleFileFileset = Fileset(name="TestFileset2")
        newFile = File("/some/file/name", size=1000, events=100)
        newFile.setLocation('se02')
        self.singleFileFileset.addFile(newFile)

        self.emptyFileFileset = Fileset(name="TestFileset3")
        newFile = File("/some/file/name", size=1000, events=0)
        newFile.setLocation('se03')
        self.emptyFileFileset.addFile(newFile)

        testWorkflow = Workflow()
        self.multipleFileSubscription = Subscription(
            fileset=self.multipleFileFileset,
            workflow=testWorkflow,
            split_algo="EventBased",
            type="Processing")
        self.singleFileSubscription = Subscription(
            fileset=self.singleFileFileset,
            workflow=testWorkflow,
            split_algo="EventBased",
            type="Processing")
        self.emptyFileSubscription = Subscription(
            fileset=self.emptyFileFileset,
            workflow=testWorkflow,
            split_algo="EventBased",
            type="Processing")

        self.eventsPerJob = 100
        self.performanceParams = {
            'timePerEvent': None,
            'memoryRequirement': 2300,
            'sizePerEvent': 400
        }

        return
Example #6
    def testCommit(self):
        """
            Testcase for the commit method of the Fileset class

        """
        localTestFileSet = Fileset('LocalTestFileset', self.initialSet)
        fsSize = len(localTestFileSet.getFiles(type="lfn"))
        # Dummy file to test
        fileTestCommit = File('/tmp/filetestcommit', 0, 1, 1)
        # The file is added to the newfiles attribute of localTestFileSet
        localTestFileSet.addFile(fileTestCommit)
        assert fsSize == len(localTestFileSet.getFiles(type="lfn")) - 1, \
                'file not added correctly to test fileset'
        newfilestemp = localTestFileSet.newfiles
        assert fileTestCommit in newfilestemp, \
                'test file not in the new files list'
        # After commit, the dummy file is supposed to move from newfiles to files
        localTestFileSet.commit()
        # First, test that the new file is present in the files attribute of the Fileset object
        assert newfilestemp.issubset(localTestFileSet.files), \
                'Test file not present at fileset.files - ' \
                'fileset.commit not working properly'
        # Second, test that the newfiles attribute is empty
        assert localTestFileSet.newfiles == set(), \
                'Test file still present at fileset.newfiles ' \
                '- fileset.commit not working properly'
Example #7
    def testF_HardLimitSplittingOnly(self):
        """
        _testF_HardLimitSplittingOnly_

        Checks that we can split a set of files where every file has a single
        lumi too big to fit in a runnable job
        """
        splitter = SplitterFactory()

        # Create 3 single-big-lumi files
        testFileset = Fileset(name="FilesetA")
        testFileA = self.createFile("/this/is/file1", 1000, 0, 1,
                                    "somese.cern.ch")
        testFileB = self.createFile("/this/is/file2", 1000, 1, 1,
                                    "somese.cern.ch")
        testFileC = self.createFile("/this/is/file3", 1000, 2, 1,
                                    "somese.cern.ch")
        testFileset.addFile(testFileA)
        testFileset.addFile(testFileB)
        testFileset.addFile(testFileC)

        testSubscription = Subscription(fileset=testFileset,
                                        workflow=self.testWorkflow,
                                        split_algo="EventAwareLumiBased",
                                        type="Processing")
        jobFactory = splitter(package="WMCore.DataStructs",
                              subscription=testSubscription)
        # Settings are to split on job boundaries, to fail single lumis that
        # would exceed the 9600 s job time limit and to put 550 events per job
        jobGroups = jobFactory(halt_job_on_file_boundaries=True,
                               splitOnRun=True,
                               events_per_job=550,
                               job_time_limit=9600,
                               performance=self.performanceParams)

        self.assertEqual(len(jobGroups), 1,
                         "There should be only one job group")
        jobs = jobGroups[0].jobs
        self.assertEqual(len(jobs), 3, "Three jobs must be in the jobgroup")
        for i in range(3):
            self.assertTrue(jobs[i]['failedOnCreation'],
                            "It should have been marked as failed")

            runNums = list(jobs[i]['mask']['runAndLumis'].keys())
            self.assertEqual(len(runNums), 1)

            lumiNums = list(jobs[i]['mask']['runAndLumis'].values())[0]
            self.assertEqual(len(lumiNums), 1)

            finalLumi = []
            for pair in lumiNums:
                finalLumi.extend(range(pair[0], pair[1] + 1))
            self.assertEqual(len(finalLumi), 1)

            self.assertEqual(
                jobs[i]['failedReason'],
                # note: 'woud' matches the misspelling in the production error message
                "File /this/is/file%d has a single lumi %s, in run %s with too many events 1000 and it woud take 12000 sec to run"
                % (i + 1, finalLumi[0], runNums[0]))

        return
Example #8
    def testHardLimitSplittingOnly(self):
        """
        _testHardLimitSplittingOnly_

        Checks that we can split a set of files where every file has a single
        lumi too big to fit in a runnable job
        """
        splitter = SplitterFactory()

        # Create 3 single-big-lumi files
        testFileset = Fileset(name="FilesetA")
        testFileA = self.createFile("/this/is/file1", 1000, 0, 1, "somese.cern.ch")
        testFileB = self.createFile("/this/is/file2", 1000, 1, 1, "somese.cern.ch")
        testFileC = self.createFile("/this/is/file3", 1000, 2, 1, "somese.cern.ch")
        testFileset.addFile(testFileA)
        testFileset.addFile(testFileB)
        testFileset.addFile(testFileC)

        testSubscription = Subscription(fileset=testFileset, workflow=self.testWorkflow,
                                        split_algo="EventAwareLumiByWork", type="Processing")
        jobFactory = splitter(package="WMCore.DataStructs", subscription=testSubscription)

        # Fail single lumis that would exceed the 9600 s job time limit
        # (800 events at 12 s/event) and put 550 events per job
        jobGroups = jobFactory(halt_job_on_file_boundaries=True, splitOnRun=True, events_per_job=550,
                               job_time_limit=9600, performance=self.performanceParams)

        self.assertEqual(len(jobGroups), 1)
        jobs = jobGroups[0].jobs
        self.assertEqual(len(jobs), 3)
        for job in jobs:
            self.assertTrue(job['failedOnCreation'])
            self.assertIn(' with too many events 1000 and it woud take 12000 sec to run', job['failedReason'])

        return
Example #9
    def createSubscription(self,
                           nFiles,
                           lumisPerFile,
                           twoSites=False,
                           nEventsPerFile=100):
        """
        _createSubscription_

        Create a subscription for testing
        """

        baseName = makeUUID()

        testFileset = Fileset(name=baseName)
        for i in range(nFiles):
            newFile = self.createFile('%s_%i' % (baseName, i), nEventsPerFile,
                                      i, lumisPerFile, 'blenheim')
            testFileset.addFile(newFile)
        if twoSites:
            for i in range(nFiles):
                newFile = self.createFile('%s_%i_2' % (baseName, i),
                                          nEventsPerFile, i, lumisPerFile,
                                          'malpaquet')
                testFileset.addFile(newFile)

        testSubscription = Subscription(fileset=testFileset,
                                        workflow=self.testWorkflow,
                                        split_algo="EventAwareLumiBased",
                                        type="Processing")

        return testSubscription
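
Several of these tests call a createFile helper that is not shown in the snippets. Judging from the call sites (lfn, events, run, lumis-per-file, location), a plausible reconstruction looks like the sketch below; the real helper in the test base class may differ in detail:

    def createFile(self, lfn, events, run, lumis, location):
        """Build a test File with `lumis` consecutive lumi sections in `run`."""
        newFile = File(lfn=lfn, size=1000, events=events)
        lumiList = [(run * lumis) + lumi for lumi in range(lumis)]
        newFile.addRun(Run(run, *lumiList))
        newFile.setLocation(location)
        return newFile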
Example #10
    def execute(self, *args, **kwargs):  #pylint: disable=unused-argument

        # since https://github.com/dmwm/CRABServer/issues/5633 totalunits can be a float
        # but that would confuse WMCore, therefore cast to int
        totalevents = int(kwargs['task']['tm_totalunits'])
        firstEvent = 1
        lastEvent = totalevents
        firstLumi = 1
        lastLumi = 10

        # Set a default of 100 events per lumi.  This is set as a task
        # property, as the splitting considers it independently of the file
        # information provided by the fake dataset.
        if not kwargs['task']['tm_events_per_lumi']:
            kwargs['task']['tm_events_per_lumi'] = 100

        #MC comes with only one MCFakeFile
        singleMCFileset = Fileset(name="MCFakeFileSet")
        newFile = File("MCFakeFile", size=1000, events=totalevents)
        newFile.setLocation(self.getListOfSites())
        newFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
        newFile["block"] = 'MCFakeBlock'
        newFile["first_event"] = firstEvent
        newFile["last_event"] = lastEvent
        singleMCFileset.addFile(newFile)

        return Result(task=kwargs['task'], result=singleMCFileset)
Example #11
    def setUp(self):
        """
        Create a dummy fileset and populate it with random files,
        in order to use it for the testcase methods

        """
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                            datefmt='%m-%d %H:%M',
                            filename=__file__.replace('.py', '.log'),
                            filemode='w')
        self.logger = logging.getLogger('FilesetClassTest')

        # Set up the initial testcase environment:
        initialfile = File('/tmp/lfn1', 1000, 1, 1, 1)
        self.initialSet = set()
        self.initialSet.add(initialfile)

        # Create a Fileset containing an initial file.
        self.fileset = Fileset(name='self.fileset', files=self.initialSet)
        # Populate the fileset with random files
        for i in range(1, 1000):
            lfn = '/store/data/%s/%s/file.root' % (random.randint(1000, 9999),
                                                   random.randint(1000, 9999))
            size = random.randint(1000, 2000)
            events = 1000
            run = random.randint(0, 2000)
            lumi = random.randint(0, 8)

            newFile = File(lfn=lfn, size=size, events=events, checksums={"cksum": "1"})
            newFile.addRun(Run(run, *[lumi]))
            self.fileset.addFile(newFile)
Example #12
    def testG_LumiMask(self):
        """
        _testG_LumiMask_

        Test that we can use a lumi-mask to filter good runs/lumis.
        """
        splitter = SplitterFactory()

        # Create 3 files with 100 events per lumi:
        # - file1 with 1 run  of 8 lumis
        # - file2 with 2 runs of 2 lumis each
        # - file3 with 1 run  of 5 lumis
        fileA = File(lfn="/this/is/file1", size=1000, events=800)
        fileB = File(lfn="/this/is/file2", size=1000, events=400)
        fileC = File(lfn="/this/is/file3", size=1000, events=500)

        fileA.addRun(Run(1, *range(10, 18)))   # 8 lumis: 10-17
        fileA.setLocation("somese.cern.ch")
        fileB.addRun(Run(2, *range(20, 22)))   # 2 lumis: 20-21
        fileB.addRun(Run(3, *range(30, 32)))   # 2 lumis: 30-31
        fileB.setLocation("somese.cern.ch")
        fileC.addRun(Run(4, *range(40, 45)))   # 5 lumis: 40-44
        fileC.setLocation("somese.cern.ch")

        testFileset = Fileset(name='Fileset')
        testFileset.addFile(fileA)
        testFileset.addFile(fileB)
        testFileset.addFile(fileC)

        testSubscription = Subscription(fileset=testFileset,
                                        workflow=self.testWorkflow,
                                        split_algo="EventAwareLumiBased",
                                        type="Processing")
        jobFactory = splitter(package="WMCore.DataStructs",
                              subscription=testSubscription)

        # Use a lumi-mask = {1: [[10,14]], 2: [[20,21]], 4: [[40,41]]}
        jobGroups = jobFactory(halt_job_on_file_boundaries=False,
                               splitOnRun=False,
                               events_per_job=850,
                               runs=['1', '2', '4'],
                               lumis=['10,14', '20,21', '40,41'],
                               performance=self.performanceParams)

        self.assertEqual(len(jobGroups), 1, "There should be only one job group")
        jobs = jobGroups[0].jobs
        self.assertEqual(len(jobs), 2, "Two jobs must be in the jobgroup")
        self.assertEqual(jobs[0]['mask'].getRunAndLumis(), {1: [[10, 14]], 2: [[20, 21]], 4: [[40, 40]]})
        self.assertEqual(jobs[1]['mask'].getRunAndLumis(), {4: [[41, 41]]})
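
The runs/lumis arguments passed to the job factory are the flat string form of the lumi-mask quoted in the comment above. An illustrative parser (a sketch, not WMCore's actual implementation) showing how they map onto the {run: [[first, last], ...]} dictionary:

    def parseLumiMask(runs, lumis):
        """Pair each run with its 'first,last[,first,last...]' lumi ranges."""
        mask = {}
        for run, lumiStr in zip(runs, lumis):
            edges = [int(x) for x in lumiStr.split(',')]
            # group the flat edge list into [first, last] pairs
            mask[int(run)] = [list(pair) for pair in zip(edges[::2], edges[1::2])]
        return mask

    parseLumiMask(['1', '2', '4'], ['10,14', '20,21', '40,41'])
    # -> {1: [[10, 14]], 2: [[20, 21]], 4: [[40, 41]]}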
Example #13
    def execute(self, *args, **kwargs):

        totalevents = kwargs['task']['tm_totalunits']
        firstEvent = 1
        lastEvent = totalevents
        firstLumi = 1
        lastLumi = 10

        # Set a default of 100 events per lumi.  This is set as a task
        # property, as the splitting considers it independently of the file
        # information provided by the fake dataset.
        if not kwargs['task']['tm_events_per_lumi']:
            kwargs['task']['tm_events_per_lumi'] = 100

        #MC comes with only one MCFakeFile
        singleMCFileset = Fileset(name="MCFakeFileSet")
        newFile = File("MCFakeFile", size=1000, events=totalevents)
        if hasattr(self.config.Sites, 'available'):
            newFile.setLocation(self.config.Sites.available)
        else:
            sbj = SiteDBJSON({"key": self.config.TaskWorker.cmskey,
                              "cert": self.config.TaskWorker.cmscert})
            newFile.setLocation(sbj.getAllCMSNames())
        newFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
        newFile["block"] = 'MCFakeBlock'
        newFile["first_event"] = firstEvent
        newFile["last_event"] = lastEvent
        singleMCFileset.addFile(newFile)

        return Result(task=kwargs['task'], result=singleMCFileset)
Example #14
    def generateFakeMCFile(self,
                           numEvents=100,
                           firstEvent=1,
                           lastEvent=100,
                           firstLumi=1,
                           lastLumi=10,
                           existingSub=None):
        # MC comes with only one MCFakeFile
        newFile = File("MCFakeFileTest", size=1000, events=numEvents)
        newFile.setLocation('se01')
        if firstLumi == lastLumi:
            newFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
        else:
            # note the half-open range: lumis firstLumi .. lastLumi - 1
            newFile.addRun(Run(1, *range(firstLumi, lastLumi)))
        newFile["first_event"] = firstEvent
        newFile["last_event"] = lastEvent

        if existingSub is None:
            singleMCFileset = Fileset(name="MCTestFileset")
            singleMCFileset.addFile(newFile)
            testWorkflow = Workflow()
            existingSub = Subscription(fileset=singleMCFileset,
                                       workflow=testWorkflow,
                                       split_algo="EventBased",
                                       type="Production")
        else:
            existingSub['fileset'].addFile(newFile)

        return existingSub
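
A short usage sketch for the helper above (assumed call pattern, based only on the code shown): the first call builds a fresh single-file MC subscription, and passing it back via existingSub appends further fake files to the same fileset:

    sub = self.generateFakeMCFile(numEvents=200, firstEvent=1, lastEvent=200)
    # append a second fake file to the same subscription
    self.generateFakeMCFile(numEvents=200, firstEvent=201, lastEvent=400,
                            existingSub=sub)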
Example #15
    def setUp(self):
        """
        _setUp_

        Create two subscriptions: One that contains a single file and one that
        contains multiple files.
        """
        self.multipleFileFileset = Fileset(name="TestFileset1")
        for i in range(10):
            newFile = File(makeUUID(), size=1000, events=100)
            newFile.setLocation('blenheim')
            newFile.setLocation('malpaquet')
            lumis = []
            for lumi in range(20):
                lumis.append((i * 100) + lumi)
            newFile.addRun(Run(i, *lumis))
            self.multipleFileFileset.addFile(newFile)

        self.singleFileFileset = Fileset(name="TestFileset2")
        newFile = File("/some/file/name", size=1000, events=100)
        newFile.setLocation('blenheim')
        lumis = list(range(50, 60)) + list(range(70, 80))
        newFile.addRun(Run(13, *lumis))
        self.singleFileFileset.addFile(newFile)

        testWorkflow = Workflow()
        self.multipleFileSubscription = Subscription(
            fileset=self.multipleFileFileset,
            workflow=testWorkflow,
            split_algo="FileBased",
            type="Processing")
        self.singleFileSubscription = Subscription(
            fileset=self.singleFileFileset,
            workflow=testWorkflow,
            split_algo="FileBased",
            type="Processing")

        #self.multipleFileSubscription.create()
        #self.singleFileSubscription.create()

        self.performanceParams = {
            'timePerEvent': 12,
            'memoryRequirement': 2300,
            'sizePerEvent': 400
        }

        return
Example #16
 def testCall(self):
     fileset = Fileset(name="FakeFeederTest")
     for i in range(1, 21):
         self.feeder([fileset])
         files = fileset.getFiles(type="set")
         if len(files) > 0:
             files.pop()
         fileset.commit()
Example #17
    def processDataset(self):
        """
        _processDataset_

        Import the Dataset contents and create a set of jobs from it

        """

        #  //
        # // Now create the job definitions
        #//
        logging.debug("SplitSize = %s" % self.splitSize)
        logging.debug("AllowedSites = %s" % self.allowedSites)
        thefiles = Fileset(name='FilesToSplit')
        reader = DBSReader(self.dbsUrl)
        fileList = reader.dbs.listFiles(
            analysisDataset=self.inputDataset(),
            retriveList=['retrive_block', 'retrive_run'])  # 'retrive' spelling matches the legacy DBS API

        blocks = {}

        for f in fileList:
            block = f['Block']['Name']
            if block not in blocks:
                blocks[block] = reader.listFileBlockLocation(block)
            f['Block']['StorageElementList'].extend(blocks[block])
            wmbsFile = File(f['LogicalFileName'])
            for x in blocks[block]:
                wmbsFile['locations'].add(x)
            wmbsFile['block'] = block
            thefiles.addFile(wmbsFile)

        work = Workflow()
        subs = Subscription(fileset=thefiles,
                            workflow=work,
                            split_algo='FileBased',
                            type="Processing")
        splitter = SplitterFactory()
        jobfactory = splitter(subs)

        jobs = jobfactory(files_per_job=self.splitSize)

        jobDefs = []
        for job in jobs.jobs:
            # job.mask.setMaxAndSkipEvents(-1, 0)
            jobDef = JobDefinition()
            jobDef['LFNS'].extend(job.listLFNs())
            jobDef['SkipEvents'] = 0
            jobDef['MaxEvents'] = -1
            for x in job.listFiles():
                jobDef['SENames'].extend(list(x['locations']))
            jobDefs.append(jobDef)

        return jobDefs
Example #18
    def setUp(self):
        """
        _setUp_

        Create three subscriptions: single-file, multiple-file, and one whose
        only file contains zero events.
        """
        self.multipleFileFileset = Fileset(name="TestFileset1")
        for i in range(10):
            newFile = File(makeUUID(), size=1000, events=100)
            newFile.setLocation('se01')
            self.multipleFileFileset.addFile(newFile)

        self.singleFileFileset = Fileset(name="TestFileset2")
        newFile = File("/some/file/name", size=1000, events=100)
        newFile.setLocation('se02')
        self.singleFileFileset.addFile(newFile)

        self.emptyFileFileset = Fileset(name="TestFileset3")
        newFile = File("/some/file/name", size=1000, events=0)
        newFile.setLocation('se03')
        self.emptyFileFileset.addFile(newFile)

        testWorkflow = Workflow()
        self.multipleFileSubscription = Subscription(
            fileset=self.multipleFileFileset,
            workflow=testWorkflow,
            split_algo="EventBased",
            type="Processing")
        self.singleFileSubscription = Subscription(
            fileset=self.singleFileFileset,
            workflow=testWorkflow,
            split_algo="EventBased",
            type="Processing")
        self.emptyFileSubscription = Subscription(
            fileset=self.emptyFileFileset,
            workflow=testWorkflow,
            split_algo="EventBased",
            type="Processing")

        return
Example #19
    def execute(self, *args, **kwargs):
        self.logger.info(
            "Data discovery and splitting for %s using user-provided files" %
            kwargs['task']['tm_taskname'])

        userfiles = kwargs['task']['tm_arguments'].get('userfiles')
        splitting = kwargs['task']['tm_split_algo']
        total_units = kwargs['task']['tm_totalunits']
        if not userfiles or splitting != 'FileBased':
            if not userfiles:
                msg = "No files specified to process for task %s." % kwargs[
                    'task']['tm_taskname']
            if splitting != 'FileBased':
                msg = "Data.splitting must be set to 'FileBased' when using a custom set of files."
            self.logger.error("Setting %s as failed: %s" %
                              (kwargs['task']['tm_taskname'], msg))
            configreq = {
                'workflow': kwargs['task']['tm_taskname'],
                'status': "FAILED",
                'subresource': 'failure',
                'failure': b64encode(msg)
            }
            self.server.post(self.resturi, data=urllib.urlencode(configreq))
            raise StopHandler(msg)

        if hasattr(self.config.Sites, 'available'):
            locations = self.config.Sites.available
        else:
            sbj = SiteDBJSON({
                "key": self.config.TaskWorker.cmskey,
                "cert": self.config.TaskWorker.cmscert
            })
            locations = sbj.getAllCMSNames()

        userFileset = Fileset(name=kwargs['task']['tm_taskname'])
        self.logger.info("There are %d files specified by the user." %
                         len(userfiles))
        if total_units > 0:
            self.logger.info("Will run over the first %d files." % total_units)
        file_counter = 0
        for idx, userfile in enumerate(userfiles):
            newFile = File(userfile, size=1000, events=1)
            newFile.setLocation(locations)
            newFile.addRun(Run(1, idx))
            newFile["block"] = 'UserFilesFakeBlock'
            newFile["first_event"] = 1
            newFile["last_event"] = 2
            userFileset.addFile(newFile)
            file_counter += 1
            if total_units > 0 and file_counter >= total_units:
                break

        return Result(task=kwargs['task'], result=userFileset)
Example #20
    def __init__(self, subscription=None, jobs=None):
        self.jobs = []
        self.newjobs = []
        self.id = 0

        if isinstance(jobs, list):
            self.newjobs = jobs
        elif jobs is not None:
            self.newjobs = [jobs]

        self.subscription = subscription
        self.output = Fileset()
        self.last_update = datetime.datetime.now()
Example #21
    def __init__(self, subscription=None, jobs=None):
        self.jobs = []
        self.newjobs = []
        self.id = 0

        if isinstance(jobs, list):
            self.newjobs = jobs
        elif jobs is not None:
            self.newjobs = [jobs]

        self.subscription = subscription
        self.output = Fileset()
        self.last_update = datetime.datetime.now()
Example #22
    def testProductionRunNumber(self):
        """
        _testProductionRunNumber_

        Verify that jobs created by production subscriptions have the correct
        run number in their job mask.  Also verify that non-production
        subscriptions don't have modified run numbers.
        """
        testWorkflow = Workflow(spec="spec.pkl",
                                owner="Steve",
                                name="TestWorkflow",
                                task="TestTask")

        testFileset = Fileset(name="TestFileset")
        testFile = File(lfn="someLFN")
        testFileset.addFile(testFile)
        testFileset.commit()

        testSubscription = Subscription(fileset=testFileset,
                                        workflow=testWorkflow,
                                        split_algo="FileBased",
                                        type="Production")

        myJobFactory = JobFactory(subscription=testSubscription)
        testJobGroups = myJobFactory()

        self.assertTrue(len(testJobGroups) > 0)
        for testJobGroup in testJobGroups:
            self.assertTrue(len(testJobGroup.jobs) > 0)
            for job in testJobGroup.jobs:
                self.assertEqual(job["mask"]["FirstRun"], 1,
                                 "Error: First run is wrong.")
                self.assertEqual(job["mask"]["LastRun"], 1,
                                 "Error: Last run is wrong.")

        testSubscription = Subscription(fileset=testFileset,
                                        workflow=testWorkflow,
                                        split_algo="FileBased",
                                        type="Processing")

        myJobFactory = JobFactory(subscription=testSubscription)
        testJobGroups = myJobFactory()

        for testJobGroup in testJobGroups:
            for job in testJobGroup.jobs:
                self.assertEqual(job["mask"]["FirstRun"], None,
                                 "Error: First run is wrong.")
                self.assertEqual(job["mask"]["LastRun"], None,
                                 "Error: Last run is wrong.")

        return
Example #23
    def testAddOutput(self):
        """
        _testAddOutput_

        Tests the addOutput functionality of the DataStructs Workflow.
        """
        filesetA = Fileset(name="filesetA")
        filesetB = Fileset(name="filesetB")
        filesetC = Fileset(name="filesetC")

        testWorkflow = Workflow(spec="test", owner="mnorman")
        testWorkflow.addOutput("out1", filesetA, filesetB)
        testWorkflow.addOutput("out1", filesetB, filesetA)
        testWorkflow.addOutput("out2", filesetC)

        self.assertEqual(len(testWorkflow.outputMap["out1"]), 2,
                         "Error: There should be two mappings for out1.")
        self.assertEqual(len(testWorkflow.outputMap["out2"]), 1,
                         "Error: There should be two mappings for out2.")

        self.assertTrue({
            "output_fileset": filesetA,
            "merged_output_fileset": filesetB
        } in testWorkflow.outputMap["out1"],
                        "Error: Fileset A should be in the output map.")
        self.assertTrue({
            "output_fileset": filesetB,
            "merged_output_fileset": filesetA
        } in testWorkflow.outputMap["out1"],
                        "Error: Fileset B should be in the output map.")

        self.assertEqual(filesetC,
                         testWorkflow.outputMap["out2"][0]["output_fileset"],
                         "Error: Fileset C should be in the output map.")
        self.assertEqual(
            None, testWorkflow.outputMap["out2"][0]["merged_output_fileset"],
            "Error: The merged output should be None.")
        return
Example #24
    def testF_HardLimitSplittingOnly(self):
        """
        _testF_HardLimitSplittingOnly_

        Checks that we can split a set of files where every file has a single
        lumi too big to fit in a runnable job
        """
        splitter = SplitterFactory()

        #Create 3 single-big-lumi files
        testFileset = Fileset(name="FilesetA")
        testFileA = self.createFile("/this/is/file1", 1000, 0, 1,
                                    "somese.cern.ch")
        testFileB = self.createFile("/this/is/file2", 1000, 1, 1,
                                    "somese.cern.ch")
        testFileC = self.createFile("/this/is/file3", 1000, 2, 1,
                                    "somese.cern.ch")
        testFileset.addFile(testFileA)
        testFileset.addFile(testFileB)
        testFileset.addFile(testFileC)

        testSubscription = Subscription(fileset=testFileset,
                                        workflow=self.testWorkflow,
                                        split_algo="EventAwareLumiBased",
                                        type="Processing")
        jobFactory = splitter(package="WMCore.DataStructs",
                              subscription=testSubscription)
        # Settings are to split on job boundaries, to fail single lumis with
        # more than 800 events and to put 550 events per job
        jobGroups = jobFactory(halt_job_on_file_boundaries=True,
                               splitOnRun=True,
                               events_per_job=550,
                               max_events_per_lumi=800,
                               performance=self.performanceParams)

        self.assertEqual(len(jobGroups), 1,
                         "There should be only one job group")
        jobs = jobGroups[0].jobs
        self.assertEqual(len(jobs), 3, "Three jobs must be in the jobgroup")
        for i in range(3):
            self.assertTrue(
                jobs[i]['failedOnCreation'],
                "The job should have been marked as failed on creation")
            self.assertEqual(
                jobs[i]['failedReason'],
                "File /this/is/file%d has too many events (1000) in 1 lumi(s)"
                % (i + 1), "The reason for the failure is not accurate")

        return
Example #25
    def algorithm(self, *args, **kwargs):
        """
        _algorithm_

        Implement merge algorithm for the subscription provided

        """
        fileset = list(self.subscription.availableFiles())

        mergeSize = int(kwargs['merge_size'])
        overflow = bool(kwargs.get('all_files', False))
        fileset.sort()

        accumSize = 0
        jobFiles = Fileset()
        locationDict = self.sortByLocation()
        for location in locationDict:
            baseName = makeUUID()
            self.newGroup()
            for f in locationDict[location]:
                accumSize += f['size']
                jobFiles.addFile(f)
                if accumSize >= mergeSize:
                    self.newJob(name='%s-%s' %
                                (baseName, len(self.currentGroup.jobs) + 1),
                                files=jobFiles)
                    self.currentJob["mask"].setMaxAndSkipEvents(-1, 0)
                    accumSize = 0
                    jobFiles = Fileset()

            if len(jobFiles) > 0:
                if overflow:
                    self.newJob(name='%s-%s' %
                                (baseName, len(self.currentGroup.jobs) + 1),
                                files=jobFiles)
                    self.currentJob["mask"].setMaxAndSkipEvents(-1, 0)
Example #26
class DataDiscovery(TaskAction):
    """I am the abstract class for the data discovery.
       Taking care of generalizing different data discovery
       possibilities. Implementing only a common method to
       return a properly formatted output."""

    def formatOutput(self, task, requestname, datasetfiles, locations):
        """Receives as input the result of the data location
           discovery operations and fill up the WMCore objects."""
        self.logger.debug(" Formatting data discovery output ")
        # TEMPORARY
        secmsmap = {}
        sbj = SiteDBJSON({"key":self.config.MyProxy.serverhostkey,
                          "cert":self.config.MyProxy.serverhostcert})

        wmfiles = []
        lumicounter = evecounter = 0
        for lfn, infos in datasetfiles.items():
            wmfile = File(lfn=lfn, events=infos['NumberOfEvents'], size=infos['Size'], checksums=infos['Checksums'])
            wmfile['block'] = infos['BlockName']
            wmfile['locations'] = []
            if infos['BlockName'] in locations:
                for se in locations[infos['BlockName']]:
                    if se not in secmsmap:
                        self.logger.debug("Translating SE %s" % se)
                        try:
                            secmsmap[se] = sbj.seToCMSName(se)
                        except KeyError:
                            self.logger.error("Impossible to translate %s to a CMS name through SiteDB" % se)
                            secmsmap[se] = ''
                    if se in secmsmap:
                        if isinstance(secmsmap[se], list):
                            wmfile['locations'].extend(secmsmap[se])
                        else:
                            wmfile['locations'].append(secmsmap[se])
                # NB: files whose block has no known location never reach wmfiles
                wmfile['workflow'] = requestname
                evecounter += infos['NumberOfEvents']
                for run, lumis in infos['Lumis'].items():
                    # self.logger.debug(' - adding run %d and lumis %s' % (run, lumis))
                    wmfile.addRun(Run(run, *lumis))
                    lumicounter += len(lumis)
                wmfiles.append(wmfile)

        self.logger.debug('Tot events found: %d' % evecounter)
        self.logger.debug('Tot lumis found: %d' % lumicounter)
        self.logger.debug('Tot files found: %d' % len(wmfiles))

        return Result(task=task, result=Fileset(name='FilesToSplit', files=set(wmfiles)))
Example #27
    def testHardLimitSplitting(self):
        """
        _testHardLimitSplitting_

        Test that we can specify an event limit: the algorithm should take
        single-lumi files with more events than the limit and mark them for
        failure.
        """
        splitter = SplitterFactory()

        # Create 3 files, the one in the middle is a "bad" file
        testFileset = Fileset(name="FilesetA")
        testFileA = self.createFile("/this/is/file1", 1000, 0, 5, "blenheim")
        testFileB = self.createFile("/this/is/file2", 1000, 1, 1, "blenheim")
        testFileC = self.createFile("/this/is/file3", 1000, 2, 2, "blenheim")
        testFileset.addFile(testFileA)
        testFileset.addFile(testFileB)
        testFileset.addFile(testFileC)

        testSubscription = Subscription(fileset=testFileset,
                                        workflow=self.testWorkflow,
                                        split_algo="EventAwareLumiByWork",
                                        type="Processing")
        jobFactory = splitter(package="WMCore.DataStructs",
                              subscription=testSubscription)

        # Settings are to split on job boundaries, to fail single lumis with more than 800 events
        # and to put 550 events per job
        jobGroups = jobFactory(halt_job_on_file_boundaries=True,
                               splitOnRun=True,
                               events_per_job=550,
                               job_time_limit=9600,
                               performance=self.performanceParams)

        self.assertEqual(len(jobGroups), 1)
        jobs = jobGroups[0].jobs
        self.assertEqual(len(jobs), 5)

        # One job should be failed, the rest should be fine
        for jobNum in (0, 1, 3, 4):
            self.assertFalse(jobs[jobNum].get('failedOnCreation'))
        self.assertTrue(jobs[2]['failedOnCreation'])
        self.assertEqual(
            jobs[2]['failedReason'],
            'File /this/is/file2 has a single lumi 1, in run 1 with too many events 1000 and it woud take 12000 sec to run'
        )

        return
Example #28
 def generateFakeMCFile(self, numEvents=100, firstEvent=1,
                        lastEvent=100, firstLumi=1, lastLumi=10):
     # MC comes with only one MCFakeFile
     singleMCFileset = Fileset(name="MCTestFileset")
     newFile = File("MCFakeFileTest", size=1000, events=numEvents)
     newFile.setLocation('se01')
     newFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
     newFile["first_event"] = firstEvent
     newFile["last_event"] = lastEvent
     testWorkflow = Workflow()
     singleMCFileset.addFile(newFile)
     singleMCFileSubscription = Subscription(fileset=singleMCFileset,
                                             workflow=testWorkflow,
                                             split_algo="EventBased",
                                             type="Production")
     return singleMCFileSubscription
Example #29
    def execute(self, *args, **kwargs):
        self.logger.info(
            "Data discovery and splitting for %s using user-provided files" %
            kwargs['task']['tm_taskname'])

        userfiles = kwargs['task']['tm_user_files']
        splitting = kwargs['task']['tm_split_algo']
        total_units = kwargs['task']['tm_totalunits']
        if not userfiles or splitting != 'FileBased':
            if not userfiles:
                msg = "No files specified to process for task %s." % kwargs[
                    'task']['tm_taskname']
            if splitting != 'FileBased':
                msg = "Data.splitting must be set to 'FileBased' when using a custom set of files."
            raise TaskWorkerException(msg)

        if hasattr(self.config.Sites, 'available'):
            locations = self.config.Sites.available
        else:
            with self.config.TaskWorker.envForCMSWEB:
                configDict = {
                    "cacheduration": 1,
                    "pycurl": True
                }  # cache duration is in hours
                resourceCatalog = CRIC(logger=self.logger,
                                       configDict=configDict)
                locations = resourceCatalog.getAllPSNs()

        userFileset = Fileset(name=kwargs['task']['tm_taskname'])
        self.logger.info("There are %d files specified by the user." %
                         len(userfiles))
        if total_units > 0:
            self.logger.info("Will run over the first %d files." % total_units)
        file_counter = 0
        for idx, userfile in enumerate(userfiles):
            newFile = File(userfile, size=1000, events=1)
            newFile.setLocation(locations)
            newFile.addRun(Run(1, idx))
            newFile["block"] = 'UserFilesFakeBlock'
            newFile["first_event"] = 1
            newFile["last_event"] = 2
            userFileset.addFile(newFile)
            file_counter += 1
            if total_units > 0 and file_counter >= total_units:
                break

        return Result(task=kwargs['task'], result=userFileset)
Example #30
    def getFileset(self):
        """
        Get a fileset based on the task

        """

        # NB: 'type' here is the Python builtin (there is no local 'type' variable),
        # so the fileset name does not actually vary by task type
        fileset = Fileset(name='Merge%s' % (type))

        for i in range(0, random.randint(15, 25)):
            # Use the testDir to generate a random lfn
            inpFile = File(lfn="%s/%s.root" % (self.testDir, makeUUID()),
                           size=random.randint(200000, 1000000),
                           events=random.randint(1000, 2000))
            inpFile.setLocation('Megiddo')
            fileset.addFile(inpFile)

        return fileset