def __init__(self, fileset=None, workflow=None, whitelist=None,
             blacklist=None, split_algo="FileBased", type="Processing"):
    if fileset is None:
        fileset = Fileset()
    if whitelist is None:
        whitelist = set()
    if blacklist is None:
        blacklist = set()

    self.setdefault('fileset', fileset)
    self.setdefault('workflow', workflow)
    self.setdefault('type', type)
    self.setdefault('split_algo', split_algo)
    self.setdefault('whitelist', whitelist)
    self.setdefault('blacklist', blacklist)

    self.available = Fileset(name=fileset.name, files=fileset.getFiles())
    self.acquired = Fileset(name='acquired')
    self.completed = Fileset(name='completed')
    self.failed = Fileset(name='failed')
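# A minimal usage sketch of this constructor. The WMCore.DataStructs import
# paths are assumptions inferred from the snippets in this collection; the
# behaviour shown follows directly from the __init__ above.
from WMCore.DataStructs.File import File
from WMCore.DataStructs.Fileset import Fileset
from WMCore.DataStructs.Subscription import Subscription
from WMCore.DataStructs.Workflow import Workflow

fileset = Fileset(name="ExampleFileset")
fileset.addFile(File("/store/example.root", size=1000, events=100))
sub = Subscription(fileset=fileset, workflow=Workflow(),
                   split_algo="FileBased", type="Processing")
# the input fileset is stored under the 'fileset' key, and 'available' is
# seeded with a copy of its files
print(len(sub['fileset'].getFiles()))  # -> 1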
def setUp(self):
    """
    _setUp_

    Create two subscriptions: one that contains a single file and one that
    contains multiple files.
    """
    self.multipleFileFileset = Fileset(name="TestFileset1")
    for i in range(10):
        newFile = File(makeUUID(), size=1000, events=100)
        newFile.setLocation('blenheim')
        newFile.setLocation('malpaquet')
        self.multipleFileFileset.addFile(newFile)

    self.singleFileFileset = Fileset(name="TestFileset2")
    newFile = File("/some/file/name", size=1000, events=100)
    newFile.setLocation('blenheim')
    self.singleFileFileset.addFile(newFile)

    testWorkflow = Workflow()
    self.multipleFileSubscription = Subscription(fileset=self.multipleFileFileset,
                                                 workflow=testWorkflow,
                                                 split_algo="FileBased",
                                                 type="Processing")
    self.singleFileSubscription = Subscription(fileset=self.singleFileFileset,
                                               workflow=testWorkflow,
                                               split_algo="FileBased",
                                               type="Processing")
    # self.multipleFileSubscription.create()
    # self.singleFileSubscription.create()
    return
def setUp(self):
    """
    _setUp_

    Create four subscriptions: single-file and multiple-file filesets, plus
    single-lumi and multiple-lumi variants.
    """
    self.multipleFileFileset = Fileset(name="TestFileset1")
    for i in range(10):
        newFile = File(makeUUID(), size=1000, events=100)
        newFile.addRun(Run(i, *[45 + i]))
        self.multipleFileFileset.addFile(newFile)

    self.singleFileFileset = Fileset(name="TestFileset2")
    newFile = File("/some/file/name", size=1000, events=100)
    newFile.addRun(Run(1, *[45]))
    self.singleFileFileset.addFile(newFile)

    self.multipleFileLumiset = Fileset(name="TestFileset3")
    for i in range(10):
        newFile = File(makeUUID(), size=1000, events=100)
        # integer division: the lumi number advances every three files
        newFile.addRun(Run(1, *[45 + i // 3]))
        self.multipleFileLumiset.addFile(newFile)

    self.singleLumiFileset = Fileset(name="TestFileset4")
    for i in range(10):
        newFile = File(makeUUID(), size=1000, events=100)
        newFile.addRun(Run(1, *[45]))
        self.singleLumiFileset.addFile(newFile)

    testWorkflow = Workflow()
    self.multipleFileSubscription = Subscription(fileset=self.multipleFileFileset,
                                                 workflow=testWorkflow,
                                                 split_algo="FixedDelay",
                                                 type="Processing")
    self.singleFileSubscription = Subscription(fileset=self.singleFileFileset,
                                               workflow=testWorkflow,
                                               split_algo="FixedDelay",
                                               type="Processing")
    self.multipleLumiSubscription = Subscription(fileset=self.multipleFileLumiset,
                                                 workflow=testWorkflow,
                                                 split_algo="FixedDelay",
                                                 type="Processing")
    self.singleLumiSubscription = Subscription(fileset=self.singleLumiFileset,
                                               workflow=testWorkflow,
                                               split_algo="FixedDelay",
                                               type="Processing")
    return
def setUp(self):
    """
    _setUp_

    Create three subscriptions: single-file, multiple-file and
    multiple-site filesets.
    """
    self.multipleFileFileset = Fileset(name="TestFileset1")
    for i in range(10):
        newFile = File(makeUUID(), size=1000, events=100,
                       locations=set(["somese.cern.ch"]))
        self.multipleFileFileset.addFile(newFile)

    self.singleFileFileset = Fileset(name="TestFileset2")
    newFile = File("/some/file/name", size=1000, events=100,
                   locations=set(["somese.cern.ch"]))
    self.singleFileFileset.addFile(newFile)

    self.multipleSiteFileset = Fileset(name="TestFileset3")
    for i in range(5):
        newFile = File(makeUUID(), size=1000, events=100,
                       locations=set(["somese.cern.ch"]))
        newFile.setLocation("somese.cern.ch")
        self.multipleSiteFileset.addFile(newFile)
    for i in range(5):
        newFile = File(makeUUID(), size=1000, events=100)
        newFile.setLocation(["somese.cern.ch", "otherse.cern.ch"])
        self.multipleSiteFileset.addFile(newFile)

    testWorkflow = Workflow()
    self.multipleFileSubscription = Subscription(fileset=self.multipleFileFileset,
                                                 workflow=testWorkflow,
                                                 split_algo="SizeBased",
                                                 type="Processing")
    self.singleFileSubscription = Subscription(fileset=self.singleFileFileset,
                                               workflow=testWorkflow,
                                               split_algo="SizeBased",
                                               type="Processing")
    self.multipleSiteSubscription = Subscription(fileset=self.multipleSiteFileset,
                                                 workflow=testWorkflow,
                                                 split_algo="EventBased",
                                                 type="Processing")
    return
def setUp(self):
    """
    _setUp_

    Create three subscriptions: single-file, multiple-file and
    zero-event filesets.
    """
    self.multipleFileFileset = Fileset(name="TestFileset1")
    for i in range(10):
        newFile = File(makeUUID(), size=1000, events=100)
        newFile.setLocation('se01')
        self.multipleFileFileset.addFile(newFile)

    self.singleFileFileset = Fileset(name="TestFileset2")
    newFile = File("/some/file/name", size=1000, events=100)
    newFile.setLocation('se02')
    self.singleFileFileset.addFile(newFile)

    self.emptyFileFileset = Fileset(name="TestFileset3")
    newFile = File("/some/file/name", size=1000, events=0)
    newFile.setLocation('se03')
    self.emptyFileFileset.addFile(newFile)

    testWorkflow = Workflow()
    self.multipleFileSubscription = Subscription(fileset=self.multipleFileFileset,
                                                 workflow=testWorkflow,
                                                 split_algo="EventBased",
                                                 type="Processing")
    self.singleFileSubscription = Subscription(fileset=self.singleFileFileset,
                                               workflow=testWorkflow,
                                               split_algo="EventBased",
                                               type="Processing")
    self.emptyFileSubscription = Subscription(fileset=self.emptyFileFileset,
                                              workflow=testWorkflow,
                                              split_algo="EventBased",
                                              type="Processing")

    self.eventsPerJob = 100
    self.performanceParams = {'timePerEvent': None,
                              'memoryRequirement': 2300,
                              'sizePerEvent': 400}
    return
def testCommit(self):
    """
    Testcase for the commit method of the Fileset class
    """
    localTestFileSet = Fileset('LocalTestFileset', self.initialSet)
    fsSize = len(localTestFileSet.getFiles(type="lfn"))
    # Dummy file to test
    fileTestCommit = File('/tmp/filetestcommit', 0000, 1, 1)
    # File is added to the newfiles attribute of localTestFileSet
    localTestFileSet.addFile(fileTestCommit)
    assert fsSize == len(localTestFileSet.getFiles(type="lfn")) - 1, \
        'file not added correctly to test fileset'
    newfilestemp = localTestFileSet.newfiles
    assert fileTestCommit in newfilestemp, 'test file not in the new files list'
    # After commit, the dummy file is supposed to move from newfiles to files
    localTestFileSet.commit()
    # First, test that the new file is present in the files attribute of the Fileset
    assert newfilestemp.issubset(localTestFileSet.files), \
        'Test file not present in fileset.files - fileset.commit not working properly'
    # Second, test that the newfiles attribute is empty again
    assert localTestFileSet.newfiles == set(), \
        'Test file still present in fileset.newfiles - fileset.commit not working properly'
def testF_HardLimitSplittingOnly(self):
    """
    _testF_HardLimitSplittingOnly_

    Checks that we can split a set of files where every file has a single
    lumi too big to fit in a runnable job
    """
    splitter = SplitterFactory()

    # Create 3 single-big-lumi files
    testFileset = Fileset(name="FilesetA")
    testFileA = self.createFile("/this/is/file1", 1000, 0, 1, "somese.cern.ch")
    testFileB = self.createFile("/this/is/file2", 1000, 1, 1, "somese.cern.ch")
    testFileC = self.createFile("/this/is/file3", 1000, 2, 1, "somese.cern.ch")
    testFileset.addFile(testFileA)
    testFileset.addFile(testFileB)
    testFileset.addFile(testFileC)

    testSubscription = Subscription(fileset=testFileset,
                                    workflow=self.testWorkflow,
                                    split_algo="EventAwareLumiBased",
                                    type="Processing")
    jobFactory = splitter(package="WMCore.DataStructs",
                          subscription=testSubscription)
    # Settings are to split on job boundaries, to fail single lumis with
    # more than 800 events and to put 550 events per job
    jobGroups = jobFactory(halt_job_on_file_boundaries=True,
                           splitOnRun=True,
                           events_per_job=550,
                           job_time_limit=9600,
                           performance=self.performanceParams)

    self.assertEqual(len(jobGroups), 1, "There should be only one job group")
    jobs = jobGroups[0].jobs
    self.assertEqual(len(jobs), 3, "Three jobs must be in the jobgroup")
    for i in range(0, 3):
        self.assertTrue(jobs[i]['failedOnCreation'], "It should have been marked as failed")

        runNums = list(jobs[i]['mask']['runAndLumis'].keys())
        self.assertEqual(len(runNums), 1)

        lumiNums = list(jobs[i]['mask']['runAndLumis'].values())[0]
        self.assertEqual(len(lumiNums), 1)

        finalLumi = []
        for pair in lumiNums:
            finalLumi.extend(range(pair[0], pair[1] + 1))
        self.assertEqual(len(finalLumi), 1)

        # the expected string (including the 'woud' typo) matches the
        # splitter's failure message verbatim
        self.assertEqual(jobs[i]['failedReason'],
                         "File /this/is/file%d has a single lumi %s, in run %s with too many events 1000 and it woud take 12000 sec to run"
                         % (i + 1, finalLumi[0], runNums[0]))
    return
def testHardLimitSplittingOnly(self):
    """
    _testHardLimitSplittingOnly_

    Checks that we can split a set of files where every file has a single
    lumi too big to fit in a runnable job
    """
    splitter = SplitterFactory()

    # Create 3 single-big-lumi files
    testFileset = Fileset(name="FilesetA")
    testFileA = self.createFile("/this/is/file1", 1000, 0, 1, "somese.cern.ch")
    testFileB = self.createFile("/this/is/file2", 1000, 1, 1, "somese.cern.ch")
    testFileC = self.createFile("/this/is/file3", 1000, 2, 1, "somese.cern.ch")
    testFileset.addFile(testFileA)
    testFileset.addFile(testFileB)
    testFileset.addFile(testFileC)

    testSubscription = Subscription(fileset=testFileset,
                                    workflow=self.testWorkflow,
                                    split_algo="EventAwareLumiByWork",
                                    type="Processing")
    jobFactory = splitter(package="WMCore.DataStructs",
                          subscription=testSubscription)
    # Fail single lumis with more than 800 events and put 550 events per job
    jobGroups = jobFactory(halt_job_on_file_boundaries=True,
                           splitOnRun=True,
                           events_per_job=550,
                           job_time_limit=9600,
                           performance=self.performanceParams)

    self.assertEqual(len(jobGroups), 1)
    jobs = jobGroups[0].jobs
    self.assertEqual(len(jobs), 3)
    for job in jobs:
        self.assertTrue(job['failedOnCreation'])
        # the expected substring (including the 'woud' typo) matches the
        # splitter's failure message verbatim
        self.assertIn(' with too many events 1000 and it woud take 12000 sec to run',
                      job['failedReason'])
    return
def createSubscription(self, nFiles, lumisPerFile, twoSites=False, nEventsPerFile=100):
    """
    _createSubscription_

    Create a subscription for testing
    """
    baseName = makeUUID()

    testFileset = Fileset(name=baseName)
    for i in range(nFiles):
        newFile = self.createFile('%s_%i' % (baseName, i), nEventsPerFile,
                                  i, lumisPerFile, 'blenheim')
        testFileset.addFile(newFile)
    if twoSites:
        for i in range(nFiles):
            newFile = self.createFile('%s_%i_2' % (baseName, i), nEventsPerFile,
                                      i, lumisPerFile, 'malpaquet')
            testFileset.addFile(newFile)

    testSubscription = Subscription(fileset=testFileset,
                                    workflow=self.testWorkflow,
                                    split_algo="EventAwareLumiBased",
                                    type="Processing")
    return testSubscription
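# A typical call, as the lumi-based tests in this collection would make it
# (the parameter values here are illustrative only):
# five files of two lumis each, replicated at both test sites
sub = self.createSubscription(nFiles=5, lumisPerFile=2, twoSites=True)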
def execute(self, *args, **kwargs):  # pylint: disable=unused-argument
    # since https://github.com/dmwm/CRABServer/issues/5633 totalunits can be a float
    # but that would confuse WMCore, therefore cast to int
    totalevents = int(kwargs['task']['tm_totalunits'])
    firstEvent = 1
    lastEvent = totalevents
    firstLumi = 1
    lastLumi = 10

    # Set a default of 100 events per lumi. This is set as a task
    # property, as the splitting considers it independently of the file
    # information provided by the fake dataset.
    if not kwargs['task']['tm_events_per_lumi']:
        kwargs['task']['tm_events_per_lumi'] = 100

    # MC comes with only one MCFakeFile
    singleMCFileset = Fileset(name="MCFakeFileSet")
    newFile = File("MCFakeFile", size=1000, events=totalevents)
    newFile.setLocation(self.getListOfSites())
    newFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
    newFile["block"] = 'MCFakeBlock'
    newFile["first_event"] = firstEvent
    newFile["last_event"] = lastEvent
    singleMCFileset.addFile(newFile)

    return Result(task=kwargs['task'], result=singleMCFileset)
def setUp(self):
    """
    Create a dummy fileset and populate it with random files,
    in order to use it for the testcase methods
    """
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                        datefmt='%m-%d %H:%M',
                        filename=__file__.replace('.py', '.log'),
                        filemode='w')
    self.logger = logging.getLogger('FilesetClassTest')

    # Set up the initial testcase environment:
    initialfile = File('/tmp/lfn1', 1000, 1, 1, 1)
    self.initialSet = set()
    self.initialSet.add(initialfile)

    # Create a Fileset containing an initial file
    self.fileset = Fileset(name='self.fileset', files=self.initialSet)

    # Populate the fileset with random files
    for i in range(1, 1000):
        lfn = '/store/data/%s/%s/file.root' % (random.randint(1000, 9999),
                                               random.randint(1000, 9999))
        size = random.randint(1000, 2000)
        events = 1000
        run = random.randint(0, 2000)
        lumi = random.randint(0, 8)

        newFile = File(lfn=lfn, size=size, events=events, checksums={"cksum": "1"})
        newFile.addRun(Run(run, *[lumi]))
        self.fileset.addFile(newFile)
def testG_LumiMask(self):
    """
    _testG_LumiMask_

    Test that we can use a lumi-mask to filter good runs/lumis.
    """
    splitter = SplitterFactory()

    # Create 3 files with 100 events per lumi:
    # - file1 with 1 run  of 8 lumis
    # - file2 with 2 runs of 2 lumis each
    # - file3 with 1 run  of 5 lumis
    fileA = File(lfn="/this/is/file1", size=1000, events=800)
    fileB = File(lfn="/this/is/file2", size=1000, events=400)
    fileC = File(lfn="/this/is/file3", size=1000, events=500)

    lumiListA = []
    for lumi in range(8):
        lumiListA.append(10 + lumi)
    fileA.addRun(Run(1, *lumiListA))
    fileA.setLocation("somese.cern.ch")

    lumiListB1 = []
    lumiListB2 = []
    for lumi in range(2):
        lumiListB1.append(20 + lumi)
        lumiListB2.append(30 + lumi)
    fileB.addRun(Run(2, *lumiListB1))
    fileB.addRun(Run(3, *lumiListB2))
    fileB.setLocation("somese.cern.ch")

    lumiListC = []
    for lumi in range(5):
        lumiListC.append(40 + lumi)
    fileC.addRun(Run(4, *lumiListC))
    fileC.setLocation("somese.cern.ch")

    testFileset = Fileset(name='Fileset')
    testFileset.addFile(fileA)
    testFileset.addFile(fileB)
    testFileset.addFile(fileC)

    testSubscription = Subscription(fileset=testFileset,
                                    workflow=self.testWorkflow,
                                    split_algo="EventAwareLumiBased",
                                    type="Processing")
    jobFactory = splitter(package="WMCore.DataStructs",
                          subscription=testSubscription)

    # Use a lumi-mask = {1: [[10, 14]], 2: [[20, 21]], 4: [[40, 41]]}.
    # The mask selects 9 lumis at 100 events each; with 850 events per job
    # the first job takes the first 8 masked lumis (800 events) and the
    # ninth lumi spills into a second job.
    jobGroups = jobFactory(halt_job_on_file_boundaries=False,
                           splitOnRun=False,
                           events_per_job=850,
                           runs=['1', '2', '4'],
                           lumis=['10,14', '20,21', '40,41'],
                           performance=self.performanceParams)

    self.assertEqual(len(jobGroups), 1, "There should be only one job group")
    jobs = jobGroups[0].jobs
    self.assertEqual(len(jobs), 2, "Two jobs must be in the jobgroup")
    self.assertEqual(jobs[0]['mask'].getRunAndLumis(),
                     {1: [[10, 14]], 2: [[20, 21]], 4: [[40, 40]]})
    self.assertEqual(jobs[1]['mask'].getRunAndLumis(), {4: [[41, 41]]})
def execute(self, *args, **kwargs):
    totalevents = kwargs['task']['tm_totalunits']
    firstEvent = 1
    lastEvent = totalevents
    firstLumi = 1
    lastLumi = 10

    # Set a default of 100 events per lumi. This is set as a task
    # property, as the splitting considers it independently of the file
    # information provided by the fake dataset.
    if not kwargs['task']['tm_events_per_lumi']:
        kwargs['task']['tm_events_per_lumi'] = 100

    # MC comes with only one MCFakeFile
    singleMCFileset = Fileset(name="MCFakeFileSet")
    newFile = File("MCFakeFile", size=1000, events=totalevents)
    if hasattr(self.config.Sites, 'available'):
        newFile.setLocation(self.config.Sites.available)
    else:
        sbj = SiteDBJSON({"key": self.config.TaskWorker.cmskey,
                          "cert": self.config.TaskWorker.cmscert})
        newFile.setLocation(sbj.getAllCMSNames())
    newFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
    newFile["block"] = 'MCFakeBlock'
    newFile["first_event"] = firstEvent
    newFile["last_event"] = lastEvent
    singleMCFileset.addFile(newFile)

    return Result(task=kwargs['task'], result=singleMCFileset)
def generateFakeMCFile(self, numEvents=100, firstEvent=1, lastEvent=100,
                       firstLumi=1, lastLumi=10, existingSub=None):
    # MC comes with only one MCFakeFile
    newFile = File("MCFakeFileTest", size=1000, events=numEvents)
    newFile.setLocation('se01')
    if firstLumi == lastLumi:
        # extend the range so the file still carries exactly one lumi
        newFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
    else:
        newFile.addRun(Run(1, *range(firstLumi, lastLumi)))
    newFile["first_event"] = firstEvent
    newFile["last_event"] = lastEvent

    if existingSub is None:
        singleMCFileset = Fileset(name="MCTestFileset")
        singleMCFileset.addFile(newFile)
        testWorkflow = Workflow()
        existingSub = Subscription(fileset=singleMCFileset,
                                   workflow=testWorkflow,
                                   split_algo="EventBased",
                                   type="Production")
    else:
        existingSub['fileset'].addFile(newFile)
    return existingSub
def setUp(self):
    """
    _setUp_

    Create two subscriptions: one that contains a single file and one that
    contains multiple files.
    """
    self.multipleFileFileset = Fileset(name="TestFileset1")
    for i in range(10):
        newFile = File(makeUUID(), size=1000, events=100)
        newFile.setLocation('blenheim')
        newFile.setLocation('malpaquet')
        lumis = []
        for lumi in range(20):
            lumis.append((i * 100) + lumi)
        newFile.addRun(Run(i, *lumis))
        self.multipleFileFileset.addFile(newFile)

    self.singleFileFileset = Fileset(name="TestFileset2")
    newFile = File("/some/file/name", size=1000, events=100)
    newFile.setLocation('blenheim')
    lumis = list(range(50, 60)) + list(range(70, 80))
    newFile.addRun(Run(13, *lumis))
    self.singleFileFileset.addFile(newFile)

    testWorkflow = Workflow()
    self.multipleFileSubscription = Subscription(fileset=self.multipleFileFileset,
                                                 workflow=testWorkflow,
                                                 split_algo="FileBased",
                                                 type="Processing")
    self.singleFileSubscription = Subscription(fileset=self.singleFileFileset,
                                               workflow=testWorkflow,
                                               split_algo="FileBased",
                                               type="Processing")
    # self.multipleFileSubscription.create()
    # self.singleFileSubscription.create()

    self.performanceParams = {'timePerEvent': 12,
                              'memoryRequirement': 2300,
                              'sizePerEvent': 400}
    return
def testCall(self):
    fileset = Fileset(name="FakeFeederTest")
    for i in range(1, 21):
        self.feeder([fileset])
        files = fileset.getFiles(type="set")
        if len(files) > 0:
            files.pop()
        fileset.commit()
def processDataset(self):
    """
    _processDataset_

    Import the Dataset contents and create a set of jobs from it
    """
    #  //
    # // Now create the job definitions
    #//
    logging.debug("SplitSize = %s" % self.splitSize)
    logging.debug("AllowedSites = %s" % self.allowedSites)

    thefiles = Fileset(name='FilesToSplit')
    reader = DBSReader(self.dbsUrl)
    # note: 'retriveList' and 'retrive_*' are the spellings this DBS API expects
    fileList = reader.dbs.listFiles(analysisDataset=self.inputDataset(),
                                    retriveList=['retrive_block', 'retrive_run'])

    blocks = {}
    for f in fileList:
        block = f['Block']['Name']
        if block not in blocks:
            blocks[block] = reader.listFileBlockLocation(block)
        f['Block']['StorageElementList'].extend(blocks[block])
        wmbsFile = File(f['LogicalFileName'])
        for x in blocks[block]:
            wmbsFile['locations'].add(x)
        wmbsFile['block'] = block
        thefiles.addFile(wmbsFile)

    work = Workflow()
    subs = Subscription(fileset=thefiles,
                        workflow=work,
                        split_algo='FileBased',
                        type="Processing")
    splitter = SplitterFactory()
    jobfactory = splitter(subs)
    jobs = jobfactory(files_per_job=self.splitSize)

    jobDefs = []
    for job in jobs.jobs:
        # job.mask.setMaxAndSkipEvents(-1, 0)
        jobDef = JobDefinition()
        jobDef['LFNS'].extend(job.listLFNs())
        jobDef['SkipEvents'] = 0
        jobDef['MaxEvents'] = -1
        for x in job.listFiles():
            jobDef['SENames'].extend(list(x['locations']))
        jobDefs.append(jobDef)
    return jobDefs
def setUp(self):
    """
    _setUp_

    Create three subscriptions: single-file, multiple-file and
    zero-event filesets.
    """
    self.multipleFileFileset = Fileset(name="TestFileset1")
    for i in range(10):
        newFile = File(makeUUID(), size=1000, events=100)
        newFile.setLocation('se01')
        self.multipleFileFileset.addFile(newFile)

    self.singleFileFileset = Fileset(name="TestFileset2")
    newFile = File("/some/file/name", size=1000, events=100)
    newFile.setLocation('se02')
    self.singleFileFileset.addFile(newFile)

    self.emptyFileFileset = Fileset(name="TestFileset3")
    newFile = File("/some/file/name", size=1000, events=0)
    newFile.setLocation('se03')
    self.emptyFileFileset.addFile(newFile)

    testWorkflow = Workflow()
    self.multipleFileSubscription = Subscription(fileset=self.multipleFileFileset,
                                                 workflow=testWorkflow,
                                                 split_algo="EventBased",
                                                 type="Processing")
    self.singleFileSubscription = Subscription(fileset=self.singleFileFileset,
                                               workflow=testWorkflow,
                                               split_algo="EventBased",
                                               type="Processing")
    self.emptyFileSubscription = Subscription(fileset=self.emptyFileFileset,
                                              workflow=testWorkflow,
                                              split_algo="EventBased",
                                              type="Processing")
    return
def execute(self, *args, **kwargs):
    self.logger.info("Data discovery and splitting for %s using user-provided files"
                     % kwargs['task']['tm_taskname'])

    userfiles = kwargs['task']['tm_arguments'].get('userfiles')
    splitting = kwargs['task']['tm_split_algo']
    total_units = kwargs['task']['tm_totalunits']
    if not userfiles or splitting != 'FileBased':
        if not userfiles:
            msg = "No files specified to process for task %s." % kwargs['task']['tm_taskname']
        if splitting != 'FileBased':
            msg = "Data.splitting must be set to 'FileBased' when using a custom set of files."
        self.logger.error("Setting %s as failed: %s" % (kwargs['task']['tm_taskname'], msg))
        configreq = {'workflow': kwargs['task']['tm_taskname'],
                     'status': "FAILED",
                     'subresource': 'failure',
                     'failure': b64encode(msg)}
        self.server.post(self.resturi, data=urllib.urlencode(configreq))
        raise StopHandler(msg)

    if hasattr(self.config.Sites, 'available'):
        locations = self.config.Sites.available
    else:
        sbj = SiteDBJSON({"key": self.config.TaskWorker.cmskey,
                          "cert": self.config.TaskWorker.cmscert})
        locations = sbj.getAllCMSNames()

    userFileset = Fileset(name=kwargs['task']['tm_taskname'])
    self.logger.info("There are %d files specified by the user." % len(userfiles))
    if total_units > 0:
        self.logger.info("Will run over the first %d files." % total_units)
    file_counter = 0
    for idx, userfile in enumerate(userfiles):
        newFile = File(userfile, size=1000, events=1)
        newFile.setLocation(locations)
        newFile.addRun(Run(1, idx))
        newFile["block"] = 'UserFilesFakeBlock'
        newFile["first_event"] = 1
        newFile["last_event"] = 2
        userFileset.addFile(newFile)
        file_counter += 1
        if total_units > 0 and file_counter >= total_units:
            break

    return Result(task=kwargs['task'], result=userFileset)
def __init__(self, subscription=None, jobs=None):
    self.jobs = []
    self.newjobs = []
    self.id = 0

    if type(jobs) == list:
        self.newjobs = jobs
    elif jobs != None:
        self.newjobs = [jobs]

    self.subscription = subscription
    self.output = Fileset()
    self.last_update = datetime.datetime.now()
def __init__(self, subscription=None, jobs=None):
    self.jobs = []
    self.newjobs = []
    self.id = 0

    if isinstance(jobs, list):
        self.newjobs = jobs
    elif jobs is not None:
        self.newjobs = [jobs]

    self.subscription = subscription
    self.output = Fileset()
    self.last_update = datetime.datetime.now()
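# The two constructor paths can be exercised directly; jobA, jobB and jobC
# are hypothetical Job objects used only for illustration:
group = JobGroup(subscription=None, jobs=[jobA, jobB])
assert group.newjobs == [jobA, jobB]   # a list of jobs is taken as-is
single = JobGroup(jobs=jobC)
assert single.newjobs == [jobC]        # a single job is wrapped in a list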
def testProductionRunNumber(self):
    """
    _testProductionRunNumber_

    Verify that jobs created by production subscriptions have the correct
    run number in their job mask.  Also verify that non-production
    subscriptions don't have modified run numbers.
    """
    testWorkflow = Workflow(spec="spec.pkl", owner="Steve",
                            name="TestWorkflow", task="TestTask")

    testFileset = Fileset(name="TestFileset")
    testFile = File(lfn="someLFN")
    testFileset.addFile(testFile)
    testFileset.commit()

    testSubscription = Subscription(fileset=testFileset,
                                    workflow=testWorkflow,
                                    split_algo="FileBased",
                                    type="Production")

    myJobFactory = JobFactory(subscription=testSubscription)
    testJobGroups = myJobFactory()

    self.assertTrue(len(testJobGroups) > 0)
    for testJobGroup in testJobGroups:
        self.assertTrue(len(testJobGroup.jobs) > 0)
        for job in testJobGroup.jobs:
            self.assertEqual(job["mask"]["FirstRun"], 1,
                             "Error: First run is wrong.")
            self.assertEqual(job["mask"]["LastRun"], 1,
                             "Error: Last run is wrong.")

    testSubscription = Subscription(fileset=testFileset,
                                    workflow=testWorkflow,
                                    split_algo="FileBased",
                                    type="Processing")
    myJobFactory = JobFactory(subscription=testSubscription)
    testJobGroups = myJobFactory()

    for testJobGroup in testJobGroups:
        for job in testJobGroup.jobs:
            self.assertEqual(job["mask"]["FirstRun"], None,
                             "Error: First run is wrong.")
            self.assertEqual(job["mask"]["LastRun"], None,
                             "Error: Last run is wrong.")
    return
def testAddOutput(self):
    """
    _testAddOutput_

    Tests the addOutput functionality of the DataStructs Workflow.
    """
    filesetA = Fileset(name="filesetA")
    filesetB = Fileset(name="filesetB")
    filesetC = Fileset(name="filesetC")

    testWorkflow = Workflow(spec="test", owner="mnorman")
    testWorkflow.addOutput("out1", filesetA, filesetB)
    testWorkflow.addOutput("out1", filesetB, filesetA)
    testWorkflow.addOutput("out2", filesetC)

    self.assertEqual(len(testWorkflow.outputMap["out1"]), 2,
                     "Error: There should be two mappings for out1.")
    self.assertEqual(len(testWorkflow.outputMap["out2"]), 1,
                     "Error: There should be one mapping for out2.")

    self.assertTrue({"output_fileset": filesetA,
                     "merged_output_fileset": filesetB} in testWorkflow.outputMap["out1"],
                    "Error: Fileset A should be in the output map.")
    self.assertTrue({"output_fileset": filesetB,
                     "merged_output_fileset": filesetA} in testWorkflow.outputMap["out1"],
                    "Error: Fileset B should be in the output map.")

    self.assertEqual(filesetC, testWorkflow.outputMap["out2"][0]["output_fileset"],
                     "Error: Fileset C should be in the output map.")
    self.assertEqual(None, testWorkflow.outputMap["out2"][0]["merged_output_fileset"],
                     "Error: The merged output should be None.")
    return
def testF_HardLimitSplittingOnly(self):
    """
    _testF_HardLimitSplittingOnly_

    Checks that we can split a set of files where every file has a single
    lumi too big to fit in a runnable job
    """
    splitter = SplitterFactory()

    # Create 3 single-big-lumi files
    testFileset = Fileset(name="FilesetA")
    testFileA = self.createFile("/this/is/file1", 1000, 0, 1, "somese.cern.ch")
    testFileB = self.createFile("/this/is/file2", 1000, 1, 1, "somese.cern.ch")
    testFileC = self.createFile("/this/is/file3", 1000, 2, 1, "somese.cern.ch")
    testFileset.addFile(testFileA)
    testFileset.addFile(testFileB)
    testFileset.addFile(testFileC)

    testSubscription = Subscription(fileset=testFileset,
                                    workflow=self.testWorkflow,
                                    split_algo="EventAwareLumiBased",
                                    type="Processing")
    jobFactory = splitter(package="WMCore.DataStructs",
                          subscription=testSubscription)
    # Settings are to split on job boundaries, to fail single lumis with
    # more than 800 events and to put 550 events per job
    jobGroups = jobFactory(halt_job_on_file_boundaries=True,
                           splitOnRun=True,
                           events_per_job=550,
                           max_events_per_lumi=800,
                           performance=self.performanceParams)

    self.assertEqual(len(jobGroups), 1, "There should be only one job group")
    jobs = jobGroups[0].jobs
    self.assertEqual(len(jobs), 3, "Three jobs must be in the jobgroup")
    for i in range(1, 4):
        self.assertTrue(jobs[i - 1]['failedOnCreation'],
                        "The job processing file %d should be marked for failure" % i)
        self.assertEqual(jobs[i - 1]['failedReason'],
                         "File /this/is/file%d has too many events (1000) in 1 lumi(s)" % i,
                         "The reason for the failure is not accurate")
    return
def algorithm(self, *args, **kwargs):
    """
    _algorithm_

    Implement merge algorithm for the subscription provided
    """
    fileset = list(self.subscription.availableFiles())

    mergeSize = int(kwargs['merge_size'])
    overflow = bool(kwargs.get('all_files', False))
    fileset.sort()

    accumSize = 0
    jobFiles = Fileset()
    locationDict = self.sortByLocation()
    for location in locationDict:
        baseName = makeUUID()
        self.newGroup()
        for f in locationDict[location]:
            accumSize += f['size']
            jobFiles.addFile(f)
            # cut a job as soon as the accumulated size reaches the threshold
            if accumSize >= mergeSize:
                self.newJob(name='%s-%s' % (baseName, len(self.currentGroup.jobs) + 1),
                            files=jobFiles)
                self.currentJob["mask"].setMaxAndSkipEvents(-1, 0)
                accumSize = 0
                jobFiles = Fileset()
        # leftover files only become a job when 'all_files' is set
        if len(jobFiles) > 0:
            if overflow:
                self.newJob(name='%s-%s' % (baseName, len(self.currentGroup.jobs) + 1),
                            files=jobFiles)
                self.currentJob["mask"].setMaxAndSkipEvents(-1, 0)
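# The per-location loop above is a plain greedy accumulator: add files until
# the running size crosses merge_size, then cut a job. A standalone sketch of
# the same grouping logic (a hypothetical illustration, not WMCore code):
def group_by_size(sizes, merge_size, all_files=False):
    """Greedily group sizes until each group reaches merge_size."""
    groups, current, accum = [], [], 0
    for s in sizes:
        current.append(s)
        accum += s
        if accum >= merge_size:
            groups.append(current)
            current, accum = [], 0
    if current and all_files:  # leftovers are only kept on request
        groups.append(current)
    return groups

print(group_by_size([400, 500, 300, 200], merge_size=800, all_files=True))
# -> [[400, 500], [300, 200]]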
class DataDiscovery(TaskAction):
    """I am the abstract class for the data discovery.
    Taking care of generalizing different data discovery
    possibilities. Implementing only a common method to
    return a properly formatted output."""

    def formatOutput(self, task, requestname, datasetfiles, locations):
        """Receives as input the result of the data location discovery
        operations and fills up the WMCore objects."""
        self.logger.debug(" Formatting data discovery output ")
        # TEMPORARY
        secmsmap = {}
        sbj = SiteDBJSON({"key": self.config.MyProxy.serverhostkey,
                          "cert": self.config.MyProxy.serverhostcert})

        wmfiles = []
        lumicounter = evecounter = 0
        for lfn, infos in datasetfiles.iteritems():
            wmfile = File(lfn=lfn, events=infos['NumberOfEvents'],
                          size=infos['Size'], checksums=infos['Checksums'])
            wmfile['block'] = infos['BlockName']
            wmfile['locations'] = []
            if infos['BlockName'] in locations:
                for se in locations[infos['BlockName']]:
                    if se not in secmsmap:
                        self.logger.debug("Translating SE %s" % se)
                        try:
                            secmsmap[se] = sbj.seToCMSName(se)
                        except KeyError, ke:
                            self.logger.error("Impossible translating %s to a CMS name through SiteDB" % se)
                            secmsmap[se] = ''
                    if se in secmsmap:
                        if isinstance(secmsmap[se], list):
                            wmfile['locations'].extend(secmsmap[se])
                        else:
                            wmfile['locations'].append(secmsmap[se])
            wmfile['workflow'] = requestname
            evecounter += infos['NumberOfEvents']
            for run, lumis in infos['Lumis'].iteritems():
                # self.logger.debug(' - adding run %d and lumis %s' % (run, lumis))
                wmfile.addRun(Run(run, *lumis))
                lumicounter += len(lumis)
            wmfiles.append(wmfile)

        self.logger.debug('Tot events found: %d' % evecounter)
        self.logger.debug('Tot lumis found: %d' % lumicounter)
        self.logger.debug('Tot files found: %d' % len(wmfiles))

        return Result(task=task, result=Fileset(name='FilesToSplit', files=set(wmfiles)))
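# For reference, a hedged example of the 'datasetfiles' and 'locations'
# shapes formatOutput() reads, reconstructed purely from the keys the method
# accesses above (all values here are made up):
datasetfiles = {
    "/store/data/Run2012A/ExampleDataset/file.root": {
        "NumberOfEvents": 100,
        "Size": 1000,
        "Checksums": {"cksum": "1"},
        "BlockName": "/ExampleDataset#block1",
        "Lumis": {194050: [1, 2, 3]},  # run number -> list of lumis
    },
}
locations = {"/ExampleDataset#block1": ["srm-example.cern.ch"]}  # block -> SEs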
def testHardLimitSplitting(self):
    """
    _testHardLimitSplitting_

    Test that we can specify an event limit; the algorithm shall take single
    lumi files with more events than the limit and mark them for failure
    """
    splitter = SplitterFactory()

    # Create 3 files, the one in the middle is a "bad" file
    testFileset = Fileset(name="FilesetA")
    testFileA = self.createFile("/this/is/file1", 1000, 0, 5, "blenheim")
    testFileB = self.createFile("/this/is/file2", 1000, 1, 1, "blenheim")
    testFileC = self.createFile("/this/is/file3", 1000, 2, 2, "blenheim")
    testFileset.addFile(testFileA)
    testFileset.addFile(testFileB)
    testFileset.addFile(testFileC)

    testSubscription = Subscription(fileset=testFileset,
                                    workflow=self.testWorkflow,
                                    split_algo="EventAwareLumiByWork",
                                    type="Processing")
    jobFactory = splitter(package="WMCore.DataStructs",
                          subscription=testSubscription)
    # Settings are to split on job boundaries, to fail single lumis with
    # more than 800 events and to put 550 events per job
    jobGroups = jobFactory(halt_job_on_file_boundaries=True,
                           splitOnRun=True,
                           events_per_job=550,
                           job_time_limit=9600,
                           performance=self.performanceParams)

    self.assertEqual(len(jobGroups), 1)
    jobs = jobGroups[0].jobs
    self.assertEqual(len(jobs), 5)

    # One job should be failed, the rest should be fine
    for jobNum in (0, 1, 3, 4):
        self.assertFalse(jobs[jobNum].get('failedOnCreation'))
    self.assertTrue(jobs[2]['failedOnCreation'])
    # the expected string (including the 'woud' typo) matches the splitter's
    # failure message verbatim
    self.assertEqual(jobs[2]['failedReason'],
                     'File /this/is/file2 has a single lumi 1, in run 1 with too many events 1000 and it woud take 12000 sec to run')
    return
def generateFakeMCFile(self, numEvents=100, firstEvent=1, lastEvent=100,
                       firstLumi=1, lastLumi=10):
    # MC comes with only one MCFakeFile
    singleMCFileset = Fileset(name="MCTestFileset")
    newFile = File("MCFakeFileTest", size=1000, events=numEvents)
    newFile.setLocation('se01')
    newFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
    newFile["first_event"] = firstEvent
    newFile["last_event"] = lastEvent
    testWorkflow = Workflow()
    singleMCFileset.addFile(newFile)
    singleMCFileSubscription = Subscription(fileset=singleMCFileset,
                                            workflow=testWorkflow,
                                            split_algo="EventBased",
                                            type="Production")
    return singleMCFileSubscription
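# A hedged sketch of how other tests in this collection would drive this
# helper through the splitter factory; the exact factory parameters vary
# across WMCore versions, so treat this as illustrative only:
splitter = SplitterFactory()
sub = self.generateFakeMCFile(numEvents=1000, firstEvent=1, lastEvent=1000)
jobFactory = splitter(package="WMCore.DataStructs", subscription=sub)
# with 1000 fake events, 250 events per job should yield 4 production jobs
jobGroups = jobFactory(events_per_job=250)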
def execute(self, *args, **kwargs):
    self.logger.info("Data discovery and splitting for %s using user-provided files"
                     % kwargs['task']['tm_taskname'])

    userfiles = kwargs['task']['tm_user_files']
    splitting = kwargs['task']['tm_split_algo']
    total_units = kwargs['task']['tm_totalunits']
    if not userfiles or splitting != 'FileBased':
        if not userfiles:
            msg = "No files specified to process for task %s." % kwargs['task']['tm_taskname']
        if splitting != 'FileBased':
            msg = "Data.splitting must be set to 'FileBased' when using a custom set of files."
        raise TaskWorkerException(msg)

    if hasattr(self.config.Sites, 'available'):
        locations = self.config.Sites.available
    else:
        with self.config.TaskWorker.envForCMSWEB:
            configDict = {"cacheduration": 1, "pycurl": True}  # cache duration is in hours
            resourceCatalog = CRIC(logger=self.logger, configDict=configDict)
            locations = resourceCatalog.getAllPSNs()

    userFileset = Fileset(name=kwargs['task']['tm_taskname'])
    self.logger.info("There are %d files specified by the user." % len(userfiles))
    if total_units > 0:
        self.logger.info("Will run over the first %d files." % total_units)
    file_counter = 0
    for idx, userfile in enumerate(userfiles):
        newFile = File(userfile, size=1000, events=1)
        newFile.setLocation(locations)
        newFile.addRun(Run(1, idx))
        newFile["block"] = 'UserFilesFakeBlock'
        newFile["first_event"] = 1
        newFile["last_event"] = 2
        userFileset.addFile(newFile)
        file_counter += 1
        if total_units > 0 and file_counter >= total_units:
            break

    return Result(task=kwargs['task'], result=userFileset)
def getFileset(self):
    """
    Get a fileset based on the task
    """
    # note: 'type' here is the Python built-in, so the fileset name is
    # effectively constant rather than task-specific
    fileset = Fileset(name='Merge%s' % (type))

    for i in range(0, random.randint(15, 25)):
        # Use the testDir to generate a random lfn
        inpFile = File(lfn="%s/%s.root" % (self.testDir, makeUUID()),
                       size=random.randint(200000, 1000000),
                       events=random.randint(1000, 2000))
        inpFile.setLocation('Megiddo')
        fileset.addFile(inpFile)

    return fileset