def testAddOutput(self):
    """
    _testAddOutput_

    Tests the addOutput functionality of the DataStructs Workflow.

    Registers two fileset pairs under "out1" (each the merged fileset of
    the other) and a single unmerged fileset under "out2", then verifies
    the contents of the workflow's outputMap.
    """
    filesetA = Fileset(name="filesetA")
    filesetB = Fileset(name="filesetB")
    filesetC = Fileset(name="filesetC")
    testWorkflow = Workflow(spec="test", owner="mnorman")
    testWorkflow.addOutput("out1", filesetA, filesetB)
    testWorkflow.addOutput("out1", filesetB, filesetA)
    testWorkflow.addOutput("out2", filesetC)

    self.assertEqual(len(testWorkflow.outputMap["out1"]), 2,
                     "Error: There should be two mappings for out1.")
    # Bug fix: the assertion checks for exactly ONE mapping, but the
    # original failure message claimed "two".
    self.assertEqual(len(testWorkflow.outputMap["out2"]), 1,
                     "Error: There should be one mapping for out2.")
    self.assertTrue({"output_fileset": filesetA,
                     "merged_output_fileset": filesetB} in testWorkflow.outputMap["out1"],
                    "Error: Fileset A should be in the output map.")
    self.assertTrue({"output_fileset": filesetB,
                     "merged_output_fileset": filesetA} in testWorkflow.outputMap["out1"],
                    "Error: Fileset B should be in the output map.")
    self.assertEqual(filesetC,
                     testWorkflow.outputMap["out2"][0]["output_fileset"],
                     "Error: Fileset C should be in the output map.")
    self.assertEqual(None,
                     testWorkflow.outputMap["out2"][0]["merged_output_fileset"],
                     "Error: The merged output should be None.")
    return
def __init__(self, spec=None, owner="unknown", dn="unknown", group="unknown",
             owner_vogroup="DEFAULT", owner_vorole="DEFAULT", name=None,
             task=None, wfType=None, id=-1):
    """
    Initialise both parent classes and record the database row id.

    When no distinguished name was supplied (still "unknown"), fall back
    to using the owner as the DN.
    """
    WMBSBase.__init__(self)
    WMWorkflow.__init__(self, spec=spec, owner=owner, dn=dn, group=group,
                        owner_vogroup=owner_vogroup, owner_vorole=owner_vorole,
                        name=name, task=task, wfType=wfType)
    self.id = id
    if self.dn == "unknown":
        self.dn = owner
    return
def __init__(self, spec=None, owner="unknown", dn="unknown", group="unknown",
             owner_vogroup="DEFAULT", owner_vorole="DEFAULT", name=None,
             task=None, wfType=None, id=-1, alternativeFilesetClose=False,
             priority=None):
    """
    Initialise both parent classes, record the database row id, the
    alternative fileset-close flag and the workflow priority.

    When no distinguished name was supplied (still "unknown"), fall back
    to using the owner as the DN.
    """
    WMBSBase.__init__(self)
    WMWorkflow.__init__(self, spec=spec, owner=owner, dn=dn, group=group,
                        owner_vogroup=owner_vogroup, owner_vorole=owner_vorole,
                        name=name, task=task, wfType=wfType, priority=priority)
    self.id = id
    self.alternativeFilesetClose = alternativeFilesetClose
    if self.dn == "unknown":
        self.dn = owner
    return
def makeWorkflow(self):
    """
    _makeWorkflow_

    Build and return a WMBS-compatible Workflow whose task attribute is
    this task's full path name.
    """
    wmbsWorkflow = DataStructsWorkflow()
    wmbsWorkflow.task = self.getPathName()
    return wmbsWorkflow
def __init__(self, spec=None, owner=None, dn=None, group=None,
             owner_vogroup='', owner_vorole='', name=None, task=None,
             wfType=None, id=-1):
    """
    Initialise both parent classes and record the database row id.

    When no distinguished name was supplied (falsy), use the owner as
    the DN.
    """
    WMBSBase.__init__(self)
    WMWorkflow.__init__(self, spec=spec, owner=owner, dn=dn, group=group,
                        owner_vogroup=owner_vogroup, owner_vorole=owner_vorole,
                        name=name, task=task, wfType=wfType)
    self.id = id
    if not self.dn:
        self.dn = owner
    return
def setUp(self):
    """
    _setUp_

    Build one multi-file fileset (ten two-site files) and one single-file
    fileset, each feeding a FileBased Processing subscription.
    """
    self.multipleFileFileset = Fileset(name="TestFileset1")
    for _ in range(10):
        f = File(makeUUID(), size=1000, events=100)
        f.setLocation('blenheim')
        f.setLocation('malpaquet')
        self.multipleFileFileset.addFile(f)

    self.singleFileFileset = Fileset(name="TestFileset2")
    f = File("/some/file/name", size=1000, events=100)
    f.setLocation('blenheim')
    self.singleFileFileset.addFile(f)

    workflow = Workflow()
    self.multipleFileSubscription = Subscription(fileset=self.multipleFileFileset,
                                                 workflow=workflow,
                                                 split_algo="FileBased",
                                                 type="Processing")
    self.singleFileSubscription = Subscription(fileset=self.singleFileFileset,
                                               workflow=workflow,
                                               split_algo="FileBased",
                                               type="Processing")
    return
def __init__(self, spec=None, owner="unknown", dn="unknown", group="unknown",
             owner_vogroup="DEFAULT", owner_vorole="DEFAULT", name=None,
             task=None, wfType=None, id=-1, alternativeFilesetClose=False):
    """
    Initialise both parent classes, record the database row id and the
    alternative fileset-close flag.

    When no distinguished name was supplied (still "unknown"), fall back
    to using the owner as the DN.
    """
    WMBSBase.__init__(self)
    WMWorkflow.__init__(self, spec=spec, owner=owner, dn=dn, group=group,
                        owner_vogroup=owner_vogroup, owner_vorole=owner_vorole,
                        name=name, task=task, wfType=wfType)
    self.id = id
    self.alternativeFilesetClose = alternativeFilesetClose
    if self.dn == "unknown":
        self.dn = owner
    return
def execute(self, *args, **kwargs):
    """
    Split the task's input fileset (args[0]) into jobs.

    Builds a WMBS Workflow/Subscription pair from the task parameters,
    runs the configured splitting algorithm, and returns the job factory
    wrapped in a Result.  If splitting produces no jobs the task is marked
    FAILED through the REST interface and a StopHandler is raised.

    NOTE(review): b64encode on a str and urllib.urlencode indicate this is
    Python 2 era code — confirm before porting.
    """
    wmwork = Workflow(name=kwargs['task']['tm_taskname'])
    wmsubs = Subscription(fileset=args[0], workflow=wmwork,
                          split_algo=kwargs['task']['tm_split_algo'],
                          type=self.jobtypeMapper[kwargs['task']['tm_job_type']])
    splitter = SplitterFactory()
    jobfactory = splitter(subscription=wmsubs)
    splitparam = kwargs['task']['tm_split_args']
    splitparam['algorithm'] = kwargs['task']['tm_split_algo']
    factory = jobfactory(**splitparam)
    if len(factory) == 0:
        # Understanding that no jobs could be created given the splitting arguments
        # with the given input dataset information: NO IDEA WHY.
        # NB: we assume that split can't happen, then task is failed
        msg = "Splitting %s on %s with %s does not generate any job" % (
            kwargs['task']['tm_taskname'],
            kwargs['task']['tm_input_dataset'],
            kwargs['task']['tm_split_algo'])
        self.logger.error("Setting %s as failed" % str(kwargs['task']['tm_taskname']))
        # Report the failure back to the REST server before aborting.
        configreq = {'workflow': kwargs['task']['tm_taskname'],
                     'status': "FAILED",
                     'subresource': 'failure',
                     'failure': b64encode(msg)}
        self.server.post(self.resturl, data=urllib.urlencode(configreq))
        raise StopHandler(msg)
    return Result(task=kwargs['task'], result=factory)
def generateFakeMCFile(self, numEvents=100, firstEvent=1, lastEvent=100,
                       firstLumi=1, lastLumi=10, existingSub=None):
    """
    Create one fake MC file ("MCFakeFileTest") and attach it to a
    Production/EventBased subscription.

    If existingSub is None a fresh fileset, workflow and subscription are
    built; otherwise the file is appended to the existing subscription's
    fileset.  Returns the subscription either way.
    """
    # MC comes with only one MCFakeFile
    newFile = File("MCFakeFileTest", size=1000, events=numEvents)
    newFile.setLocation('se01')
    # NOTE(review): the branches look asymmetric — the inclusive range
    # (lastLumi + 1) is only used when firstLumi == lastLumi, while the
    # general case excludes lastLumi. Confirm this is intentional.
    if firstLumi == lastLumi:
        newFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
    else:
        newFile.addRun(Run(1, *range(firstLumi, lastLumi)))
    newFile["first_event"] = firstEvent
    newFile["last_event"] = lastEvent
    if existingSub is None:
        singleMCFileset = Fileset(name="MCTestFileset")
        singleMCFileset.addFile(newFile)
        testWorkflow = Workflow()
        existingSub = Subscription(fileset=singleMCFileset,
                                   workflow=testWorkflow,
                                   split_algo="EventBased",
                                   type="Production")
    else:
        existingSub['fileset'].addFile(newFile)
    return existingSub
def processDataset(self): """ _processDataset_ Import the Dataset contents and create a set of jobs from it """ # // # // Now create the job definitions #// logging.debug("SplitSize = %s" % self.splitSize) logging.debug("AllowedSites = %s" % self.allowedSites) thefiles = Fileset(name='FilesToSplit') reader = DBSReader(self.dbsUrl) fileList = reader.dbs.listFiles( analysisDataset=self.inputDataset(), retriveList=['retrive_block', 'retrive_run']) blocks = {} for f in fileList: block = f['Block']['Name'] if not blocks.has_key(block): blocks[block] = reader.listFileBlockLocation(block) f['Block']['StorageElementList'].extend(blocks[block]) wmbsFile = File(f['LogicalFileName']) [wmbsFile['locations'].add(x) for x in blocks[block]] wmbsFile['block'] = block thefiles.addFile(wmbsFile) work = Workflow() subs = Subscription(fileset=thefiles, workflow=work, split_algo='FileBased', type="Processing") splitter = SplitterFactory() jobfactory = splitter(subs) jobs = jobfactory(files_per_job=self.splitSize) jobDefs = [] for job in jobs.jobs: #job.mask.setMaxAndSkipEvents(-1, 0) jobDef = JobDefinition() jobDef['LFNS'].extend(job.listLFNs()) jobDef['SkipEvents'] = 0 jobDef['MaxEvents'] = -1 [ jobDef['SENames'].extend(list(x['locations'])) for x in job.listFiles() ] jobDefs.append(jobDef) return jobDefs
def execute(self, *args, **kwargs):
    """
    Split the task's input fileset (args[0]) into jobs.

    Adjusts the splitting parameters for Analysis (total files/lumis) and
    PrivateMC (events per lumi, LHE input) job types, runs the splitter,
    and warns through the REST interface if the input dataset has lumis
    split across files.

    NOTE(review): `except Exception, e` and urllib.urlencode mean this is
    Python 2 code.
    """
    wmwork = Workflow(name=kwargs['task']['tm_taskname'])
    wmsubs = Subscription(fileset=args[0], workflow=wmwork,
                          split_algo=kwargs['task']['tm_split_algo'],
                          type=self.jobtypeMapper[kwargs['task']['tm_job_type']])
    splitter = SplitterFactory()
    jobfactory = splitter(subscription=wmsubs)
    splitparam = kwargs['task']['tm_split_args']
    splitparam['algorithm'] = kwargs['task']['tm_split_algo']
    if kwargs['task']['tm_job_type'] == 'Analysis':
        if kwargs['task']['tm_split_algo'] == 'FileBased':
            splitparam['total_files'] = kwargs['task']['tm_totalunits']
        elif kwargs['task']['tm_split_algo'] == 'LumiBased':
            splitparam['total_lumis'] = kwargs['task']['tm_totalunits']
    elif kwargs['task']['tm_job_type'] == 'PrivateMC':
        if 'tm_events_per_lumi' in kwargs['task'] and kwargs['task']['tm_events_per_lumi']:
            splitparam['events_per_lumi'] = kwargs['task']['tm_events_per_lumi']
        if 'tm_generator' in kwargs['task'] and kwargs['task']['tm_generator'] == 'lhe':
            splitparam['lheInputFiles'] = True
    splitparam['applyLumiCorrection'] = True
    factory = jobfactory(**splitparam)
    if len(factory) == 0:
        # NOTE(review): the %s placeholders below are never filled in —
        # the message is raised with literal "%s" text. Confirm and fix.
        raise TaskWorkerException("The CRAB3 server backend could not submit any job to the Grid scheduler:\n"+\
                                  "splitting task %s on dataset %s with %s method does not generate any job")
    #printing duplicated lumis if any
    lumiChecker = getattr(jobfactory, 'lumiChecker', None)
    if lumiChecker and lumiChecker.splitLumiFiles:
        self.logger.warning(
            "The input dataset contains the following duplicated lumis %s"
            % lumiChecker.splitLumiFiles.keys())
        try:
            # Best-effort: record the duplicate-lumi warning on the task.
            configreq = {'subresource': 'addwarning',
                         'workflow': kwargs['task']['tm_taskname'],
                         'warning': b64encode(
                             'The CRAB3 server backend detected lumis split across files in the input dataset.'
                             ' Will apply the necessary corrections in the splitting algorithms')}
            self.server.post(self.restURInoAPI + '/task', data=urllib.urlencode(configreq))
        except Exception, e:
            self.logger.error(e.headers)
            self.logger.warning("Cannot add warning to REST after finding duplicates")
def setUp(self):
    """
    _setUp_

    Provide a bare Workflow instance shared by the tests in this class.
    """
    self.testWorkflow = Workflow()
    return
def setUp(self):
    """
    _setUp_

    Build four filesets (multi-file, single-file, multi-lumi and
    single-lumi) and a FixedDelay Processing subscription over each.
    """
    self.multipleFileFileset = Fileset(name="TestFileset1")
    for idx in range(10):
        f = File(makeUUID(), size=1000, events=100)
        f.addRun(Run(idx, *[45 + idx]))
        self.multipleFileFileset.addFile(f)

    self.singleFileFileset = Fileset(name="TestFileset2")
    f = File("/some/file/name", size=1000, events=100)
    f.addRun(Run(1, *[45]))
    self.singleFileFileset.addFile(f)

    self.multipleFileLumiset = Fileset(name="TestFileset3")
    for idx in range(10):
        f = File(makeUUID(), size=1000, events=100)
        f.addRun(Run(1, *[45 + idx / 3]))
        self.multipleFileLumiset.addFile(f)

    self.singleLumiFileset = Fileset(name="TestFileset4")
    for idx in range(10):
        f = File(makeUUID(), size=1000, events=100)
        f.addRun(Run(1, *[45]))
        self.singleLumiFileset.addFile(f)

    workflow = Workflow()

    def _subscribe(fileset):
        # Every subscription shares the same workflow and split settings.
        return Subscription(fileset=fileset, workflow=workflow,
                            split_algo="FixedDelay", type="Processing")

    self.multipleFileSubscription = _subscribe(self.multipleFileFileset)
    self.singleFileSubscription = _subscribe(self.singleFileFileset)
    self.multipleLumiSubscription = _subscribe(self.multipleFileLumiset)
    self.singleLumiSubscription = _subscribe(self.singleLumiFileset)
    return
def setUp(self):
    """
    _setUp_

    Build multi-file, single-file and multi-site filesets along with
    their Processing subscriptions.
    """
    self.multipleFileFileset = Fileset(name="TestFileset1")
    for _ in range(10):
        self.multipleFileFileset.addFile(
            File(makeUUID(), size=1000, events=100,
                 locations=set(["somese.cern.ch"])))

    self.singleFileFileset = Fileset(name="TestFileset2")
    onlyFile = File("/some/file/name", size=1000, events=100,
                    locations=set(["somese.cern.ch"]))
    self.singleFileFileset.addFile(onlyFile)

    self.multipleSiteFileset = Fileset(name="TestFileset3")
    for _ in range(5):
        siteFile = File(makeUUID(), size=1000, events=100,
                        locations=set(["somese.cern.ch"]))
        siteFile.setLocation("somese.cern.ch")
        self.multipleSiteFileset.addFile(siteFile)
    for _ in range(5):
        siteFile = File(makeUUID(), size=1000, events=100)
        siteFile.setLocation(["somese.cern.ch", "otherse.cern.ch"])
        self.multipleSiteFileset.addFile(siteFile)

    workflow = Workflow()
    self.multipleFileSubscription = Subscription(fileset=self.multipleFileFileset,
                                                 workflow=workflow,
                                                 split_algo="SizeBased",
                                                 type="Processing")
    self.singleFileSubscription = Subscription(fileset=self.singleFileFileset,
                                               workflow=workflow,
                                               split_algo="SizeBased",
                                               type="Processing")
    # NB: the multi-site subscription uses EventBased splitting, unlike
    # the other two (kept as in the original).
    self.multipleSiteSubscription = Subscription(fileset=self.multipleSiteFileset,
                                                 workflow=workflow,
                                                 split_algo="EventBased",
                                                 type="Processing")
    return
def testProductionRunNumber(self):
    """
    _testProductionRunNumber_

    Verify that jobs created by Production subscriptions carry run number
    1 in their job mask, while non-production (Processing) subscriptions
    leave the mask run numbers unset.
    """
    workflow = Workflow(spec="spec.pkl", owner="Steve",
                        name="TestWorkflow", task="TestTask")
    fileset = Fileset(name="TestFileset")
    fileset.addFile(File(lfn="someLFN"))
    fileset.commit()

    production = Subscription(fileset=fileset, workflow=workflow,
                              split_algo="FileBased", type="Production")
    jobGroups = JobFactory(subscription=production)()
    self.assertTrue(len(jobGroups) > 0)
    for group in jobGroups:
        self.assertTrue(len(group.jobs) > 0)
        for job in group.jobs:
            self.assertEqual(job["mask"]["FirstRun"], 1,
                             "Error: First run is wrong.")
            self.assertEqual(job["mask"]["LastRun"], 1,
                             "Error: Last run is wrong.")

    processing = Subscription(fileset=fileset, workflow=workflow,
                              split_algo="FileBased", type="Processing")
    jobGroups = JobFactory(subscription=processing)()
    for group in jobGroups:
        for job in group.jobs:
            self.assertEqual(job["mask"]["FirstRun"], None,
                             "Error: First run is wrong.")
            self.assertEqual(job["mask"]["LastRun"], None,
                             "Error: Last run is wrong.")
    return
def setUp(self):
    """
    _setUp_

    Provide a bare Workflow and default performance parameters for the
    tests in this class.
    """
    self.testWorkflow = Workflow()
    self.performanceParams = dict(timePerEvent=12,
                                  memoryRequirement=2300,
                                  sizePerEvent=400)
    return
def setUp(self):
    """
    _setUp_

    Build multi-file, single-file and zero-event filesets, each with an
    EventBased Processing subscription, plus default performance
    parameters.
    """
    self.multipleFileFileset = Fileset(name="TestFileset1")
    for _ in range(10):
        f = File(makeUUID(), size=1000, events=100)
        f.setLocation('se01')
        self.multipleFileFileset.addFile(f)

    self.singleFileFileset = Fileset(name="TestFileset2")
    f = File("/some/file/name", size=1000, events=100)
    f.setLocation('se02')
    self.singleFileFileset.addFile(f)

    self.emptyFileFileset = Fileset(name="TestFileset3")
    f = File("/some/file/name", size=1000, events=0)
    f.setLocation('se03')
    self.emptyFileFileset.addFile(f)

    workflow = Workflow()
    self.multipleFileSubscription = Subscription(fileset=self.multipleFileFileset,
                                                 workflow=workflow,
                                                 split_algo="EventBased",
                                                 type="Processing")
    self.singleFileSubscription = Subscription(fileset=self.singleFileFileset,
                                               workflow=workflow,
                                               split_algo="EventBased",
                                               type="Processing")
    self.emptyFileSubscription = Subscription(fileset=self.emptyFileFileset,
                                              workflow=workflow,
                                              split_algo="EventBased",
                                              type="Processing")
    self.eventsPerJob = 100
    self.performanceParams = dict(timePerEvent=None,
                                  memoryRequirement=2300,
                                  sizePerEvent=400)
    return
def generateFakeMCFile(self, numEvents=100, firstEvent=1, lastEvent=100,
                       firstLumi=1, lastLumi=10):
    """
    Build and return a Production/EventBased subscription holding a
    single fake MC file ("MCFakeFileTest") located at se01 with the
    requested event and lumi ranges.
    """
    #MC comes with only one MCFakeFile
    mcFile = File("MCFakeFileTest", size=1000, events=numEvents)
    mcFile.setLocation('se01')
    mcFile.addRun(Run(1, *range(firstLumi, lastLumi + 1)))
    mcFile["first_event"] = firstEvent
    mcFile["last_event"] = lastEvent

    mcFileset = Fileset(name="MCTestFileset")
    mcFileset.addFile(mcFile)
    return Subscription(fileset=mcFileset,
                        workflow=Workflow(),
                        split_algo="EventBased",
                        type="Production")
def setUp(self):
    """
    _setUp_

    Build one multi-file fileset (20 lumis per file, one run per file)
    and one single-file fileset (two lumi ranges on run 13), each with a
    FileBased Processing subscription, plus default performance
    parameters.
    """
    self.multipleFileFileset = Fileset(name="TestFileset1")
    for idx in range(10):
        f = File(makeUUID(), size=1000, events=100)
        f.setLocation('blenheim')
        f.setLocation('malpaquet')
        f.addRun(Run(idx, *[idx * 100 + lumi for lumi in range(20)]))
        self.multipleFileFileset.addFile(f)

    self.singleFileFileset = Fileset(name="TestFileset2")
    f = File("/some/file/name", size=1000, events=100)
    f.setLocation('blenheim')
    f.addRun(Run(13, *(list(range(50, 60)) + list(range(70, 80)))))
    self.singleFileFileset.addFile(f)

    workflow = Workflow()
    self.multipleFileSubscription = Subscription(fileset=self.multipleFileFileset,
                                                 workflow=workflow,
                                                 split_algo="FileBased",
                                                 type="Processing")
    self.singleFileSubscription = Subscription(fileset=self.singleFileFileset,
                                               workflow=workflow,
                                               split_algo="FileBased",
                                               type="Processing")
    self.performanceParams = dict(timePerEvent=12,
                                  memoryRequirement=2300,
                                  sizePerEvent=400)
    return
def setUp(self):
    """
    Initial Setup for Subscription Testcase.

    Build a dummy Subscription from a fileset holding one file and a
    default-constructed Workflow.
    """
    self.dummyFile = File('/tmp/dummyfile', 9999, 0, 0, 0)
    self.dummySet = {self.dummyFile}
    self.dummyFileSet = Fileset(name='SubscriptionTestFileset',
                                files=self.dummySet)
    self.dummyWorkFlow = Workflow()
    self.dummySubscription = Subscription(fileset=self.dummyFileSet,
                                          workflow=self.dummyWorkFlow)
    return
def testDefinition(self):
    """
    Tests to make sure Workflow is defined correctly: the constructor
    arguments must be stored on the matching attributes.
    """
    expectedSpec = "test"
    expectedOwner = "mnorman"
    expectedName = "testName"
    workflow = Workflow(spec=expectedSpec, owner=expectedOwner,
                        name=expectedName)
    for actual, expected in ((workflow.spec, expectedSpec),
                             (workflow.owner, expectedOwner),
                             (workflow.name, expectedName)):
        self.assertEqual(actual, expected)
    return
def setUp(self):
    """
    _setUp_

    Provide a bare Workflow, default performance parameters, debug-level
    logging, and a Python 2/3 compatible assertItemsEqual alias.
    """
    self.testWorkflow = Workflow()
    self.performanceParams = dict(timePerEvent=12,
                                  memoryRequirement=2300,
                                  sizePerEvent=400)
    logging.basicConfig()
    logging.getLogger().setLevel(logging.DEBUG)
    if PY3:
        # unittest renamed assertItemsEqual -> assertCountEqual in py3.
        self.assertItemsEqual = self.assertCountEqual
    return
def setUp(self):
    """
    _setUp_

    Build multi-file, single-file and zero-event filesets, each with an
    EventBased Processing subscription.
    """
    self.multipleFileFileset = Fileset(name="TestFileset1")
    for i in range(10):
        newFile = File(makeUUID(), size=1000, events=100)
        newFile.setLocation('se01')
        self.multipleFileFileset.addFile(newFile)

    self.singleFileFileset = Fileset(name="TestFileset2")
    newFile = File("/some/file/name", size=1000, events=100)
    newFile.setLocation('se02')
    self.singleFileFileset.addFile(newFile)

    self.emptyFileFileset = Fileset(name="TestFileset3")
    newFile = File("/some/file/name", size=1000, events=0)
    # Bug fix: this previously called newFile.setdefault('se03'), which
    # (File being dict-like) merely inserted an 'se03' key with value None
    # instead of assigning the file a location. Sibling setUp methods use
    # setLocation here.
    newFile.setLocation('se03')
    self.emptyFileFileset.addFile(newFile)

    testWorkflow = Workflow()
    self.multipleFileSubscription = Subscription(fileset=self.multipleFileFileset,
                                                 workflow=testWorkflow,
                                                 split_algo="EventBased",
                                                 type="Processing")
    self.singleFileSubscription = Subscription(fileset=self.singleFileFileset,
                                               workflow=testWorkflow,
                                               split_algo="EventBased",
                                               type="Processing")
    self.emptyFileSubscription = Subscription(fileset=self.emptyFileFileset,
                                              workflow=testWorkflow,
                                              split_algo="EventBased",
                                              type="Processing")
    return
def testMetaData(self):
    """
    _testMetaData_

    Make sure that the workflow name, task, owner and white and black
    lists make it into each job object.
    """
    workflow = Workflow(spec="spec.pkl", owner="Steve",
                        name="TestWorkflow", task="TestTask")
    fileset = Fileset(name="TestFileset")
    fileset.addFile(File(lfn="someLFN"))
    fileset.commit()

    subscription = Subscription(fileset=fileset, workflow=workflow,
                                split_algo="FileBased")
    jobGroups = JobFactory(subscription=subscription)(siteWhitelist=["site1"],
                                                      siteBlacklist=["site2"])
    self.assertTrue(len(jobGroups) > 0)

    expectations = (("task", "TestTask", "Error: Task is wrong."),
                    ("workflow", "TestWorkflow", "Error: Workflow is wrong."),
                    ("owner", "Steve", "Error: Owner is wrong."),
                    ("siteWhitelist", ["site1"], "Error: Site white list is wrong."),
                    ("siteBlacklist", ["site2"], "Error: Site black list is wrong."))
    for group in jobGroups:
        self.assertTrue(len(group.jobs) > 0)
        for job in group.jobs:
            for key, expected, message in expectations:
                self.assertEqual(job[key], expected, message)
    return
def oneHundredFiles(self, splittingAlgo="EventBased", jobType="Processing"):
    """
    _oneHundredFiles_

    Generate a WMBS data stack representing 100 files for job splitter
    testing, run the requested splitting algorithm over them (100 events
    per job), register the seeders, and return the resulting job groups.
    """
    fileset1 = Fileset(name='EventBasedFiles1')
    for idx in range(100):
        wmbsFile = File("/store/MultipleFileSplit%s.root" % idx,  # lfn
                        1000,       # size
                        100,        # events
                        10 + idx,   # run
                        12312)      # lumi
        wmbsFile['locations'].add("BULLSHIT")
        fileset1.addFile(wmbsFile)

    work = Workflow()
    subscription1 = Subscription(fileset=fileset1,
                                 workflow=work,
                                 split_algo=splittingAlgo,
                                 type=jobType)
    jobfactory = SplitterFactory()(subscription1)
    jobs = jobfactory(events_per_job=100)
    self.manager.addGenerator("RandomSeeder", **self.seedlistForRandom)
    self.manager.addGenerator("RunAndLumiSeeder")
    return jobs
def testAddOutput(self):
    """
    _testAddOutput_

    Tests the addOutput functionality of the DataStructs Workflow.

    Registers two fileset pairs under "out1" (each the merged fileset of
    the other) and a single unmerged fileset under "out2", then verifies
    the contents of the workflow's outputMap.
    """
    filesetA = Fileset(name="filesetA")
    filesetB = Fileset(name="filesetB")
    filesetC = Fileset(name="filesetC")
    testWorkflow = Workflow(spec="test", owner="mnorman")
    testWorkflow.addOutput("out1", filesetA, filesetB)
    testWorkflow.addOutput("out1", filesetB, filesetA)
    testWorkflow.addOutput("out2", filesetC)

    self.assertEqual(len(testWorkflow.outputMap["out1"]), 2,
                     "Error: There should be two mappings for out1.")
    # Bug fix: the assertion checks for exactly ONE mapping, but the
    # original failure message claimed "two".
    self.assertEqual(len(testWorkflow.outputMap["out2"]), 1,
                     "Error: There should be one mapping for out2.")
    self.assertTrue({
        "output_fileset": filesetA,
        "merged_output_fileset": filesetB
    } in testWorkflow.outputMap["out1"],
                    "Error: Fileset A should be in the output map.")
    self.assertTrue({
        "output_fileset": filesetB,
        "merged_output_fileset": filesetA
    } in testWorkflow.outputMap["out1"],
                    "Error: Fileset B should be in the output map.")
    self.assertEqual(filesetC,
                     testWorkflow.outputMap["out2"][0]["output_fileset"],
                     "Error: Fileset C should be in the output map.")
    self.assertEqual(
        None, testWorkflow.outputMap["out2"][0]["merged_output_fileset"],
        "Error: The merged output should be None.")
    return
def processDataset(self): """ _processDataset_ Import the Dataset contents and create a set of jobs from it """ # // # // Now create the job definitions #// logging.debug("MergeSize = %s" % self.mergeSize) logging.debug("AllowedSites = %s" % self.allowedSites) logging.debug("Connection to DBS at: %s" % self.dbsUrl) reader = DBSReader(self.dbsUrl) blockList = reader.dbs.listBlocks(dataset=self.inputDataset()) jobDefs = [] for block in blockList: blockName = block['Name'] logging.debug("Getting files for block %s" % blockName) locations = reader.listFileBlockLocation(blockName) fileList = reader.dbs.listFiles(blockName=blockName) if not fileList: # Skip empty blocks continue thefiles = Fileset(name='FilesToSplit') for f in fileList: f['Block']['StorageElementList'].extend(locations) wmbsFile = File(f['LogicalFileName']) [wmbsFile['locations'].add(x) for x in locations] wmbsFile['block'] = blockName wmbsFile['size'] = f['FileSize'] thefiles.addFile(wmbsFile) work = Workflow() subs = Subscription(fileset=thefiles, workflow=work, split_algo='MergeBySize', type="Merge") logging.debug("Info for Subscription %s" % subs) splitter = SplitterFactory() jobfactory = splitter(subs) jobGroups = jobfactory( merge_size=self.mergeSize, # min in Bytes all_files=True # merge all files ) if not jobGroups: raise (SyntaxError) for jobGroup in jobGroups: for job in jobGroup.getJobs(): jobDef = JobDefinition() jobDef['LFNS'].extend(job.getFiles(type='lfn')) jobDef['SkipEvents'] = 0 jobDef['MaxEvents'] = -1 [ jobDef['SENames'].extend(list(x['locations'])) for x in job.getFiles() ] jobDefs.append(jobDef) return jobDefs
def execute(self, *args, **kwargs):
    """
    Split the task's input data (args[0]) into jobs.

    Adjusts splitting parameters for Analysis job types (converting
    fractional tm_totalunits into absolute files/lumis/events, and
    configuring the 'Automatic' probe stage) and PrivateMC types, runs
    the splitter, enforces the 0 < numJobs <= maxJobs bounds and the
    minimum automatic-splitting runtime, warns about duplicated lumis,
    and returns a Result carrying (factory, input data).
    """
    wmwork = Workflow(name=kwargs['task']['tm_taskname'])
    maxJobs = getattr(self.config.TaskWorker, 'maxJobsPerTask', 10000)
    data = args[0]
    splitparam = kwargs['task']['tm_split_args']
    splitparam['algorithm'] = kwargs['task']['tm_split_algo']
    if kwargs['task']['tm_job_type'] == 'Analysis':
        totalUnits = kwargs['task']['tm_totalunits']
        if kwargs['task']['tm_split_algo'] == 'FileBased':
            # A fractional totalunits means "this fraction of the input".
            if totalUnits < 1.0:
                totalUnits = int(totalUnits * len(data.getFiles()) + 0.5)
            splitparam['total_files'] = totalUnits
        elif kwargs['task']['tm_split_algo'] == 'LumiBased':
            if totalUnits < 1.0:
                totalUnits = int(totalUnits * sum(len(run.lumis) for f in data.getFiles() for run in f['runs']) + 0.5)
            splitparam['total_lumis'] = totalUnits
        elif kwargs['task']['tm_split_algo'] == 'EventAwareLumiBased':
            if totalUnits < 1.0:
                totalUnits = int(totalUnits * sum(f['events'] for f in data.getFiles()) + 0.5)
            splitparam['total_events'] = totalUnits
        elif kwargs['task']['tm_split_algo'] == 'Automatic':
            # REST backwards compatibility fix
            if 'seconds_per_job' in kwargs['task']['tm_split_args']:
                kwargs['task']['tm_split_args']['minutes_per_job'] = kwargs['task']['tm_split_args'].pop('seconds_per_job')
            # The probe stage of automatic splitting is file based with the
            # files spread evenly over numProbes probe jobs.
            splitparam['algorithm'] = 'FileBased'
            splitparam['total_files'] = len(data.getFiles())
            numProbes = getattr(self.config.TaskWorker, 'numAutomaticProbes', 5)
            splitparam['files_per_job'] = (len(data.getFiles()) + numProbes - 1) // numProbes
    elif kwargs['task']['tm_job_type'] == 'PrivateMC':
        if 'tm_events_per_lumi' in kwargs['task'] and kwargs['task']['tm_events_per_lumi']:
            splitparam['events_per_lumi'] = kwargs['task']['tm_events_per_lumi']
        if 'tm_generator' in kwargs['task'] and kwargs['task']['tm_generator'] == 'lhe':
            splitparam['lheInputFiles'] = True
    splitparam['applyLumiCorrection'] = True
    wmsubs = Subscription(fileset=data, workflow=wmwork,
                          split_algo=splitparam['algorithm'],
                          type=self.jobtypeMapper[kwargs['task']['tm_job_type']])
    try:
        splitter = SplitterFactory()
        jobfactory = splitter(subscription=wmsubs)
        factory = jobfactory(**splitparam)
        numJobs = sum([len(jobgroup.getJobs()) for jobgroup in factory])
    except RuntimeError:
        # NOTE(review): assumes the splitter signals "too many jobs" via
        # RuntimeError — confirm against the splitter implementation.
        msg = "The splitting on your task generated more than {0} jobs (the maximum).".format(maxJobs)
        raise TaskWorkerException(msg)
    if numJobs == 0:
        msg = "The CRAB3 server backend could not submit any job to the Grid scheduler:"
        msg += " splitting task %s" % (kwargs['task']['tm_taskname'])
        if kwargs['task']['tm_input_dataset']:
            msg += " on dataset %s" % (kwargs['task']['tm_input_dataset'])
        msg += " with %s method does not generate any job. See\n" % (kwargs['task']['tm_split_algo'])
        msg += "https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3FAQ#crab_submit_fails_with_Splitting"
        raise TaskWorkerException(msg)
    elif numJobs > maxJobs:
        raise TaskWorkerException("The splitting on your task generated %s jobs. The maximum number of jobs in each task is %s" %
                                  (numJobs, maxJobs))

    minRuntime = getattr(self.config.TaskWorker, 'minAutomaticRuntimeMins', 180)
    if kwargs['task']['tm_split_algo'] == 'Automatic' and \
       kwargs['task']['tm_split_args']['minutes_per_job'] < minRuntime:
        msg = "Minimum runtime requirement for automatic splitting is {0} minutes.".format(minRuntime)
        raise TaskWorkerException(msg)

    #printing duplicated lumis if any
    lumiChecker = getattr(jobfactory, 'lumiChecker', None)
    if lumiChecker and lumiChecker.splitLumiFiles:
        self.logger.warning("The input dataset contains the following duplicated lumis %s",
                            lumiChecker.splitLumiFiles.keys())
        msg = "The CRAB3 server backend detected lumis split across files in the input dataset."
        msg += " Will apply the necessary corrections in the splitting algorithm. You can ignore this message."
        self.uploadWarning(msg, kwargs['task']['user_proxy'], kwargs['task']['tm_taskname'])

    return Result(task = kwargs['task'], result = (factory, args[0]))
def jobSplittingByRun(self):
    """
    Split the input files into jobs by run number using the WMBS
    'RunBased' splitter, and return the CRAB2-style job-parameter dict
    (params / args / jobDestination / njobs).
    """
    self.checkUserSettings()
    blockSites = self.args['blockSites']
    pubdata = self.args['pubdata']

    if self.selectNumberOfJobs == 0:
        # No explicit job count requested: effectively unlimited.
        self.theNumberOfJobs = 9999999
    blocks = {}
    runList = []
    thefiles = Fileset(name='FilesToSplit')
    fileList = pubdata.getListFiles()
    for f in fileList:
        block = f['Block']['Name']
        try:
            f['Block']['StorageElementList'].extend(blockSites[block])
        except:
            # Block has no known sites: skip the file entirely.
            continue
        wmbsFile = File(f['LogicalFileName'])
        if not blockSites[block]:
            msg = 'WARNING: No sites are hosting any part of data for block: %s\n' % block
            msg += 'Related jobs will not be submitted and this block of data can not be analyzed'
            common.logger.debug(msg)
        [wmbsFile['locations'].add(x) for x in blockSites[block]]
        wmbsFile['block'] = block
        # Each file is tagged with the first run it contains.
        runNum = f['RunsList'][0]['RunNumber']
        runList.append(runNum)
        myRun = Run(runNumber=runNum)
        wmbsFile.addRun(myRun)
        thefiles.addFile(wmbsFile)

    work = Workflow()
    subs = Subscription(fileset=thefiles, workflow=work,
                        split_algo='RunBased', type="Processing")
    splitter = SplitterFactory()
    jobfactory = splitter(subs)

    #loop over all runs
    list_of_lists = []
    jobDestination = []
    list_of_blocks = []
    count = 0
    for jobGroup in jobfactory():
        if count < self.theNumberOfJobs:
            res = self.getJobInfo(jobGroup)
            parString = ''
            for file in res['lfns']:
                parString += file + ','
            list_of_blocks.append(res['block'])
            fullString = parString[:-1]
            blockString = ','.join(list_of_blocks)
            list_of_lists.append([fullString, str(-1), str(0), blockString])
            #need to check single file location
            jobDestination.append(res['locations'])
            count += 1
    # prepare dict output
    dictOut = {}
    dictOut['params'] = ['InputFiles', 'MaxEvents', 'SkipEvents', 'InputBlocks']
    dictOut['args'] = list_of_lists
    dictOut['jobDestination'] = jobDestination
    dictOut['njobs'] = count
    self.cacheBlocks(list_of_blocks, jobDestination)
    return dictOut
def jobSplittingByLumi(self):
    """
    Split task into jobs by Lumi section paying attention to which
    lumis should be run (according to the analysis dataset).

    This uses WMBS job splitting which does not split files over jobs,
    so each job will have AT LEAST as many lumis as requested, perhaps
    more.

    Returns the usual splitting dict: 'params', 'args',
    'jobDestination', 'njobs'.
    """
    self.useParent = int(self.cfg_params.get('CMSSW.use_parent', 0))
    common.logger.debug('Splitting by Lumi')
    self.checkLumiSettings()

    blockSites = self.args['blockSites']
    pubdata = self.args['pubdata']

    lumisPerFile = pubdata.getLumis()
    self.parentFiles = pubdata.getParent()

    # Make the list of WMBS files for the job splitter.
    fileList = pubdata.getListFiles()
    wmFileList = []
    for jobFile in fileList:
        block = jobFile['Block']['Name']
        # Skip files whose block has no entry in blockSites at all.
        try:
            jobFile['Block']['StorageElementList'].extend(blockSites[block])
        except KeyError:
            continue
        wmbsFile = File(jobFile['LogicalFileName'])
        if not blockSites[block]:
            msg = 'WARNING: No sites are hosting any part of data for block: %s\n' % block
            msg += 'Related jobs will not be submitted and this block of data can not be analyzed'
            common.logger.debug(msg)
        for location in blockSites[block]:
            wmbsFile['locations'].add(location)
        wmbsFile['block'] = block
        # Each (run, lumi) pair of the file becomes a Run entry.
        for lumi in lumisPerFile[jobFile['LogicalFileName']]:
            wmbsFile.addRun(Run(lumi[0], lumi[1]))
        wmFileList.append(wmbsFile)

    fileSet = set(wmFileList)
    thefiles = Fileset(name='FilesToSplit', files=fileSet)

    # Create the factory and workflow.
    work = Workflow()
    subs = Subscription(fileset=thefiles,
                        workflow=work,
                        split_algo='LumiBased',
                        type="Processing")
    splitter = SplitterFactory()
    jobFactory = splitter(subs)

    list_of_lists = []
    jobDestination = []
    jobCount = 0
    lumisCreated = 0
    list_of_blocks = []

    # When the user did not fix lumis-per-job, derive it from the
    # requested totals so the target number of jobs is roughly met.
    if not self.limitJobLumis:
        if self.totalNLumis > 0:
            self.lumisPerJob = max(self.totalNLumis // self.theNumberOfJobs, 1)
        else:
            self.lumisPerJob = pubdata.getMaxLumis() // self.theNumberOfJobs + 1
        common.logger.info('Each job will process about %s lumis.'
                           % self.lumisPerJob)

    for jobGroup in jobFactory(lumis_per_job=self.lumisPerJob):
        for job in jobGroup.jobs:
            # NOTE(review): these breaks only leave the inner per-group
            # loop; the outer jobGroup loop continues (original behavior
            # preserved).
            if (self.limitNJobs and jobCount >= self.theNumberOfJobs):
                common.logger.info('Requested number of jobs reached.')
                break
            if (self.limitTotalLumis and lumisCreated >= self.totalNLumis):
                common.logger.info('Requested number of lumis reached.')
                break
            lumis = []
            lfns = []
            if self.useParent == 1:
                parentLfns = []
            locations = []
            blocks = []
            firstFile = True
            # Collect information from all the files.
            for jobFile in job.getFiles():
                doFile = False
                if firstFile:
                    # Get locations from the first file in the job.
                    for loc in jobFile['locations']:
                        locations.append(loc)
                    blocks.append(jobFile['block'])
                    firstFile = False
                # Accumulate lumis from all files, honoring the
                # requested total-lumi cap.
                for lumiList in jobFile['runs']:
                    theRun = lumiList.run
                    for theLumi in list(lumiList):
                        if (not self.limitTotalLumis) or \
                           (lumisCreated < self.totalNLumis):
                            doFile = True
                            lumisCreated += 1
                            lumis.append((theRun, theLumi))
                if doFile:
                    lfns.append(jobFile['lfn'])
                    if self.useParent == 1:
                        parent = self.parentFiles[jobFile['lfn']]
                        for p in parent:
                            parentLfns.append(p)

            fileString = ','.join(lfns)
            lumiLister = LumiList(lumis=lumis)
            lumiString = lumiLister.getCMSSWString()
            blockString = ','.join(blocks)
            if self.useParent == 1:
                pfileString = ','.join(parentLfns)
                common.logger.debug("Files: " + fileString +
                                    " with the following parents: " +
                                    pfileString)
                list_of_lists.append([fileString, pfileString, str(-1),
                                      str(0), lumiString, blockString])
            else:
                list_of_lists.append([fileString, str(-1), str(0),
                                      lumiString, blockString])
            list_of_blocks.append(blocks)
            jobDestination.append(locations)
            jobCount += 1
            common.logger.debug('Job %s will run on %s files and %s lumis '
                                % (jobCount, len(lfns), len(lumis)))
    common.logger.info('%s jobs created to run on %s lumis' %
                       (jobCount, lumisCreated))

    # Prepare dict output matching back to non-WMBS job creation.
    dictOut = {}
    dictOut['params'] = ['InputFiles', 'MaxEvents', 'SkipEvents',
                         'Lumis', 'InputBlocks']
    if self.useParent == 1:
        dictOut['params'] = ['InputFiles', 'ParentFiles', 'MaxEvents',
                             'SkipEvents', 'Lumis', 'InputBlocks']
    dictOut['args'] = list_of_lists
    dictOut['jobDestination'] = jobDestination
    dictOut['njobs'] = jobCount

    self.cacheBlocks(list_of_blocks, jobDestination)

    return dictOut
def execute(self, *args, **kwargs):
    """
    Run the WMCore job splitter for the task described in kwargs['task'].

    args[0] is the input Fileset to split. Builds the splitting
    parameters from the task's split algorithm and job type, runs the
    splitter factory, and validates the number of generated jobs.

    Returns a Result wrapping the job factory.

    Raises TaskWorkerException when splitting yields zero jobs or more
    than the configured maximum (TaskWorker.maxJobsPerTask, default
    10000).
    """
    wmwork = Workflow(name=kwargs['task']['tm_taskname'])
    wmsubs = Subscription(
        fileset=args[0],
        workflow=wmwork,
        split_algo=kwargs['task']['tm_split_algo'],
        type=self.jobtypeMapper[kwargs['task']['tm_job_type']])
    splitter = SplitterFactory()
    jobfactory = splitter(subscription=wmsubs)
    splitparam = kwargs['task']['tm_split_args']
    splitparam['algorithm'] = kwargs['task']['tm_split_algo']
    # The meaning of tm_totalunits depends on the splitting algorithm.
    if kwargs['task']['tm_job_type'] == 'Analysis':
        if kwargs['task']['tm_split_algo'] == 'FileBased':
            splitparam['total_files'] = kwargs['task']['tm_totalunits']
        elif kwargs['task']['tm_split_algo'] == 'LumiBased':
            splitparam['total_lumis'] = kwargs['task']['tm_totalunits']
        elif kwargs['task']['tm_split_algo'] == 'EventAwareLumiBased':
            splitparam['total_events'] = kwargs['task']['tm_totalunits']
    elif kwargs['task']['tm_job_type'] == 'PrivateMC':
        if 'tm_events_per_lumi' in kwargs['task'] and kwargs['task']['tm_events_per_lumi']:
            splitparam['events_per_lumi'] = kwargs['task']['tm_events_per_lumi']
        if 'tm_generator' in kwargs['task'] and kwargs['task']['tm_generator'] == 'lhe':
            splitparam['lheInputFiles'] = True
    splitparam['applyLumiCorrection'] = True
    factory = jobfactory(**splitparam)
    # Generator expression: no need to materialize the per-group job
    # counts just to sum them.
    numJobs = sum(len(jobgroup.getJobs()) for jobgroup in factory)
    maxJobs = getattr(self.config.TaskWorker, 'maxJobsPerTask', 10000)
    if numJobs == 0:
        msg = "The CRAB3 server backend could not submit any job to the Grid scheduler:"
        msg += " Splitting task %s" % (kwargs['task']['tm_taskname'])
        if kwargs['task']['tm_input_dataset']:
            msg += " on dataset %s" % (kwargs['task']['tm_input_dataset'])
        msg += " with %s method does not generate any job" % (
            kwargs['task']['tm_split_algo'])
        raise TaskWorkerException(msg)
    elif numJobs > maxJobs:
        raise TaskWorkerException(
            "The splitting on your task generated %s jobs. The maximum number of jobs in each task is %s"
            % (numJobs, maxJobs))
    # Printing duplicated lumis, if any, and best-effort warning upload.
    lumiChecker = getattr(jobfactory, 'lumiChecker', None)
    if lumiChecker and lumiChecker.splitLumiFiles:
        # Lazy %-args: the message is only formatted if actually emitted.
        self.logger.warning(
            "The input dataset contains the following duplicated lumis %s",
            lumiChecker.splitLumiFiles.keys())
        #TODO use self.uploadWarning
        try:
            userServer = HTTPRequests(self.server['host'],
                                      kwargs['task']['user_proxy'],
                                      kwargs['task']['user_proxy'])
            configreq = {
                'subresource': 'addwarning',
                'workflow': kwargs['task']['tm_taskname'],
                'warning': b64encode(
                    'The CRAB3 server backend detected lumis split across files in the input dataset.'
                    ' Will apply the necessary corrections in the splitting algorithms. You can ignore this message.'
                )
            }
            userServer.post(self.restURInoAPI + '/task',
                            data=urllib.urlencode(configreq))
        except HTTPException as hte:
            self.logger.error(hte.headers)
            self.logger.warning(
                "Cannot add warning to REST after finding duplicates")
    return Result(task=kwargs['task'], result=factory)