def testD_Timing(self):
    """
    _Timing_

    This is to see how fast things go.

    Builds a large job set through createGiantJobSet (nSubs=10, nJobs=1000,
    nFiles=10), runs a single TaskArchiverPoller.algorithm() cycle, checks
    that the WMBS tables queried below are empty afterwards and logs how
    long the cycle took.
    """
    myThread = threading.current_thread()

    name = makeUUID()
    config = self.getConfig()
    # The return value was never used; the call is kept for what it sets up.
    self.createGiantJobSet(name=name, config=config,
                           nSubs=10, nJobs=1000, nFiles=10)

    testTaskArchiver = TaskArchiverPoller(config=config)

    # Time only the archiver cycle itself, not the fixture creation.
    startTime = time.time()
    testTaskArchiver.algorithm()
    stopTime = time.time()

    # Every one of these tables must be empty once the archiver has run.
    for table in ("wmbs_job", "wmbs_subscription",
                  "wmbs_jobgroup", "wmbs_file_details"):
        result = myThread.dbi.processData(
            "SELECT * FROM %s" % table)[0].fetchall()
        self.assertEqual(len(result), 0)

    testWMBSFileset = Fileset(id=1)
    self.assertEqual(testWMBSFileset.exists(), False)

    logging.info("TaskArchiver took %f seconds", (stopTime - startTime))
def testA_StraightThrough(self):
    """
    _StraightThrough_

    Just run everything straight through without any variations.

    Drives one collection of jobs through JobCreator, JobSubmitter,
    JobTracker, JobAccountant, JobArchiver and TaskArchiver in turn and
    asserts the expected job states after each component has run.
    """
    # Do pre-submit job check
    nRunning = getCondorRunningJobs()
    self.assertEqual(nRunning, 0,
                     "User currently has %i running jobs. Test will not continue" % (nRunning))

    myThread = threading.current_thread()
    # The returned workload object is not used by this test.
    self.createTestWorkload()
    config = self.getConfig()

    name = 'WMAgent_Test1'
    site = self.sites[0]
    nSubs = 5
    nFiles = 10
    workloadPath = os.path.join(self.testDir, 'workloadTest',
                                'TestWorkload', 'WMSandbox',
                                'WMWorkload.pkl')

    # Create a collection of files
    self.createFileCollection(name=name, nSubs=nSubs,
                              nFiles=nFiles,
                              workflowURL=workloadPath,
                              site=site)

    ############################################################
    # Test the JobCreator

    config.Agent.componentName = 'JobCreator'
    testJobCreator = JobCreatorPoller(config=config)

    testJobCreator.algorithm()
    time.sleep(5)

    # Did all jobs get created?
    getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
    result = getJobsAction.execute(state='Created', jobType="Processing")
    self.assertEqual(len(result), nSubs * nFiles)

    # Count database objects
    result = myThread.dbi.processData('SELECT * FROM wmbs_sub_files_acquired')[0].fetchall()
    self.assertEqual(len(result), nSubs * nFiles)

    # Find the test directory
    testDirectory = os.path.join(self.testDir, 'TestWorkload', 'ReReco')
    self.assertTrue('JobCollection_1_0' in os.listdir(testDirectory))
    self.assertTrue(len(os.listdir(testDirectory)) <= 20)

    groupDirectory = os.path.join(testDirectory, 'JobCollection_1_0')

    # First job should be in here
    self.assertTrue('job_1' in os.listdir(groupDirectory))
    jobFile = os.path.join(groupDirectory, 'job_1', 'job.pkl')
    self.assertTrue(os.path.isfile(jobFile))
    # Pickle data must be read in binary mode; the context manager closes
    # the handle even when loading raises.
    with open(jobFile, 'rb') as handle:
        job = cPickle.load(handle)

    self.assertEqual(job['workflow'], name)
    self.assertEqual(len(job['input_files']), 1)
    self.assertEqual(os.path.basename(job['sandbox']), 'TestWorkload-Sandbox.tar.bz2')

    ###############################################################
    # Now test the JobSubmitter

    config.Agent.componentName = 'JobSubmitter'
    testJobSubmitter = JobSubmitterPoller(config=config)

    testJobSubmitter.algorithm()

    # Check that jobs are in the right state
    result = getJobsAction.execute(state='Created', jobType="Processing")
    self.assertEqual(len(result), 0)
    result = getJobsAction.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(result), nSubs * nFiles)

    # Check assigned locations
    getLocationAction = self.daoFactory(classname="Jobs.GetLocation")
    for jobID in result:
        loc = getLocationAction.execute(jobid=jobID)
        self.assertEqual(loc, [[site]])

    # Check to make sure we have running jobs
    nRunning = getCondorRunningJobs()
    self.assertEqual(nRunning, nFiles * nSubs)

    #################################################################
    # Now the JobTracker

    config.Agent.componentName = 'JobTracker'
    testJobTracker = JobTrackerPoller(config=config)
    testJobTracker.setup()

    testJobTracker.algorithm()

    # Running the algo without removing the jobs should do nothing
    result = getJobsAction.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(result), nSubs * nFiles)

    condorRM()
    time.sleep(1)

    # All jobs gone?
    nRunning = getCondorRunningJobs()
    self.assertEqual(nRunning, 0)

    testJobTracker.algorithm()
    time.sleep(5)

    # With the jobs removed, the tracker should have moved them all
    # from Executing to Complete
    result = getJobsAction.execute(state='Executing', jobType="Processing")
    self.assertEqual(len(result), 0)
    result = getJobsAction.execute(state='Complete', jobType="Processing")
    self.assertEqual(len(result), nSubs * nFiles)

    #################################################################
    # Now the JobAccountant

    # First you need to load all jobs
    self.getFWJRAction = self.daoFactory(classname="Jobs.GetFWJRByState")
    completeJobs = self.getFWJRAction.execute(state="complete")

    # Create reports for all jobs
    self.createReports(jobs=completeJobs, retryCount=0)

    config.Agent.componentName = 'JobAccountant'
    testJobAccountant = JobAccountantPoller(config=config)
    testJobAccountant.setup()

    # It should do something with the jobs
    testJobAccountant.algorithm()

    # All the jobs should be done now
    result = getJobsAction.execute(state='Complete', jobType="Processing")
    self.assertEqual(len(result), 0)
    result = getJobsAction.execute(state='Success', jobType="Processing")
    self.assertEqual(len(result), nSubs * nFiles)

    #######################################################################
    # Now the JobArchiver

    config.Agent.componentName = 'JobArchiver'
    testJobArchiver = JobArchiverPoller(config=config)

    testJobArchiver.algorithm()

    # All the jobs should be cleaned up
    result = getJobsAction.execute(state='Success', jobType="Processing")
    self.assertEqual(len(result), 0)
    result = getJobsAction.execute(state='Cleanout', jobType="Processing")
    self.assertEqual(len(result), nSubs * nFiles)

    logDir = os.path.join(self.testDir, 'logs')

    for jobInfo in completeJobs:
        self.assertFalse(os.path.exists(jobInfo['fwjr_path']))
        # Explicit floor division: the cluster index must be an integer
        # regardless of how the interpreter treats "/".
        clusterIndex = int(jobInfo['id'] // config.JobArchiver.numberOfJobsToCluster)
        jobFolder = 'JobCluster_%i' % (clusterIndex)
        jobPath = os.path.join(logDir, jobFolder, 'Job_%i.tar' % (jobInfo['id']))
        self.assertTrue(os.path.isfile(jobPath))
        self.assertTrue(os.path.getsize(jobPath) > 0)

    ###########################################################################
    # Now the TaskArchiver

    config.Agent.componentName = 'TaskArchiver'
    testTaskArchiver = TaskArchiverPoller(config=config)

    testTaskArchiver.algorithm()

    result = getJobsAction.execute(state='Cleanout', jobType="Processing")
    self.assertEqual(len(result), 0)

    for jdict in completeJobs:
        job = Job(id=jdict['id'])
        self.assertFalse(job.exists())

    # Leave a copy of the test directory under ./testDir, replacing any
    # copy left behind by an earlier run.
    if os.path.isdir('testDir'):
        shutil.rmtree('testDir')

    shutil.copytree('%s' % self.testDir, os.path.join(os.getcwd(), 'testDir'))

    return
def testE_multicore(self):
    """
    _multicore_

    Create a workload summary based on the multicore job report.

    Creates a job group with multicore=True, runs one
    TaskArchiverPoller.algorithm() cycle, checks that the WMBS tables
    queried below are empty and then compares three performance averages
    in the "TestWorkload" summary document against reference values.
    """
    myThread = threading.current_thread()

    config = self.getConfig()
    workloadPath = os.path.join(self.testDir, 'specDir', 'spec.pkl')
    workload = self.createWorkload(workloadName=workloadPath)
    # The returned job group is not used; the call is kept for what it sets up.
    self.createTestJobGroup(config=config,
                            name=workload.name(),
                            specLocation=workloadPath,
                            error=False,
                            multicore=True)

    cachePath = os.path.join(config.JobCreator.jobCacheDir,
                             "TestWorkload", "ReReco")
    os.makedirs(cachePath)
    self.assertTrue(os.path.exists(cachePath))

    dbname = config.TaskArchiver.workloadSummaryCouchDBName
    couchdb = CouchServer(config.JobStateMachine.couchurl)
    workdatabase = couchdb.connectDatabase(dbname)

    testTaskArchiver = TaskArchiverPoller(config=config)
    testTaskArchiver.algorithm()

    result = myThread.dbi.processData(
        "SELECT * FROM wmbs_job")[0].fetchall()
    self.assertEqual(len(result), 0, "No job should have survived")
    result = myThread.dbi.processData(
        "SELECT * FROM wmbs_subscription")[0].fetchall()
    self.assertEqual(len(result), 0)
    result = myThread.dbi.processData(
        "SELECT * FROM wmbs_jobgroup")[0].fetchall()
    self.assertEqual(len(result), 0)
    result = myThread.dbi.processData(
        "SELECT * FROM wmbs_file_details")[0].fetchall()
    self.assertEqual(len(result), 0)

    workloadSummary = workdatabase.document(id="TestWorkload")
    stepPerformance = workloadSummary['performance']['/TestWorkload/ReReco']['cmsRun1']

    # assertAlmostEqual is the supported name; the "Equals" alias is
    # deprecated and no longer exists in recent Python releases.
    self.assertAlmostEqual(
        stepPerformance['minMergeTime']['average'],
        5.7624950408900002, places=2)
    self.assertAlmostEqual(
        stepPerformance['numberOfMerges']['average'],
        3.0, places=2)
    self.assertAlmostEqual(
        stepPerformance['averageProcessTime']['average'],
        29.369966666700002, places=2)
    return
def testB_testErrors(self):
    """
    _testErrors_

    Test with a failed FWJR.

    Creates a job group with error=True, runs one
    TaskArchiverPoller.algorithm() cycle and checks the 'errors' and
    'histograms' sections of the resulting workload summary document.
    """
    config = self.getConfig()
    workloadPath = os.path.join(self.testDir, 'specDir', 'spec.pkl')
    workload = self.createWorkload(workloadName=workloadPath)
    # The returned job group is not used; the call is kept for what it sets up.
    self.createTestJobGroup(config=config,
                            name=workload.name(),
                            specLocation=workloadPath,
                            error=True)

    cachePath = os.path.join(config.JobCreator.jobCacheDir,
                             "TestWorkload", "ReReco")
    os.makedirs(cachePath)
    self.assertTrue(os.path.exists(cachePath))

    testTaskArchiver = TaskArchiverPoller(config=config)
    testTaskArchiver.algorithm()

    dbname = config.JobStateMachine.couchDBName
    couchdb = CouchServer(config.JobStateMachine.couchurl)
    workdatabase = couchdb.connectDatabase("%s/workloadsummary" % dbname)

    workloadSummary = workdatabase.document(id=workload.name())

    taskErrors = workloadSummary['errors']['/TestWorkload/ReReco']
    self.assertEqual(taskErrors['failureTime'], 500)
    # "in" replaces dict.has_key(), which is not available on Python 3.
    self.assertTrue('99999' in taskErrors['cmsRun1'])
    self.assertEqual(
        taskErrors['cmsRun1']['99999']['runs'], {'10': [12312]},
        "Wrong lumi information in the summary for failed jobs")

    # Check the failures by site histograms
    failuresBySite = workloadSummary['histograms']['workflowLevel']['failuresBySite']
    errorsBySite = workloadSummary['histograms']['stepLevel'][
        '/TestWorkload/ReReco']['cmsRun1']['errorsBySite']

    self.assertEqual(failuresBySite['data']['T1_IT_CNAF']['Failed Jobs'], 10)
    self.assertEqual(errorsBySite['data']['T1_IT_CNAF']['99999'], 10)
    self.assertEqual(errorsBySite['data']['T1_IT_CNAF']['8020'], 10)

    self.assertEqual(failuresBySite['average']['Failed Jobs'], 10)
    self.assertEqual(errorsBySite['average']['99999'], 10)
    self.assertEqual(errorsBySite['average']['8020'], 10)

    self.assertEqual(failuresBySite['stdDev']['Failed Jobs'], 0)
    self.assertEqual(errorsBySite['stdDev']['99999'], 0)
    self.assertEqual(errorsBySite['stdDev']['8020'], 0)
    return
def testA_BasicFunctionTest(self):
    """
    _BasicFunctionTest_

    Tests the components, by seeing if they can process a simple set of
    closeouts.

    Creates two job groups (the second on the LogCollect task), runs one
    TaskArchiverPoller.algorithm() cycle, checks that the WMBS tables and
    the job cache directory are cleaned up and then validates the content
    of the "TestWorkload" summary document.
    """
    myThread = threading.current_thread()

    config = self.getConfig()
    workloadPath = os.path.join(self.testDir, 'specDir', 'spec.pkl')
    workload = self.createWorkload(workloadName=workloadPath)
    # The returned job groups are not used; the calls are kept for what
    # they set up.
    self.createTestJobGroup(config=config,
                            name=workload.name(),
                            specLocation=workloadPath,
                            error=False)

    # Create second workload
    self.createTestJobGroup(config=config,
                            name=workload.name(),
                            filesetName="TestFileset_2",
                            specLocation=workloadPath,
                            task="/TestWorkload/ReReco/LogCollect")

    cachePath = os.path.join(config.JobCreator.jobCacheDir,
                             "TestWorkload", "ReReco")
    os.makedirs(cachePath)
    self.assertTrue(os.path.exists(cachePath))

    cachePath2 = os.path.join(config.JobCreator.jobCacheDir,
                              "TestWorkload", "LogCollect")
    os.makedirs(cachePath2)
    self.assertTrue(os.path.exists(cachePath2))

    result = myThread.dbi.processData(
        "SELECT * FROM wmbs_subscription")[0].fetchall()
    self.assertEqual(len(result), 2)

    workflowName = "TestWorkload"
    dbname = config.TaskArchiver.workloadSummaryCouchDBName
    couchdb = CouchServer(config.JobStateMachine.couchurl)
    workdatabase = couchdb.connectDatabase(dbname)
    jobdb = couchdb.connectDatabase("%s/jobs" % self.databaseName)
    # NOTE(review): the fwjrs handle is never used below; the call is kept
    # in case connecting has side effects the test relies on - confirm.
    couchdb.connectDatabase("%s/fwjrs" % self.databaseName)
    jobs = jobdb.loadView("JobDump", "jobsByWorkflowName",
                          options={"startkey": [workflowName],
                                   "endkey": [workflowName, {}]})['rows']
    self.assertEqual(len(jobs), 2 * self.nJobs)

    from WMCore.WMBS.CreateWMBSBase import CreateWMBSBase
    create = CreateWMBSBase()
    # NOTE(review): 'tables' is not read anywhere below - candidate for
    # removal once it is confirmed that nothing depends on it.
    tables = [x[2:] for x in create.requiredTables]

    testTaskArchiver = TaskArchiverPoller(config=config)
    testTaskArchiver.algorithm()

    result = myThread.dbi.processData(
        "SELECT * FROM wmbs_job")[0].fetchall()
    self.assertEqual(len(result), 0)
    result = myThread.dbi.processData(
        "SELECT * FROM wmbs_subscription")[0].fetchall()
    self.assertEqual(len(result), 0)
    result = myThread.dbi.processData(
        "SELECT * FROM wmbs_jobgroup")[0].fetchall()
    self.assertEqual(len(result), 0)
    result = myThread.dbi.processData(
        "SELECT * FROM wmbs_fileset")[0].fetchall()
    self.assertEqual(len(result), 0)
    result = myThread.dbi.processData(
        "SELECT * FROM wmbs_file_details")[0].fetchall()
    self.assertEqual(len(result), 0)

    # Make sure we deleted the directory
    self.assertFalse(os.path.exists(cachePath))
    self.assertFalse(
        os.path.exists(
            os.path.join(self.testDir, 'workloadTest/TestWorkload')))

    testWMBSFileset = Fileset(id=1)
    self.assertEqual(testWMBSFileset.exists(), False)

    workloadSummary = workdatabase.document(id="TestWorkload")
    # Check ACDC
    self.assertEqual(workloadSummary['ACDCServer'],
                     sanitizeURL(config.ACDC.couchurl)['url'])

    # Check the output
    # list() is required: on Python 3 a keys view never equals a list.
    self.assertEqual(list(workloadSummary['output'].keys()),
                     ['/Electron/MorePenguins-v0/RECO'])
    self.assertEqual(
        sorted(workloadSummary['output']['/Electron/MorePenguins-v0/RECO']
               ['tasks']),
        ['/TestWorkload/ReReco', '/TestWorkload/ReReco/LogCollect'])

    # Check performance
    rerecoPerformance = workloadSummary['performance']['/TestWorkload/ReReco']['cmsRun1']

    # Check histograms
    self.assertAlmostEqual(
        rerecoPerformance['AvgEventTime']['histogram'][0]['average'],
        0.89405199999999996, places=2)
    self.assertEqual(
        rerecoPerformance['AvgEventTime']['histogram'][0]['nEvents'], 10)

    # Check standard performance
    self.assertAlmostEqual(
        rerecoPerformance['TotalJobCPU']['average'],
        17.786300000000001, places=2)
    self.assertAlmostEqual(
        rerecoPerformance['TotalJobCPU']['stdDev'],
        0.0, places=2)

    # Check worstOffenders
    self.assertEqual(
        rerecoPerformance['AvgEventTime']['worstOffenders'],
        [{'logCollect': None, 'log': None, 'value': '0.894052', 'jobID': 1},
         {'logCollect': None, 'log': None, 'value': '0.894052', 'jobID': 1},
         {'logCollect': None, 'log': None, 'value': '0.894052', 'jobID': 2}])

    # Check retryData
    self.assertEqual(workloadSummary['retryData']['/TestWorkload/ReReco'],
                     {'1': 10})
    logCollectPFN = 'srm://srm-cms.cern.ch:8443/srm/managerv2?SFN=/castor/cern.ch/cms/store/logs/prod/2012/11/WMAgent/Run206446-MinimumBias-Run2012D-v1-Tier1PromptReco-4af7e658-23a4-11e2-96c7-842b2b4671d8/Run206446-MinimumBias-Run2012D-v1-Tier1PromptReco-4af7e658-23a4-11e2-96c7-842b2b4671d8-AlcaSkimLogCollect-1-logs.tar'
    self.assertEqual(workloadSummary['logArchives'],
                     {'/TestWorkload/ReReco/LogCollect': [logCollectPFN] * 10})

    # LogCollect task is made out of identical FWJRs
    # assert that it is identical
    logCollectPerformance = workloadSummary['performance'][
        '/TestWorkload/ReReco/LogCollect']['cmsRun1']
    for x in logCollectPerformance.keys():
        # Histogram-type keys are skipped; only the plain statistics are compared.
        if x in config.TaskArchiver.histogramKeys:
            continue
        for y in ['average', 'stdDev']:
            self.assertAlmostEqual(
                logCollectPerformance[x][y],
                workloadSummary['performance']['/TestWorkload/ReReco']
                ['cmsRun1'][x][y],
                places=2)

    return
def testB_testErrors(self):
    """
    _testErrors_

    Test with a failed FWJR.

    Two job groups are created (the first with error=True, the second a
    LogCollect one), the workflow is populated with a complete status,
    then TaskArchiverPoller and CleanCouchPoller are each run once. The
    'errors' and 'histograms' sections of the workload summary document
    are checked afterwards.
    """
    config = self.getConfig()
    specPath = os.path.join(self.testDir, 'specDir', 'spec.pkl')
    workload = self.createWorkload(workloadName=specPath)
    _failedGroup = self.createTestJobGroup(config=config,
                                           name=workload.name(),
                                           specLocation=specPath,
                                           error=True)

    # Second job group, attached to the LogCollect task
    _logCollectGroup = self.createTestJobGroup(
        config=config,
        name=workload.name(),
        filesetName="TestFileset_2",
        specLocation=specPath,
        task="/TestWorkload/ReReco/LogCollect",
        jobType="LogCollect")

    jobCache = os.path.join(config.JobCreator.jobCacheDir,
                            "TestWorkload", "ReReco")
    os.makedirs(jobCache)
    self.assertTrue(os.path.exists(jobCache))

    server = CouchServer(config.JobStateMachine.couchurl)
    jobsDatabase = server.connectDatabase("%s/jobs" % self.databaseName)
    fwjrsDatabase = server.connectDatabase("%s/fwjrs" % self.databaseName)

    # Both views are queried and their 'rows' entry accessed; the rows
    # themselves are not inspected.
    jobsDatabase.loadView(
        "JobDump", "jobsByWorkflowName",
        options={"startkey": [workload.name()],
                 "endkey": [workload.name(), {}]})['rows']
    fwjrsDatabase.loadView(
        "FWJRDump", "fwjrsByWorkflowName",
        options={"startkey": [workload.name()],
                 "endkey": [workload.name(), {}]})['rows']

    self.populateWorkflowWithCompleteStatus()

    archiver = TaskArchiverPoller(config=config)
    archiver.algorithm()

    cleaner = CleanCouchPoller(config=config)
    cleaner.setup()
    cleaner.algorithm()

    couchName = config.JobStateMachine.couchDBName
    summaryDatabase = server.connectDatabase("%s/workloadsummary" % couchName)

    workloadSummary = summaryDatabase.document(id=workload.name())

    self.assertEqual(
        workloadSummary['errors']['/TestWorkload/ReReco']['failureTime'],
        500)
    self.assertTrue(
        '99999' in workloadSummary['errors']['/TestWorkload/ReReco']['cmsRun1'])

    runsWithFailures = workloadSummary['errors']['/TestWorkload/ReReco'][
        'cmsRun1']['99999']['runs']
    self.assertEqual(
        runsWithFailures, {'10': [[12312, 12312]]},
        "Wrong lumi information in the summary for failed jobs")

    # Check the failures by site histograms.
    # Each entry is (key path inside workloadSummary['histograms'], expected
    # value); the paths are walked lazily, one entry at a time, in this order.
    workflowPrefix = ('workflowLevel', 'failuresBySite')
    stepPrefix = ('stepLevel', '/TestWorkload/ReReco', 'cmsRun1',
                  'errorsBySite')
    expectations = (
        (workflowPrefix + ('data', 'T1_IT_CNAF', 'Failed Jobs'), 10),
        (stepPrefix + ('data', 'T1_IT_CNAF', '99999'), 10),
        (stepPrefix + ('data', 'T1_IT_CNAF', '8020'), 10),
        (workflowPrefix + ('average', 'Failed Jobs'), 10),
        (stepPrefix + ('average', '99999'), 10),
        (stepPrefix + ('average', '8020'), 10),
        (workflowPrefix + ('stdDev', 'Failed Jobs'), 0),
        (stepPrefix + ('stdDev', '99999'), 0),
        (stepPrefix + ('stdDev', '8020'), 0),
    )
    for keyPath, expected in expectations:
        node = workloadSummary['histograms']
        for key in keyPath:
            node = node[key]
        self.assertEqual(node, expected)

    return