def setUp(self):
    """
    _setUp_

    Standard setup: Now with 100% more couch
    """
    self.testInit = TestInit(__file__)
    self.testInit.setLogging()
    self.testInit.setDatabaseConnection()

    # Install the schema for every agent-side module these tests touch.
    self.testInit.setSchema(customModules=["WMCore.WMBS",
                                           "WMCore.BossAir",
                                           "WMCore.ResourceControl",
                                           "WMCore.Agent.Database"])

    # Couch databases backing job state, framework job reports and WMStats.
    self.testInit.setupCouch("jobsubmitter_t/jobs", "JobDump")
    self.testInit.setupCouch("jobsubmitter_t/fwjrs", "FWJRDump")
    self.testInit.setupCouch("wmagent_summary_t", "WMStats")

    thisThread = threading.currentThread()
    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=thisThread.logger,
                                 dbinterface=thisThread.dbi)
    self.baDaoFactory = DAOFactory(package="WMCore.BossAir",
                                   logger=thisThread.logger,
                                   dbinterface=thisThread.dbi)

    self.testDir = self.testInit.generateWorkDir()

    # Register a heartbeat entry for the component under test.
    self.componentName = 'JobSubmitter'
    self.heartbeatAPI = HeartbeatAPI(self.componentName)
    self.heartbeatAPI.registerComponent()

    self.configFile = EmulatorSetup.setupWMAgentConfig()
    return
def setUp(self):
    """
    _setUp_

    Setup the database and logging connection.  Try to create all of the
    WMBS tables.  Also, create some dummy locations.
    """
    super(JobCreatorTest, self).setUp()

    self.testInit = TestInit(__file__)
    self.testInit.setLogging()
    self.testInit.setDatabaseConnection()
    self.testInit.setSchema(customModules=['WMCore.WMBS',
                                           'WMCore.ResourceControl',
                                           'WMCore.Agent.Database'],
                            useDefault=False)

    # Couch databases holding the job state machine documents.
    self.couchdbname = "jobcreator_t"
    self.testInit.setupCouch("%s/jobs" % self.couchdbname, "JobDump")
    self.testInit.setupCouch("%s/fwjrs" % self.couchdbname, "FWJRDump")

    self.configFile = EmulatorSetup.setupWMAgentConfig()

    thisThread = threading.currentThread()
    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=thisThread.logger,
                                 dbinterface=thisThread.dbi)

    # Register every test site as a WMBS location ...
    newLocation = self.daoFactory(classname="Locations.New")
    for siteName in self.sites:
        newLocation.execute(siteName=siteName, pnn=siteName)

    # ... and mirror it in ResourceControl with generous thresholds.
    resourceControl = ResourceControl()
    for siteName in self.sites:
        resourceControl.insertSite(siteName=siteName, pnn=siteName,
                                   ceName=siteName)
        resourceControl.insertThreshold(siteName=siteName,
                                        taskType='Processing',
                                        maxSlots=10000,
                                        pendingSlots=10000)
    self.resourceControl = resourceControl

    self._setup = True
    self._teardown = False

    self.testDir = self.testInit.generateWorkDir()
    self.cwd = os.getcwd()

    # Register a heartbeat entry for the component under test.
    self.componentName = 'JobCreator'
    self.heartbeatAPI = HeartbeatAPI(self.componentName)
    self.heartbeatAPI.registerComponent()

    # unittest renamed assertItemsEqual to assertCountEqual in Python 3.
    if PY3:
        self.assertItemsEqual = self.assertCountEqual
    return
def setUp(self):
    """
    setup for test.
    """
    thisThread = threading.currentThread()

    self.testInit = TestInit(__file__)
    self.testInit.setLogging()
    self.testInit.setDatabaseConnection(destroyAllDatabase=True)
    # Wipe any leftovers from a previous run before installing the schema.
    self.tearDown()
    self.testInit.setSchema(customModules=["WMCore.WMBS",
                                           "WMCore.BossAir",
                                           "WMCore.ResourceControl",
                                           "WMCore.Agent.Database"],
                            useDefault=False)
    self.testInit.setupCouch("bossair_t/jobs", "JobDump")
    self.testInit.setupCouch("bossair_t/fwjrs", "FWJRDump")

    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=thisThread.logger,
                                 dbinterface=thisThread.dbi)
    self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")

    # Create sites in resourceControl
    resourceControl = ResourceControl()
    for siteName in self.sites:
        resourceControl.insertSite(siteName=siteName,
                                   pnn='%s_PNN' % siteName,
                                   cmsName=siteName,
                                   ceName=siteName,
                                   plugin="SimpleCondorPlugin",
                                   pendingSlots=1000,
                                   runningSlots=2000)
        resourceControl.insertThreshold(siteName=siteName,
                                        taskType='Processing',
                                        maxSlots=1000,
                                        pendingSlots=1000)

    # One extra site wired to the test plugin, with larger thresholds.
    xanadu = 'T3_US_Xanadu'
    resourceControl.insertSite(siteName=xanadu,
                               pnn='%s_PNN' % xanadu,
                               cmsName=xanadu,
                               ceName=xanadu,
                               plugin="TestPlugin")
    resourceControl.insertThreshold(siteName=xanadu,
                                    taskType='Processing',
                                    maxSlots=10000,
                                    pendingSlots=10000)

    # Create user
    newuser = self.daoFactory(classname="Users.New")
    newuser.execute(dn="tapas", group_name="phgroup", role_name="cmsrole")

    # We actually need the user name
    self.user = getpass.getuser()

    # Change this to the working dir to keep track of error and log files
    # from condor
    self.testDir = self.testInit.generateWorkDir()

    # Register heartbeats for the two components these tests exercise.
    self.heartbeatAPI = HeartbeatAPI('test')
    self.heartbeatAPI.registerComponent()
    self.heartbeatAPI2 = HeartbeatAPI('JobTracker')
    self.heartbeatAPI2.registerComponent()
    return
def setUp(self):
    """
    _setUp_

    setUp function for unittest

    Installs the DBS3Buffer/Agent schema, a GroupUser/ConfigCache couch
    database, a few dummy locations, and a config-cache document whose
    identifier is published through self.configURL.
    """
    # Set constants
    self.couchDB = "config_test"
    self.configURL = "RANDOM;;URL;;NAME"
    self.configString = "This is a random string"

    self.testInit = TestInit(__file__)
    self.testInit.setLogging()
    self.testInit.setDatabaseConnection()
    self.testInit.setSchema(
        customModules=["WMComponent.DBS3Buffer", 'WMCore.Agent.Database'],
        useDefault=False)
    self.testInit.setupCouch(self.couchDB, "GroupUser", "ConfigCache")

    myThread = threading.currentThread()
    self.bufferFactory = DAOFactory(
        package="WMComponent.DBSBuffer.Database",
        logger=myThread.logger,
        dbinterface=myThread.dbi)
    self.buffer3Factory = DAOFactory(package="WMComponent.DBS3Buffer",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)

    locationAction = self.bufferFactory(
        classname="DBSBufferFiles.AddLocation")
    locationAction.execute(siteName="se1.cern.ch")
    locationAction.execute(siteName="se1.fnal.gov")
    locationAction.execute(siteName="malpaquet")

    # Set heartbeat
    self.componentName = 'JobSubmitter'
    self.heartbeatAPI = HeartbeatAPI(self.componentName)
    self.heartbeatAPI.registerComponent()

    # Set up a config cache
    configCache = ConfigCache(os.environ["COUCHURL"],
                              couchDBName=self.couchDB)
    configCache.createUserGroup(groupname="testGroup", username='******')
    self.testDir = self.testInit.generateWorkDir()

    psetPath = os.path.join(self.testDir, "PSet.txt")
    # Context manager guarantees the handle is closed even if the write
    # fails (the original open/write/close leaked the handle on error).
    with open(psetPath, 'w') as f:
        f.write(self.configString)

    configCache.addConfig(newConfig=psetPath, psetHash=None)
    configCache.save()
    self.configURL = "%s;;%s;;%s" % (os.environ["COUCHURL"],
                                     self.couchDB,
                                     configCache.getCouchID())
    return
def setUp(self):
    """
    _setUp_

    Set up vital components
    """
    self.testInit = TestInit(__file__)
    self.testInit.setLogging()
    self.testInit.setDatabaseConnection()
    self.testInit.setSchema(customModules=["WMCore.WMBS",
                                           'WMCore.MsgService',
                                           'WMCore.ResourceControl',
                                           'WMCore.ThreadPool',
                                           'WMCore.Agent.Database'],
                            useDefault=False)

    thisThread = threading.currentThread()
    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=thisThread.logger,
                                 dbinterface=thisThread.dbi)

    # Register each site as a WMBS location with a pool of pending slots.
    newLocation = self.daoFactory(classname="Locations.New")
    setPending = self.daoFactory(classname="Locations.SetPendingSlots")
    for siteName in self.sites:
        newLocation.execute(siteName=siteName,
                            seName='se.%s' % (siteName),
                            ceName=siteName)
        setPending.execute(siteName=siteName, pendingSlots=1000)

    # Mirror the sites into ResourceControl with generous thresholds.
    resourceControl = ResourceControl()
    for siteName in self.sites:
        resourceControl.insertSite(siteName=siteName,
                                   seName='se.%s' % (siteName),
                                   ceName=siteName)
        resourceControl.insertThreshold(siteName=siteName,
                                        taskType='Processing',
                                        maxSlots=10000,
                                        pendingSlots=10000)

    self.testDir = self.testInit.generateWorkDir()

    # Register a heartbeat for every component this test drives.
    for componentName in self.components:
        HeartbeatAPI(componentName).registerComponent()
    return
def setUp(self):
    """
    _setUp_

    Standard setup: Now with 100% more couch
    """
    self.testInit = TestInit(__file__)
    self.testInit.setLogging()
    self.testInit.setDatabaseConnection()

    # Schema for every agent-side database module used here.
    schemaModules = ["WMCore.WMBS", "WMCore.BossAir",
                     "WMCore.ResourceControl", "WMCore.Agent.Database"]
    self.testInit.setSchema(customModules=schemaModules)

    # Couch databases for job state, job reports and WMStats.
    self.testInit.setupCouch("jobsubmitter_t/jobs", "JobDump")
    self.testInit.setupCouch("jobsubmitter_t/fwjrs", "FWJRDump")
    self.testInit.setupCouch("wmagent_summary_t", "WMStats")

    thisThread = threading.currentThread()
    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=thisThread.logger,
                                 dbinterface=thisThread.dbi)
    self.baDaoFactory = DAOFactory(package="WMCore.BossAir",
                                   logger=thisThread.logger,
                                   dbinterface=thisThread.dbi)

    self.testDir = self.testInit.generateWorkDir()

    # Register a heartbeat entry for the component under test.
    self.componentName = 'JobSubmitter'
    self.heartbeatAPI = HeartbeatAPI(self.componentName)
    self.heartbeatAPI.registerComponent()
    return
def setUp(self):
    """
    _setUp_

    Setup the database and logging connection.  Try to create all of the
    WMBS tables.  Also, create some dummy locations.
    """
    # Thread-local handles for logger/dbi; fetched once (the original
    # called threading.currentThread() a second, redundant time).
    myThread = threading.currentThread()

    self.testInit = TestInit(__file__)
    self.testInit.setLogging()
    self.testInit.setDatabaseConnection()
    #self.tearDown()
    self.testInit.setSchema(customModules=['WMCore.WMBS',
                                           'WMCore.ResourceControl',
                                           'WMCore.Agent.Database'],
                            useDefault=False)
    self.couchdbname = "jobcreator_t"
    self.testInit.setupCouch("%s/jobs" % self.couchdbname, "JobDump")
    self.testInit.setupCouch("%s/fwjrs" % self.couchdbname, "FWJRDump")

    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=myThread.logger,
                                 dbinterface=myThread.dbi)

    locationAction = self.daoFactory(classname="Locations.New")
    for site in self.sites:
        locationAction.execute(siteName=site, seName=site)

    # Create sites in resourceControl
    resourceControl = ResourceControl()
    for site in self.sites:
        resourceControl.insertSite(siteName=site, seName=site, ceName=site)
        resourceControl.insertThreshold(siteName=site, taskType='Processing',
                                        maxSlots=10000, pendingSlots=10000)
    self.resourceControl = resourceControl

    self._setup = True
    self._teardown = False

    self.testDir = self.testInit.generateWorkDir()
    self.cwd = os.getcwd()

    # Set heartbeat
    self.componentName = 'JobCreator'
    self.heartbeatAPI = HeartbeatAPI(self.componentName)
    self.heartbeatAPI.registerComponent()

    return
def prepareToStart(self):
    """
    _prepareToStart_

    returns: Nothing

    Starts the initialization procedure.  It is mainly an aggregation
    method so it can easily used in tests.
    """
    self.state = 'initialize'
    self.initInThread()

    # note: every component gets a (unique) name:
    # self.config.Agent.componentName
    logging.info(">>>Registering Component - %s", self.config.Agent.componentName)

    if getattr(self.config.Agent, "useHeartbeat", True):
        self.heartbeatAPI = HeartbeatAPI(self.config.Agent.componentName)
        self.heartbeatAPI.registerComponent()

    logging.info('>>>Starting initialization')
    logging.info('>>>Setting default transaction')

    thisThread = threading.currentThread()

    self.preInitialization()
    if thisThread.sql_transaction:
        thisThread.transaction.begin()

    self.initialization()
    self.postInitialization()

    if thisThread.sql_transaction:
        thisThread.transaction.commit()
    logging.info('>>>Committing default transaction')

    logging.info(">>>Starting worker threads")
    thisThread.workerThreadManager.resumeWorkers()

    logging.info(">>>Initialization finished!\n")
    # wait for messages
    self.state = 'active'
def setUp(self):
    """
    _setUp_

    setUp function for unittest
    """
    # Set constants
    self.couchDB = "config_test"
    self.configURL = "RANDOM;;URL;;NAME"
    self.configString = "This is a random string"

    self.testInit = TestInit(__file__)
    self.testInit.setLogging()
    self.testInit.setDatabaseConnection()
    self.testInit.setSchema(customModules=["WMComponent.DBS3Buffer",
                                           'WMCore.Agent.Database'],
                            useDefault=False)
    self.testInit.setupCouch(self.couchDB, "GroupUser", "ConfigCache")

    myThread = threading.currentThread()
    self.bufferFactory = DAOFactory(package="WMComponent.DBSBuffer.Database",
                                    logger=myThread.logger,
                                    dbinterface=myThread.dbi)

    locationAction = self.bufferFactory(classname="DBSBufferFiles.AddLocation")
    locationAction.execute(siteName="se1.cern.ch")
    locationAction.execute(siteName="se1.fnal.gov")
    locationAction.execute(siteName="malpaquet")

    # Set heartbeat
    self.componentName = 'JobSubmitter'
    self.heartbeatAPI = HeartbeatAPI(self.componentName)
    self.heartbeatAPI.registerComponent()

    # Set up a config cache
    configCache = ConfigCache(os.environ["COUCHURL"], couchDBName=self.couchDB)
    configCache.createUserGroup(groupname="testGroup", username='******')
    self.testDir = self.testInit.generateWorkDir()

    psetPath = os.path.join(self.testDir, "PSet.txt")
    # Context manager guarantees the handle is closed even on a failed
    # write (the original open/write/close leaked the handle on error).
    with open(psetPath, 'w') as f:
        f.write(self.configString)

    configCache.addConfig(newConfig=psetPath, psetHash=None)
    configCache.save()
    self.configURL = "%s;;%s;;%s" % (os.environ["COUCHURL"],
                                     self.couchDB,
                                     configCache.getCouchID())
    return
def __init__(self, slaveClassName, totalSlaves, componentDir,
             config, slaveInit=None, namespace=None):
    """
    __init__

    Constructor for the process pool.  The slave class name must be based
    inside the WMComponent namespace.  For examples, the JobAccountant would
    pass in 'JobAccountant.AccountantWorker' to run the AccountantWorker
    class.  All log files will be stored in the component directory that is
    passed in.  Each slave will have its own log file.

    Note that the config is only used to determine database connection
    parameters.  It is not passed to the slave class.  The slaveInit
    parameter will be serialized and passed to the slave class's
    constructor.
    """
    self.enqueueIndex = 0
    self.dequeueIndex = 0
    self.runningWork = 0

    # Use the Services.Requests JSONizer, which handles __to_json__ calls
    self.jsonHandler = JSONRequests()

    # heartbeat should be registered at this point
    if getattr(config.Agent, "useHeartbeat", True):
        self.heartbeatAPI = HeartbeatAPI(getattr(config.Agent,
                                                 "componentName",
                                                 "ProcPoolSlave"))

    self.slaveClassName = slaveClassName
    self.componentDir = componentDir
    self.config = config

    # Interpreter name following the pythonA.B convention, e.g. python2.4
    # for v2.4.X.  BUGFIX: the old guard `if majorVersion and minorVersion`
    # fell back to "python2.4" for any X.0 release because 0 is falsy;
    # sys.version_info always carries both fields, so build unconditionally.
    self.versionString = "python%i.%i" % (sys.version_info[0],
                                          sys.version_info[1])

    self.workers = []
    self.nSlaves = totalSlaves
    self.slaveInit = slaveInit
    self.namespace = namespace

    # Now actually create the slaves
    self.createSlaves()
    return
def setUp(self):
    """
    _setUp_

    Setup the database and logging connection.  Try to create all of
    the Heartbeat tables.  Also add some dummy locations.
    """
    self.testInit = TestInit(__file__)
    # logLevel = logging.SQLDEBUG
    self.testInit.setLogging()
    self.testInit.setDatabaseConnection()
    # Only the agent database schema is required for heartbeat tests.
    self.testInit.setSchema(customModules=["WMCore.Agent.Database"],
                            useDefault=False)

    self.heartbeat = HeartbeatAPI("testComponent")
def setUp(self):
    """
    Standard setup: Now with 100% more couch
    """
    self.testInit = TestInit(__file__)
    self.testInit.setLogging()
    self.testInit.setDatabaseConnection(destroyAllDatabase=True)
    self.testInit.setSchema(customModules=["WMCore.WMBS",
                                           "WMCore.BossAir",
                                           "WMCore.ResourceControl",
                                           "WMCore.Agent.Database"],
                            useDefault=False)
    self.testInit.setupCouch("jobsubmitter_t/jobs", "JobDump")
    self.testInit.setupCouch("jobsubmitter_t/fwjrs", "FWJRDump")

    thisThread = threading.currentThread()
    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=thisThread.logger,
                                 dbinterface=thisThread.dbi)
    # These DAO handles are constructed but never executed below; kept to
    # preserve the original behavior (DAO construction may have side
    # effects — TODO confirm).
    locationAction = self.daoFactory(classname="Locations.New")
    locationSlots = self.daoFactory(classname="Locations.SetJobSlots")

    # We actually need the user name
    self.user = getpass.getuser()
    self.ceName = "127.0.0.1"

    # Create sites in resourceControl
    resourceControl = ResourceControl()
    for siteName in self.sites:
        resourceControl.insertSite(siteName=siteName,
                                   seName="se.%s" % (siteName),
                                   ceName=siteName,
                                   plugin="CondorPlugin",
                                   pendingSlots=10000,
                                   runningSlots=20000,
                                   cmsName=siteName)
        resourceControl.insertThreshold(siteName=siteName,
                                        taskType="Processing",
                                        maxSlots=10000)

    self.testDir = self.testInit.generateWorkDir()

    # Set heartbeat
    self.componentName = "JobSubmitter"
    self.heartbeatAPI = HeartbeatAPI(self.componentName)
    self.heartbeatAPI.registerComponent()
    return
def prepareToStart(self):
    """
    _prepareToStart_

    returns: Nothing

    Starts the initialization procedure. It is mainly an aggregation
    method so it can easily used in tests.
    """
    self.state = 'initialize'
    self.initInThread()
    # note: every component gets a (unique) name:
    # self.config.Agent.componentName
    # Lazy %-args: the message is only formatted if the level is enabled.
    logging.info(">>>Registering Component - %s", self.config.Agent.componentName)

    if getattr(self.config.Agent, "useHeartbeat", True):
        self.heartbeatAPI = HeartbeatAPI(self.config.Agent.componentName)
        self.heartbeatAPI.registerComponent()

    logging.info('>>>Starting initialization')

    logging.info('>>>Setting default transaction')
    myThread = threading.currentThread()

    self.preInitialization()

    if myThread.sql_transaction:
        myThread.transaction.begin()

    self.initialization()
    self.postInitialization()

    if myThread.sql_transaction:
        myThread.transaction.commit()
    logging.info('>>>Committing default transaction')

    logging.info(">>>Starting worker threads")
    myThread.workerThreadManager.resumeWorkers()

    logging.info(">>>Initialization finished!\n")
    # wait for messages
    self.state = 'active'
def prepareWorker(self, worker, idleTime):
    """
    Prepares a worker thread before running.

    :param worker: worker thread object to initialise
    :param idleTime: idle time between two work cycles for this worker
    """
    # Work timing
    worker.idleTime = idleTime
    worker.component = self.component

    # Hold the lock while allocating a slave id so ids stay unique; the
    # context manager releases it even if an exception is raised (the
    # original bare acquire/release pair could leave the lock held).
    with self.lock:
        self.slavecounter += 1
        worker.slaveid = "%s-%s" % (self.wtmnumber, self.slavecounter)

    # Thread synchronisation
    worker.notifyTerminate = self.terminateSlaves
    worker.terminateCallback = self.slaveTerminateCallback
    worker.notifyPause = self.pauseSlaves
    worker.notifyResume = self.resumeSlaves

    if hasattr(self.component.config, "Agent"):
        if getattr(self.component.config.Agent, "useHeartbeat", True):
            worker.heartbeatAPI = HeartbeatAPI(self.component.config.Agent.componentName)
def testHeartbeat(self):
    """
    Exercise HeartbeatAPI: registering components, updating worker
    heartbeats, and recording worker errors.  getHeartbeatInfo() appears
    to return one entry per registered component, each reflecting that
    component's most recently updated worker — TODO confirm against the
    HeartbeatAPI implementation.
    """
    # First component: no info until a worker heartbeat is recorded.
    testComponent = HeartbeatAPI("testComponent")
    testComponent.pollInterval = 10
    testComponent.registerComponent()
    self.assertEqual(testComponent.getHeartbeatInfo(), [])

    testComponent.updateWorkerHeartbeat("testWorker")
    result = testComponent.getHeartbeatInfo()
    self.assertEqual(len(result), 1)
    self.assertEqual(result[0]['worker_name'], "testWorker")

    # sleep(1) between updates so successive heartbeats get distinct
    # timestamps; the reported worker is the most recently updated one.
    time.sleep(1)
    testComponent.updateWorkerHeartbeat("testWorker2")
    result = testComponent.getHeartbeatInfo()
    self.assertEqual(len(result), 1)
    self.assertEqual(result[0]['worker_name'], "testWorker2")

    time.sleep(1)
    testComponent.updateWorkerHeartbeat("testWorker")
    result = testComponent.getHeartbeatInfo()
    self.assertEqual(len(result), 1)
    self.assertEqual(result[0]['worker_name'], "testWorker")

    # Second component: info now carries one row per component.
    testComponent = HeartbeatAPI("test2Component")
    testComponent.pollInterval = 20
    testComponent.registerComponent()

    time.sleep(1)
    testComponent.updateWorkerHeartbeat("test2Worker")
    result = testComponent.getHeartbeatInfo()
    self.assertEqual(len(result), 2)
    self.assertEqual(result[0]['worker_name'], "testWorker")
    self.assertEqual(result[1]['worker_name'], "test2Worker")

    time.sleep(1)
    testComponent.updateWorkerHeartbeat("test2Worker2")
    result = testComponent.getHeartbeatInfo()
    self.assertEqual(len(result), 2)
    self.assertEqual(result[0]['worker_name'], "testWorker")
    self.assertEqual(result[1]['worker_name'], "test2Worker2")

    time.sleep(1)
    testComponent.updateWorkerHeartbeat("test2Worker")
    result = testComponent.getHeartbeatInfo()
    self.assertEqual(len(result), 2)
    self.assertEqual(result[0]['worker_name'], "testWorker")
    self.assertEqual(result[1]['worker_name'], "test2Worker")

    # An error recorded for a worker shows up in that component's row.
    testComponent.updateWorkerError("test2Worker", "Error1")
    result = testComponent.getHeartbeatInfo()
    self.assertEqual(result[1]['error_message'], "Error1")
class ProcessPool:
    """
    Pool of slave subprocesses that each load a WMComponent worker class
    and receive work over stdin/stdout as JSON.
    """

    def __init__(self, slaveClassName, totalSlaves, componentDir,
                 config, slaveInit=None, namespace=None):
        """
        __init__

        Constructor for the process pool.  The slave class name must be
        based inside the WMComponent namespace.  For examples, the
        JobAccountant would pass in 'JobAccountant.AccountantWorker' to run
        the AccountantWorker class.  All log files will be stored in the
        component directory that is passed in.  Each slave will have its
        own log file.

        Note that the config is only used to determine database connection
        parameters.  It is not passed to the slave class.  The slaveInit
        parameter will be serialized and passed to the slave class's
        constructor.
        """
        self.enqueueIndex = 0
        self.dequeueIndex = 0
        self.runningWork = 0

        # Use the Services.Requests JSONizer, which handles __to_json__ calls
        self.jsonHandler = JSONRequests()

        # heartbeat should be registered at this point
        if getattr(config.Agent, "useHeartbeat", True):
            self.heartbeatAPI = HeartbeatAPI(getattr(config.Agent,
                                                     "componentName",
                                                     "ProcPoolSlave"))

        self.slaveClassName = slaveClassName
        self.componentDir = componentDir
        self.config = config

        # Interpreter name following the pythonA.B convention (python2.4
        # for v2.4.X).  BUGFIX: the old `if major and minor` guard fell
        # back to "python2.4" for any X.0 release since 0 is falsy;
        # sys.version_info always has both fields, so build unconditionally.
        self.versionString = "python%i.%i" % (sys.version_info[0],
                                              sys.version_info[1])

        self.workers = []
        self.nSlaves = totalSlaves
        self.slaveInit = slaveInit
        self.namespace = namespace

        # Now actually create the slaves
        self.createSlaves()
        return

    def createSlaves(self):
        """
        _createSlaves_

        Create the slaves by using the values from __init__().
        Moving it into a separate function allows us to restart all
        of them.
        """
        totalSlaves = self.nSlaves
        config = self.config
        slaveInit = self.slaveInit
        namespace = self.namespace

        slaveArgs = [self.versionString, __file__, self.slaveClassName]

        if hasattr(config.CoreDatabase, "socket"):
            socket = config.CoreDatabase.socket
        else:
            socket = None

        # Map the connect URL scheme onto the dialect name the slave
        # expects.  NOTE(review): an unrecognised scheme leaves `dialect`
        # unbound and raises NameError below — consider an explicit raise.
        (connectDialect, _) = config.CoreDatabase.connectUrl.split(":", 1)
        if connectDialect.lower() == "mysql":
            dialect = "MySQL"
        elif connectDialect.lower() == "oracle":
            dialect = "Oracle"
        elif connectDialect.lower() == "sqlite":
            dialect = "SQLite"

        dbConfig = {"dialect": dialect,
                    "connectUrl": config.CoreDatabase.connectUrl,
                    "socket": socket,
                    "componentDir": self.componentDir}
        if namespace:
            # Then add a namespace to the config
            dbConfig['namespace'] = namespace

        encodedDBConfig = self.jsonHandler.encode(dbConfig)

        if slaveInit is None:
            encodedSlaveInit = None
        else:
            encodedSlaveInit = self.jsonHandler.encode(slaveInit)

        count = 0
        while totalSlaves > 0:
            # For each worker you want create a slave process.
            # That process calls this code (WMCore.ProcessPool) and opens
            # a process pool that loads the designated class.
            slaveProcess = subprocess.Popen(slaveArgs,
                                            stdin=subprocess.PIPE,
                                            stdout=subprocess.PIPE)
            # First line: DB config; second line: serialized slaveInit
            # (blank if there is none).
            slaveProcess.stdin.write("%s\n" % encodedDBConfig)
            if encodedSlaveInit is None:
                slaveProcess.stdin.write("\n")
            else:
                slaveProcess.stdin.write("%s\n" % encodedSlaveInit)
            slaveProcess.stdin.flush()
            self.workers.append(WorkerProcess(subproc=slaveProcess))
            workerName = self._subProcessName(self.slaveClassName, count)

            if getattr(self.config.Agent, "useHeartbeat", True):
                self.heartbeatAPI.updateWorkerHeartbeat(workerName,
                                                        pid=slaveProcess.pid)
            totalSlaves -= 1
            count += 1

        return

    def _subProcessName(self, slaveClassName, sequence):
        """
        subProcessName for heartbeat
        could change to use process ID as a suffix
        """
        return "%s_%s" % (slaveClassName, sequence + 1)

    def __del__(self):
        """
        __del__

        Kill all the workers processes by sending them an invalid JSON
        object.  This will cause them to shut down.
        """
        for worker in self.workers:
            try:
                worker.delete()
            except Exception:
                # Best-effort teardown: ignore failures during shutdown.
                # (Fixed: `except Exception, ex` is Python-2-only syntax.)
                pass
        self.workers = []
        return
class DBSUploadTest(unittest.TestCase):
    """
    TestCase for DBSUpload module

    Note:
      This fails if you use the in-memory syntax for sqlite
      i.e. (DATABASE = sqlite://)
    """
    _maxMessage = 10

    def setUp(self):
        """
        _setUp_

        setUp function for unittest
        """
        # Set constants
        self.couchDB = "config_test"
        self.configURL = "RANDOM;;URL;;NAME"
        self.configString = "This is a random string"

        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        self.testInit.setSchema(customModules=["WMComponent.DBS3Buffer",
                                               "WMCore.Agent.Database"],
                                useDefault=False)
        self.testInit.setupCouch(self.couchDB, "GroupUser", "ConfigCache")

        myThread = threading.currentThread()
        self.bufferFactory = DAOFactory(
            package="WMComponent.DBSBuffer.Database",
            logger=myThread.logger,
            dbinterface=myThread.dbi)

        locationAction = self.bufferFactory(classname="DBSBufferFiles.AddLocation")
        locationAction.execute(siteName="se1.cern.ch")
        locationAction.execute(siteName="se1.fnal.gov")
        locationAction.execute(siteName="malpaquet")

        # Set heartbeat
        self.componentName = "JobSubmitter"
        self.heartbeatAPI = HeartbeatAPI(self.componentName)
        self.heartbeatAPI.registerComponent()

        # Set up a config cache
        configCache = ConfigCache(os.environ["COUCHURL"],
                                  couchDBName=self.couchDB)
        configCache.createUserGroup(groupname="testGroup", username="******")
        self.testDir = self.testInit.generateWorkDir()

        psetPath = os.path.join(self.testDir, "PSet.txt")
        # Context manager closes the handle even if the write fails
        # (the original open/write/close leaked the handle on error).
        with open(psetPath, "w") as f:
            f.write(self.configString)

        configCache.addConfig(newConfig=psetPath, psetHash=None)
        configCache.save()
        self.configURL = "%s;;%s;;%s" % (os.environ["COUCHURL"],
                                         self.couchDB,
                                         configCache.getCouchID())
        return

    def tearDown(self):
        """
        _tearDown_

        tearDown function for unittest
        """
        self.testInit.clearDatabase(modules=["WMComponent.DBS3Buffer",
                                             "WMCore.Agent.Database"])

    def createConfig(self):
        """
        _createConfig_

        This creates the actual config file used by the component
        """
        config = Configuration()

        # First the general stuff
        config.section_("General")
        config.General.workDir = os.getenv("TESTDIR", os.getcwd())
        config.section_("Agent")
        config.Agent.componentName = "DBSUpload"
        config.Agent.useHeartbeat = False

        # Now the CoreDatabase information
        # This should be the dialect, dburl, etc
        config.section_("CoreDatabase")
        config.CoreDatabase.connectUrl = os.getenv("DATABASE")
        config.CoreDatabase.socket = os.getenv("DBSOCK")

        config.component_("DBSUpload")
        config.DBSUpload.pollInterval = 10
        config.DBSUpload.logLevel = "ERROR"
        config.DBSUpload.maxThreads = 1
        config.DBSUpload.namespace = "WMComponent.DBSUpload.DBSUpload"
        config.DBSUpload.componentDir = os.path.join(os.getcwd(), "Components")
        config.DBSUpload.workerThreads = 4

        config.section_("DBSInterface")
        config.DBSInterface.globalDBSUrl = "http://vocms09.cern.ch:8880/cms_dbs_int_local_xx_writer/servlet/DBSServlet"
        config.DBSInterface.globalDBSVersion = "DBS_2_0_9"
        config.DBSInterface.DBSUrl = "http://vocms09.cern.ch:8880/cms_dbs_int_local_yy_writer/servlet/DBSServlet"
        config.DBSInterface.DBSVersion = "DBS_2_0_9"
        config.DBSInterface.DBSBlockMaxFiles = 10
        config.DBSInterface.DBSBlockMaxSize = 9999999999
        config.DBSInterface.DBSBlockMaxTime = 10000
        config.DBSInterface.MaxFilesToCommit = 10

        # addition for Alerts messaging framework, work (alerts) and control
        # channel addresses to which the component will be sending alerts
        # these are destination addresses where AlertProcessor:Receiver listens
        config.section_("Alert")
        config.Alert.address = "tcp://127.0.0.1:5557"
        config.Alert.controlAddr = "tcp://127.0.0.1:5559"
        # configure threshold of DBS upload queue size alert threshold
        # reference: trac ticket #1628
        config.DBSUpload.alertUploadQueueSize = 2000

        return config

    def getFiles(self, name, tier, nFiles=12, site="malpaquet"):
        """
        Create some quick dummy test files
        """
        files = []

        for f in range(0, nFiles):
            testFile = DBSBufferFile(lfn="%s-%s-%i" % (name, site, f),
                                     size=1024, events=20,
                                     checksums={"cksum": 1})
            testFile.setAlgorithm(appName=name, appVer="CMSSW_3_1_1",
                                  appFam="RECO", psetHash="GIBBERISH",
                                  configContent=self.configURL)
            testFile.setDatasetPath("/%s/%s/%s" % (name, name, tier))
            testFile.addRun(Run(1, *[f]))
            testFile.setGlobalTag("aGlobalTag")
            testFile.create()
            testFile.setLocation(site)
            files.append(testFile)

        # One child file that lists all the above as parents.
        testFileChild = DBSBufferFile(lfn="%s-%s-child" % (name, site),
                                      size=1024, events=10,
                                      checksums={"cksum": 1})
        testFileChild.setAlgorithm(appName=name, appVer="CMSSW_3_1_1",
                                   appFam="RECO", psetHash="GIBBERISH",
                                   configContent=self.configURL)
        testFileChild.setDatasetPath("/%s/%s_2/RECO" % (name, name))
        testFileChild.addRun(Run(1, *[45]))
        testFileChild.setGlobalTag("aGlobalTag")
        testFileChild.create()
        testFileChild.setLocation(site)

        testFileChild.addParents([x["lfn"] for x in files])

        return files

    @attr("integration")
    def testA_basicUploadTest(self):
        """
        _basicUploadTest_

        Do everything simply once
        Create dataset, algo, files, blocks,
        upload them,
        mark as done, finish them, migrate them
        Also check the timeout
        """
        myThread = threading.currentThread()
        config = self.createConfig()
        config.DBSInterface.DBSBlockMaxTime = 3
        config.DBSUpload.pollInterval = 4

        name = "ThisIsATest_%s" % (makeUUID())
        tier = "RECO"
        nFiles = 12
        files = self.getFiles(name=name, tier=tier, nFiles=nFiles)
        datasetPath = "/%s/%s/%s" % (name, name, tier)

        # Load components that are necessary to check status
        factory = WMFactory("dbsUpload", "WMComponent.DBSUpload.Database.Interface")
        dbinterface = factory.loadObject("UploadToDBS")

        dbsInterface = DBSInterface(config=config)
        localAPI = dbsInterface.getAPIRef()
        globeAPI = dbsInterface.getAPIRef(globalRef=True)

        # In the first round we should create blocks for the first dataset.
        # The child dataset should not be handled until the parent is uploaded.
        testDBSUpload = DBSUploadPoller(config=config)
        testDBSUpload.algorithm()

        # First, see if there are any blocks
        # One in DBS, one not in DBS
        result = myThread.dbi.processData("SELECT status FROM dbsbuffer_block")[0].fetchall()
        self.assertEqual(len(result), 2)
        self.assertEqual(result, [("InGlobalDBS",), ("Open",)])

        # Check to see if datasets and algos are in local DBS
        result = listAlgorithms(apiRef=localAPI, patternExe=name)
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0]["ExecutableName"], name)
        result = listPrimaryDatasets(apiRef=localAPI, match=name)
        self.assertEqual(result, [name])
        result = listProcessedDatasets(apiRef=localAPI, primary=name, dataTier="*")

        # Then check and see that the closed block made it into local DBS
        affectedBlocks = listBlocks(apiRef=localAPI, datasetPath=datasetPath)
        if affectedBlocks[0]["OpenForWriting"] == "0":
            self.assertEqual(affectedBlocks[1]["OpenForWriting"], "1")
            self.assertEqual(affectedBlocks[0]["NumberOfFiles"], 10)
            self.assertEqual(affectedBlocks[1]["NumberOfFiles"], 2)
        else:
            self.assertEqual(affectedBlocks[0]["OpenForWriting"], "1")
            self.assertEqual(affectedBlocks[1]["NumberOfFiles"], 10)
            self.assertEqual(affectedBlocks[0]["NumberOfFiles"], 2)

        # Check to make sure all the files are in local
        result = listDatasetFiles(apiRef=localAPI, datasetPath=datasetPath)
        fileLFNs = [x["lfn"] for x in files]
        for lfn in fileLFNs:
            self.assertTrue(lfn in result)

        # Make sure the child files aren't there
        # (fixed: `except Exception, ex` is Python-2-only syntax)
        flag = False
        try:
            listDatasetFiles(apiRef=localAPI,
                             datasetPath="/%s/%s_2/%s" % (name, name, tier))
        except Exception:
            flag = True
        self.assertTrue(flag)

        # There should be one blocks in global
        # It should have ten files and be closed
        result = listBlocks(apiRef=globeAPI, datasetPath=datasetPath)
        self.assertEqual(len(result), 1)
        for block in result:
            self.assertEqual(block["OpenForWriting"], "0")
            self.assertTrue(block["NumberOfFiles"] in [2, 10])

        # Okay, deep breath.  First round done
        # In the second round, the second block of the parent fileset
        # should transfer.
        # Make sure that the timeout functions work
        time.sleep(10)
        testDBSUpload.algorithm()

        result = myThread.dbi.processData("SELECT status FROM dbsbuffer_block")[0].fetchall()
        self.assertEqual(len(result), 2)
        self.assertEqual(result, [("InGlobalDBS",), ("InGlobalDBS",)])

        # Check to make sure all the files are in global
        result = listDatasetFiles(apiRef=globeAPI, datasetPath=datasetPath)
        for lfn in fileLFNs:
            self.assertTrue(lfn in result)

        # Make sure the child files aren't there
        flag = False
        try:
            listDatasetFiles(apiRef=localAPI,
                             datasetPath="/%s/%s_2/%s" % (name, name, tier))
        except Exception:
            flag = True
        # NOTE(review): the original ended with this bare `flag = True`;
        # the assertion below completes the pattern used identically above.
        self.assertTrue(flag)
class DBSUploadTest(unittest.TestCase):
    """
    _DBSUploadTest_

    Integration TestCase for the DBSUpload component: exercises block
    creation, closing, and migration from the local DBS instance to the
    global one via DBSUploadPoller.algorithm().
    """
    # Maximum number of messages (retained from the original test harness).
    _maxMessage = 10

    def setUp(self):
        """
        _setUp_

        Set up the database schema, a CouchDB config cache, DAO factories
        and a few file locations used by the tests.
        """
        # Set constants
        self.couchDB = "config_test"
        self.configURL = "RANDOM;;URL;;NAME"
        self.configString = "This is a random string"

        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        self.testInit.setSchema(
            customModules=["WMComponent.DBS3Buffer", 'WMCore.Agent.Database'],
            useDefault=False)
        self.testInit.setupCouch(self.couchDB, "GroupUser", "ConfigCache")

        myThread = threading.currentThread()
        # DAO factories for the (old) DBSBuffer and (new) DBS3Buffer schemas.
        self.bufferFactory = DAOFactory(
            package="WMComponent.DBSBuffer.Database",
            logger=myThread.logger,
            dbinterface=myThread.dbi)
        self.buffer3Factory = DAOFactory(package="WMComponent.DBS3Buffer",
                                         logger=myThread.logger,
                                         dbinterface=myThread.dbi)

        # Register the storage elements the dummy files will be placed at.
        locationAction = self.bufferFactory(
            classname="DBSBufferFiles.AddLocation")
        locationAction.execute(siteName="se1.cern.ch")
        locationAction.execute(siteName="se1.fnal.gov")
        locationAction.execute(siteName="malpaquet")

        # Set heartbeat
        # NOTE(review): 'JobSubmitter' looks copy-pasted from the
        # JobSubmitter test; 'DBSUpload' would seem more appropriate here —
        # confirm before changing, the heartbeat rows key off this name.
        self.componentName = 'JobSubmitter'
        self.heartbeatAPI = HeartbeatAPI(self.componentName)
        self.heartbeatAPI.registerComponent()

        # Set up a config cache
        configCache = ConfigCache(os.environ["COUCHURL"],
                                  couchDBName=self.couchDB)
        configCache.createUserGroup(groupname="testGroup", username='******')
        self.testDir = self.testInit.generateWorkDir()

        # Store a dummy PSet in the config cache; its document ID is
        # embedded in self.configURL below.
        psetPath = os.path.join(self.testDir, "PSet.txt")
        f = open(psetPath, 'w')
        f.write(self.configString)
        f.close()

        configCache.addConfig(newConfig=psetPath, psetHash=None)
        configCache.save()
        self.configURL = "%s;;%s;;%s" % (os.environ["COUCHURL"],
                                         self.couchDB,
                                         configCache.getCouchID())
        return

    def tearDown(self):
        """
        _tearDown_

        Drop the database schema, the couch databases and the work dir.
        """
        self.testInit.clearDatabase()
        self.testInit.tearDownCouch()
        self.testInit.delWorkDir()
        return

    def createConfig(self):
        """
        _createConfig_

        Build and return the Configuration object used by the DBSUpload
        component under test (component, DBS endpoints and alert settings).
        """
        config = Configuration()

        # First the general stuff
        config.section_("General")
        config.General.workDir = os.getenv("TESTDIR", os.getcwd())

        config.section_("Agent")
        config.Agent.componentName = 'DBSUpload'
        config.Agent.useHeartbeat = False

        # Now the CoreDatabase information
        # This should be the dialect, dburl, etc
        config.section_("CoreDatabase")
        config.CoreDatabase.connectUrl = os.getenv("DATABASE")
        config.CoreDatabase.socket = os.getenv("DBSOCK")

        config.component_("DBSUpload")
        config.DBSUpload.pollInterval = 10
        config.DBSUpload.logLevel = 'ERROR'
        config.DBSUpload.maxThreads = 1
        config.DBSUpload.namespace = 'WMComponent.DBSUpload.DBSUpload'
        config.DBSUpload.componentDir = os.path.join(os.getcwd(), 'Components')
        config.DBSUpload.workerThreads = 4

        # Local ("DBSUrl") and global DBS endpoints used by the poller.
        config.section_("DBSInterface")
        config.DBSInterface.globalDBSUrl = 'http://vocms09.cern.ch:8880/cms_dbs_int_local_xx_writer/servlet/DBSServlet'
        config.DBSInterface.globalDBSVersion = 'DBS_2_0_9'
        config.DBSInterface.DBSUrl = 'http://vocms09.cern.ch:8880/cms_dbs_int_local_yy_writer/servlet/DBSServlet'
        config.DBSInterface.DBSVersion = 'DBS_2_0_9'
        config.DBSInterface.MaxFilesToCommit = 10

        # addition for Alerts messaging framework, work (alerts) and control
        # channel addresses to which the component will be sending alerts
        # these are destination addresses where AlertProcessor:Receiver listens
        config.section_("Alert")
        config.Alert.address = "tcp://127.0.0.1:5557"
        config.Alert.controlAddr = "tcp://127.0.0.1:5559"
        # configure threshold of DBS upload queue size alert threshold
        # reference: trac ticket #1628
        config.DBSUpload.alertUploadQueueSize = 2000

        return config

    def injectWorkflow(self, workflowName='TestWorkflow',
                       taskPath='/TestWorkflow/ReadingEvents',
                       MaxWaitTime=10000, MaxFiles=10,
                       MaxEvents=250000000, MaxSize=9999999999):
        """
        _injectWorklow_

        Inject a dummy worklow in DBSBuffer for testing,
        returns the workflow ID.  The Max* arguments become the workflow's
        block-closing thresholds (wait time, files, events, size).
        """
        injectWorkflowDAO = self.buffer3Factory("InsertWorkflow")
        workflowID = injectWorkflowDAO.execute(workflowName, taskPath,
                                               MaxWaitTime, MaxFiles,
                                               MaxEvents, MaxSize)
        return workflowID

    def getFiles(self, name, tier, nFiles=12, site="malpaquet",
                 workflowName=None, taskPath=None, noChild=False):
        """
        Create some quick dummy test files.

        Creates nFiles parent files in dataset /name/name/tier plus,
        unless noChild is set, one child file in /name/name_2/RECO whose
        parents are all of the parent files.  Returns the parent files.
        """
        if workflowName is not None and taskPath is not None:
            workflowId = self.injectWorkflow(workflowName=workflowName,
                                             taskPath=taskPath)
        else:
            workflowId = self.injectWorkflow()

        files = []

        for f in range(0, nFiles):
            testFile = DBSBufferFile(lfn='%s-%s-%i' % (name, site, f),
                                     size=1024,
                                     events=20,
                                     checksums={'cksum': 1},
                                     workflowId=workflowId)
            testFile.setAlgorithm(appName=name,
                                  appVer="CMSSW_3_1_1",
                                  appFam="RECO",
                                  psetHash="GIBBERISH",
                                  configContent=self.configURL)
            testFile.setDatasetPath("/%s/%s/%s" % (name, name, tier))
            # One run per file; the run number doubles as the lumi list.
            testFile.addRun(Run(1, *[f]))
            testFile.setGlobalTag("aGlobalTag")
            testFile.create()
            testFile.setLocation(site)
            files.append(testFile)

        if not noChild:
            testFileChild = DBSBufferFile(lfn='%s-%s-child' % (name, site),
                                          size=1024,
                                          events=10,
                                          checksums={'cksum': 1},
                                          workflowId=workflowId)
            testFileChild.setAlgorithm(appName=name,
                                       appVer="CMSSW_3_1_1",
                                       appFam="RECO",
                                       psetHash="GIBBERISH",
                                       configContent=self.configURL)
            testFileChild.setDatasetPath("/%s/%s_2/RECO" % (name, name))
            testFileChild.addRun(Run(1, *[45]))
            testFileChild.setGlobalTag("aGlobalTag")
            testFileChild.create()
            testFileChild.setLocation(site)
            # The child depends on every parent file created above.
            testFileChild.addParents([x['lfn'] for x in files])

        return files

    @attr('integration')
    def testA_basicUploadTest(self):
        """
        _basicUploadTest_

        Do everything simply once
        Create dataset, algo, files, blocks,
        upload them,
        mark as done, finish them, migrate them
        Also check the timeout
        """
        myThread = threading.currentThread()
        config = self.createConfig()
        # Short block-close timeout so the second poll cycle migrates the
        # remaining open block.
        self.injectWorkflow(MaxWaitTime=3)
        config.DBSUpload.pollInterval = 4

        name = "ThisIsATest_%s" % (makeUUID())
        tier = "RECO"
        nFiles = 12
        files = self.getFiles(name=name, tier=tier, nFiles=nFiles)
        datasetPath = '/%s/%s/%s' % (name, name, tier)

        # Load components that are necessary to check status
        factory = WMFactory("dbsUpload",
                            "WMComponent.DBSUpload.Database.Interface")
        dbinterface = factory.loadObject("UploadToDBS")

        dbsInterface = DBSInterface(config=config)
        localAPI = dbsInterface.getAPIRef()
        globeAPI = dbsInterface.getAPIRef(globalRef=True)

        # In the first round we should create blocks for the first dataset
        # The child dataset should not be handled until the parent is uploaded
        testDBSUpload = DBSUploadPoller(config=config)
        testDBSUpload.algorithm()

        # First, see if there are any blocks
        # One in DBS, one not in DBS
        result = myThread.dbi.processData(
            "SELECT status FROM dbsbuffer_block")[0].fetchall()
        self.assertEqual(len(result), 2)
        self.assertEqual(result, [('InGlobalDBS', ), ('Open', )])

        # Check to see if datasets and algos are in local DBS
        result = listAlgorithms(apiRef=localAPI, patternExe=name)
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0]['ExecutableName'], name)
        result = listPrimaryDatasets(apiRef=localAPI, match=name)
        self.assertEqual(result, [name])
        result = listProcessedDatasets(apiRef=localAPI, primary=name,
                                       dataTier="*")

        # Then check and see that the closed block made it into local DBS
        # (block order from DBS is not deterministic, hence the branch).
        affectedBlocks = listBlocks(apiRef=localAPI, datasetPath=datasetPath)
        if affectedBlocks[0]['OpenForWriting'] == '0':
            self.assertEqual(affectedBlocks[1]['OpenForWriting'], '1')
            self.assertEqual(affectedBlocks[0]['NumberOfFiles'], 10)
            self.assertEqual(affectedBlocks[1]['NumberOfFiles'], 2)
        else:
            self.assertEqual(affectedBlocks[0]['OpenForWriting'], '1')
            self.assertEqual(affectedBlocks[1]['NumberOfFiles'], 10)
            self.assertEqual(affectedBlocks[0]['NumberOfFiles'], 2)

        # Check to make sure all the files are in local
        result = listDatasetFiles(apiRef=localAPI, datasetPath=datasetPath)
        fileLFNs = [x['lfn'] for x in files]
        for lfn in fileLFNs:
            self.assertTrue(lfn in result)

        # Make sure the child files aren't there
        flag = False
        try:
            listDatasetFiles(apiRef=localAPI,
                             datasetPath='/%s/%s_2/%s' % (name, name, tier))
        except Exception as ex:
            flag = True
        self.assertTrue(flag)

        # There should be one blocks in global
        # It should have ten files and be closed
        result = listBlocks(apiRef=globeAPI, datasetPath=datasetPath)
        self.assertEqual(len(result), 1)
        for block in result:
            self.assertEqual(block['OpenForWriting'], '0')
            self.assertTrue(block['NumberOfFiles'] in [2, 10])

        # Okay, deep breath. First round done
        # In the second round, the second block of the parent fileset should transfer
        # Make sure that the timeout functions work
        time.sleep(10)
        testDBSUpload.algorithm()
        result = myThread.dbi.processData(
            "SELECT status FROM dbsbuffer_block")[0].fetchall()
        self.assertEqual(len(result), 2)
        self.assertEqual(result, [('InGlobalDBS', ), ('InGlobalDBS', )])

        # Check to make sure all the files are in global
        result = listDatasetFiles(apiRef=globeAPI, datasetPath=datasetPath)
        for lfn in fileLFNs:
            self.assertTrue(lfn in result)

        # Make sure the child files aren't there
        flag = False
        try:
            listDatasetFiles(apiRef=localAPI,
                             datasetPath='/%s/%s_2/%s' % (name, name, tier))
        except Exception as ex:
            flag = True
        self.assertTrue(flag)

        # Third round
        # Both of the parent blocks should have transferred
        # So the child block should now transfer
        testDBSUpload.algorithm()
        result = myThread.dbi.processData(
            "SELECT status FROM dbsbuffer_block")[0].fetchall()
        self.assertEqual(result, [('InGlobalDBS', ), ('InGlobalDBS', ),
                                  ('Open', )])

        # The child dataset must now be visible in local DBS.
        flag = False
        try:
            result = listDatasetFiles(apiRef=localAPI,
                                      datasetPath='/%s/%s_2/%s' %
                                      (name, name, tier))
        except Exception as ex:
            flag = True
        self.assertFalse(flag)
        self.assertEqual(len(result), 1)

        return

    @attr('integration')
    def testB_AlgoMigration(self):
        """
        _AlgoMigration_

        Test our ability to migrate multiple algos to global

        Do this by creating, mid-poll, two separate batches of files
        One with the same dataset but a different algo
        One with the same algo, but a different dataset
        See that they both get to global
        """
        #raise nose.SkipTest
        myThread = threading.currentThread()
        config = self.createConfig()
        self.injectWorkflow(MaxWaitTime=20)
        name = "ThisIsATest_%s" % (makeUUID())
        tier = "RECO"
        nFiles = 12
        files = self.getFiles(name=name, tier=tier, nFiles=nFiles)
        datasetPath = '/%s/%s/%s' % (name, name, tier)

        # Load components that are necessary to check status
        factory = WMFactory("dbsUpload",
                            "WMComponent.DBSUpload.Database.Interface")
        dbinterface = factory.loadObject("UploadToDBS")

        dbsInterface = DBSInterface(config=config)
        localAPI = dbsInterface.getAPIRef()
        globeAPI = dbsInterface.getAPIRef(globalRef=True)

        testDBSUpload = DBSUploadPoller(config=config)
        testDBSUpload.algorithm()

        # There should now be one block
        result = listBlocks(apiRef=globeAPI, datasetPath=datasetPath)
        self.assertEqual(len(result), 1)

        # Okay, by now, the first migration should have gone through.
        # Now create a second batch of files with the same dataset
        # but a different algo.
        for i in range(0, nFiles):
            testFile = DBSBufferFile(lfn='%s-batch2-%i' % (name, i),
                                     size=1024,
                                     events=20,
                                     checksums={'cksum': 1},
                                     locations="malpaquet")
            testFile.setAlgorithm(appName="cmsRun",
                                  appVer="CMSSW_3_1_1",
                                  appFam=tier,
                                  psetHash="GIBBERISH_PART2",
                                  configContent=self.configURL)
            testFile.setDatasetPath(datasetPath)
            testFile.addRun(Run(1, *[46]))
            testFile.create()

        # Have to do things twice to get parents
        testDBSUpload.algorithm()
        testDBSUpload.algorithm()

        # There should now be two blocks
        result = listBlocks(apiRef=globeAPI, datasetPath=datasetPath)
        self.assertEqual(len(result), 2)

        # Now create another batch of files with the original algo
        # But in a different dataset
        for i in range(0, nFiles):
            testFile = DBSBufferFile(lfn='%s-batch3-%i' % (name, i),
                                     size=1024,
                                     events=20,
                                     checksums={'cksum': 1},
                                     locations="malpaquet")
            testFile.setAlgorithm(appName=name,
                                  appVer="CMSSW_3_1_1",
                                  appFam=tier,
                                  psetHash="GIBBERISH",
                                  configContent=self.configURL)
            testFile.setDatasetPath('/%s/%s_3/%s' % (name, name, tier))
            testFile.addRun(Run(1, *[46]))
            testFile.create()

        # Do it twice for parentage.
        testDBSUpload.algorithm()
        testDBSUpload.algorithm()

        # There should now be one block
        result = listBlocks(apiRef=globeAPI,
                            datasetPath='/%s/%s_3/%s' % (name, name, tier))
        self.assertEqual(len(result), 1)

        # Well, all the blocks got there, so we're done
        return

    @attr('integration')
    def testC_FailTest(self):
        """
        _FailTest_

        THIS TEST IS DANGEROUS!
        Figure out what happens when we trigger rollbacks
        """
        myThread = threading.currentThread()
        config = self.createConfig()
        config.DBSUpload.abortStepTwo = True

        originalOut = sys.stdout
        originalErr = sys.stderr

        dbsInterface = DBSInterface(config=config)
        localAPI = dbsInterface.getAPIRef()
        globeAPI = dbsInterface.getAPIRef(globalRef=True)

        name = "ThisIsATest_%s" % (makeUUID())
        tier = "RECO"
        nFiles = 12
        files = self.getFiles(name=name, tier=tier, nFiles=nFiles)
        datasetPath = '/%s/%s/%s' % (name, name, tier)

        testDBSUpload = DBSUploadPoller(config=config)
        # The abort flag makes algorithm() raise; swallow it deliberately,
        # the interesting part is the database state afterwards.
        try:
            testDBSUpload.algorithm()
        except Exception as ex:
            pass

        # Aborting in step two should result in no results
        result = myThread.dbi.processData(
            "SELECT status FROM dbsbuffer_block")[0].fetchall()
        self.assertEqual(len(result), 0)

        config.DBSUpload.abortStepTwo = False
        config.DBSUpload.abortStepThree = True
        testDBSUpload = DBSUploadPoller(config=config)

        try:
            testDBSUpload.algorithm()
        except Exception as ex:
            pass

        # Step-three abort leaves blocks stuck in Pending/Open and files
        # still READY for upload.
        result = myThread.dbi.processData(
            "SELECT status FROM dbsbuffer_block")[0].fetchall()
        self.assertEqual(result, [('Pending', ), ('Open', )])
        result = myThread.dbi.processData(
            "SELECT status FROM dbsbuffer_file WHERE dataset_algo = 1"
        )[0].fetchall()
        for res in result:
            self.assertEqual(res[0], 'READY')

        config.DBSUpload.abortStepThree = False
        self.injectWorkflow(MaxWaitTime=300)
        testDBSUpload = DBSUploadPoller(config=config)
        testDBSUpload.algorithm()

        # After this, one block should have been uploaded, one should still be open
        # This is the result of the pending block updating, and the open block staying open
        result = myThread.dbi.processData(
            "SELECT status, id FROM dbsbuffer_block")[0].fetchall()
        self.assertEqual(result, [('InGlobalDBS', 3), ('Open', 4)])

        # Check that one block got there
        result = listBlocks(apiRef=globeAPI, datasetPath=datasetPath)
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0]['NumberOfFiles'], 10)
        self.assertEqual(result[0]['NumberOfEvents'], 200)
        self.assertEqual(result[0]['BlockSize'], 10240)

        # Check that ten files got there
        result = listDatasetFiles(apiRef=globeAPI, datasetPath=datasetPath)
        self.assertEqual(len(result), 10)

        # Force the remaining block to time out and close.
        myThread.dbi.processData(
            "UPDATE dbsbuffer_workflow SET block_close_max_wait_time = 1")
        testDBSUpload = DBSUploadPoller(config=config)
        time.sleep(3)
        testDBSUpload.algorithm()

        result = myThread.dbi.processData(
            "SELECT status FROM dbsbuffer_block")[0].fetchall()
        self.assertEqual(result, [('InGlobalDBS', ), ('InGlobalDBS', )])

        result = listDatasetFiles(apiRef=globeAPI, datasetPath=datasetPath)
        self.assertEqual(len(result), 12)

        fileLFNs = [x['lfn'] for x in files]
        for lfn in fileLFNs:
            self.assertTrue(lfn in result)

        # Next cycle opens the child block ...
        testDBSUpload.algorithm()
        result = myThread.dbi.processData(
            "SELECT status FROM dbsbuffer_block")[0].fetchall()
        self.assertEqual(result, [('InGlobalDBS', ), ('InGlobalDBS', ),
                                  ('Open', )])

        # ... and after the timeout it migrates to global as well.
        time.sleep(5)
        testDBSUpload.algorithm()
        time.sleep(2)
        result = myThread.dbi.processData(
            "SELECT status FROM dbsbuffer_block")[0].fetchall()
        self.assertEqual(result, [('InGlobalDBS', ), ('InGlobalDBS', ),
                                  ('InGlobalDBS', )])
        result = listDatasetFiles(apiRef=globeAPI,
                                  datasetPath='/%s/%s_2/%s' %
                                  (name, name, tier))
        self.assertEqual(len(result), 1)

        sys.stdout = originalOut
        sys.stderr = originalErr

        return

    @attr('integration')
    def testD_Profile(self):
        """
        _Profile_

        Profile with cProfile and time various pieces
        """
        # NOTE(review): this early return deliberately disables the test;
        # the profiling code below is dead until it is removed.
        return
        config = self.createConfig()

        name = "ThisIsATest_%s" % (makeUUID())
        tier = "RECO"
        nFiles = 500
        files = self.getFiles(name=name, tier=tier, nFiles=nFiles)
        datasetPath = '/%s/%s/%s' % (name, name, tier)

        testDBSUpload = DBSUploadPoller(config=config)
        cProfile.runctx("testDBSUpload.algorithm()",
                        globals(),
                        locals(),
                        filename="testStats.stat")

        p = pstats.Stats('testStats.stat')
        p.sort_stats('cumulative')
        p.print_stats(0.2)

        return

    @attr('integration')
    def testE_NoMigration(self):
        """
        _NoMigration_

        Test the DBSUpload system with no global migration
        """
        myThread = threading.currentThread()
        config = self.createConfig()
        self.injectWorkflow(MaxWaitTime=3)
        # With migration disabled files should go straight to GLOBAL state.
        config.DBSInterface.doGlobalMigration = False
        config.DBSUpload.pollInterval = 4

        name = "ThisIsATest_%s" % (makeUUID())
        tier = "RECO"
        nFiles = 12
        files = self.getFiles(name=name, tier=tier, nFiles=nFiles)
        datasetPath = '/%s/%s/%s' % (name, name, tier)

        # Load components that are necessary to check status
        factory = WMFactory("dbsUpload",
                            "WMComponent.DBSUpload.Database.Interface")
        dbinterface = factory.loadObject("UploadToDBS")

        dbsInterface = DBSInterface(config=config)
        localAPI = dbsInterface.getAPIRef()
        globeAPI = dbsInterface.getAPIRef(globalRef=True)

        # In the first round we should create blocks for the first dataset
        # The child dataset should not be handled until the parent is uploaded
        testDBSUpload = DBSUploadPoller(config=config)
        testDBSUpload.algorithm()

        # First, see if there are any blocks
        # One in DBS, one not in DBS
        result = myThread.dbi.processData(
            "SELECT status FROM dbsbuffer_block")[0].fetchall()
        self.assertEqual(len(result), 2)
        self.assertEqual(result, [('InGlobalDBS', ), ('Open', )])

        result = myThread.dbi.processData(
            "SELECT status FROM dbsbuffer_file WHERE dataset_algo = 1"
        )[0].fetchall()
        for r in result:
            self.assertEqual(r[0], 'GLOBAL')

        return

    @attr('integration')
    def testF_DBSUploadQueueSizeCheckForAlerts(self):
        """
        Test will not trigger a real alert being sent unless doing some
        mocking of the methods used during DBSUploadPoller.algorithm() ->
        DBSUploadPoller.uploadBlocks() method.
        As done here, it probably can't be deterministic, yet the feature
        shall be checked.
        """
        sizeLevelToTest = 1
        myThread = threading.currentThread()
        config = self.createConfig()
        # threshold / value to check
        config.DBSUpload.alertUploadQueueSize = sizeLevelToTest

        # without this uploadBlocks method returns immediately
        name = "ThisIsATest_%s" % (makeUUID())
        tier = "RECO"
        # One more file than the threshold, so the queue-size alert fires.
        nFiles = sizeLevelToTest + 1
        files = self.getFiles(name=name, tier=tier, nFiles=nFiles)
        datasetPath = '/%s/%s/%s' % (name, name, tier)

        # load components that are necessary to check status
        # (this seems necessary, else some previous tests started failing)
        factory = WMFactory("dbsUpload",
                            "WMComponent.DBSUpload.Database.Interface")
        dbinterface = factory.loadObject("UploadToDBS")
        dbsInterface = DBSInterface(config=config)
        localAPI = dbsInterface.getAPIRef()
        globeAPI = dbsInterface.getAPIRef(globalRef=True)
        testDBSUpload = DBSUploadPoller(config)
        # this is finally where the action (alert) should be triggered from
        testDBSUpload.algorithm()

        return

    def testG_closeSettingsPerWorkflow(self):
        """
        _closeSettingsPerWorkflow_

        Test our ability to close blocks depending on settings
        configured for individual workflows.
        This unit test that doesn't require an actual DBS instance to run.
        """
        # NOTE(review): this assert disables the whole test on purpose;
        # everything below is unreachable until a DBS2 mock exists.
        self.assertTrue(
            False, 'This unit test disabled since we do not have DBS2 mock')
        myThread = threading.currentThread()
        config = self.createConfig()
        config.DBSInterface.doGlobalMigration = False

        # First test, limit by number of files and timeout without new files
        name = "ThisIsATest_%s" % (makeUUID())
        tier = "RECO"
        nFiles = 12
        self.injectWorkflow(workflowName=name,
                            taskPath='/%s/Test' % name,
                            MaxFiles=5)
        self.getFiles(name=name, tier=tier, nFiles=nFiles,
                      workflowName=name, taskPath='/%s/Test' % name)

        # Load components that are necessary to check status
        factory = WMFactory("dbsUpload",
                            "WMComponent.DBSUpload.Database.Interface")
        dbinterface = factory.loadObject("UploadToDBS")

        # Change the DBSUploadPoller imports on runtime
        from WMComponent.DBSUpload import DBSUploadPoller as MockDBSUploadPoller
        #MockDBSUploadPoller.DBSInterface = DBS2Interface

        # In the first round we should create blocks for the first dataset
        # The child dataset should not be handled until the parent is uploaded
        # First run creates 3 blocks, 2 are closed immediately and one is open
        testDBSUpload = MockDBSUploadPoller.DBSUploadPoller(config=config)
        testDBSUpload.algorithm()
        openBlocks = dbinterface.findOpenBlocks()
        closedBlocks = myThread.dbi.processData(
            "SELECT id FROM dbsbuffer_block WHERE status = 'InGlobalDBS'"
        )[0].fetchall()
        self.assertEqual(len(openBlocks), 1)
        self.assertEqual(len(closedBlocks), 2)
        globalFiles = myThread.dbi.processData(
            "SELECT id FROM dbsbuffer_file WHERE status = 'GLOBAL'"
        )[0].fetchall()
        notUploadedFiles = myThread.dbi.processData(
            "SELECT * FROM dbsbuffer_file WHERE status = 'NOTUPLOADED'"
        )[0].fetchall()
        self.assertEqual(len(globalFiles), 12)
        self.assertEqual(len(notUploadedFiles), 1)
        self.assertTrue('child' in notUploadedFiles[0][1])
        # Second cycle uploads the child file as well.
        testDBSUpload.algorithm()
        openBlocks = myThread.dbi.processData(
            "SELECT id FROM dbsbuffer_block WHERE status != 'InGlobalDBS'"
        )[0].fetchall()
        closedBlocks = myThread.dbi.processData(
            "SELECT id FROM dbsbuffer_block WHERE status = 'InGlobalDBS'"
        )[0].fetchall()
        self.assertEqual(len(openBlocks), 2)
        self.assertEqual(len(closedBlocks), 2)
        globalFiles = myThread.dbi.processData(
            "SELECT id FROM dbsbuffer_file WHERE status = 'GLOBAL'"
        )[0].fetchall()
        notUploadedFiles = myThread.dbi.processData(
            "SELECT * FROM dbsbuffer_file WHERE status = 'NOTUPLOADED'"
        )[0].fetchall()
        self.assertEqual(len(globalFiles), 13)
        self.assertEqual(len(notUploadedFiles), 0)
        # Test the timeout feature to close blocks
        myThread.dbi.processData(
            "UPDATE dbsbuffer_workflow SET block_close_max_wait_time = 0")
        testDBSUpload.algorithm()
        openBlocks = myThread.dbi.processData(
            "SELECT id FROM dbsbuffer_block WHERE status != 'InGlobalDBS'"
        )[0].fetchall()
        closedBlocks = myThread.dbi.processData(
            "SELECT id FROM dbsbuffer_block WHERE status = 'InGlobalDBS'"
        )[0].fetchall()
        self.assertEqual(len(openBlocks), 0)
        self.assertEqual(len(closedBlocks), 4)
        # Check the information that DBS received
        dbsBlocks = testDBSUpload.dbsInterface.blocks
        for dbsBlockName in dbsBlocks:
            dbsBlock = dbsBlocks[dbsBlockName]
            self.assertEqual(dbsBlock['OpenForWriting'], '0')
            self.assertTrue(dbsBlock['nFiles'] in (1, 2, 5))
        # Second test, limit by number of events and timeout with new files
        name = "ThisIsATest_%s" % (makeUUID())
        nFiles = 50
        self.injectWorkflow(workflowName=name,
                            taskPath='/%s/Test' % name,
                            MaxFiles=45,
                            MaxEvents=800,
                            MaxWaitTime=10000)
        self.getFiles(name=name, tier=tier, nFiles=nFiles,
                      workflowName=name, taskPath='/%s/Test' % name)
        testDBSUpload.algorithm()
        testDBSUpload.algorithm()
        openBlocks = myThread.dbi.processData(
            "SELECT id FROM dbsbuffer_block WHERE status != 'InGlobalDBS'"
        )[0].fetchall()
        closedBlocks = myThread.dbi.processData(
            "SELECT id FROM dbsbuffer_block WHERE status = 'InGlobalDBS'"
        )[0].fetchall()
        self.assertEqual(len(openBlocks), 2)
        self.assertEqual(len(closedBlocks), 5)
        # Throw 20 new file
        # Reset the timer such that the blocks appear to have been created 10001 seconds ago
        creationTime = int(time.time() - 10001)
        myThread.dbi.processData(
            "UPDATE dbsbuffer_block SET create_time = %d WHERE status != 'InGlobalDBS'"
            % creationTime)
        self.getFiles(name=name + '2', tier=tier, nFiles=20,
                      workflowName=name, taskPath='/%s/Test' % name,
                      noChild=True)
        # Now a new block will have to be created as the last one timed out
        testDBSUpload.algorithm()
        openBlocks = myThread.dbi.processData(
            "SELECT id FROM dbsbuffer_block WHERE status != 'InGlobalDBS'"
        )[0].fetchall()
        closedBlocks = myThread.dbi.processData(
            "SELECT id FROM dbsbuffer_block WHERE status = 'InGlobalDBS'"
        )[0].fetchall()
        self.assertEqual(len(openBlocks), 1)
        self.assertEqual(len(closedBlocks), 7)
        dbsBlocks = testDBSUpload.dbsInterface.blocks
        for dbsBlockName in dbsBlocks:
            dbsBlock = dbsBlocks[dbsBlockName]
            if name in dbsBlockName:
                if dbsBlock['OpenForWriting'] == '1':
                    self.assertEqual(dbsBlock['nFiles'], 20)
                else:
                    self.assertTrue(dbsBlock['events'] in (10, 200, 800))
                    self.assertTrue(dbsBlock['nFiles'] in (1, 10, 40))
        # Last test, check limitation by size
        name = "ThisIsATest_%s" % (makeUUID())
        nFiles = 10
        self.injectWorkflow(workflowName=name,
                            taskPath='/%s/Test' % name,
                            MaxFiles=45,
                            MaxEvents=800,
                            MaxSize=2048)
        self.getFiles(name=name, tier=tier, nFiles=nFiles,
                      workflowName=name, taskPath='/%s/Test' % name)
        testDBSUpload.algorithm()
        dbsBlocks = testDBSUpload.dbsInterface.blocks
        for dbsBlockName in dbsBlocks:
            dbsBlock = dbsBlocks[dbsBlockName]
            if name in dbsBlockName:
                self.assertEqual(dbsBlock['events'], 40)
                self.assertEqual(dbsBlock['nFiles'], 2)
                self.assertEqual(dbsBlock['size'], 2048)
        return
def testAddComponent(self):
    """
    _testAddComponent_

    Register two components plus their worker threads and check that the
    heartbeat DAOs report the expected per-component and global rows.
    """
    # First component, with its own poll/timeout settings.
    firstComp = HeartbeatAPI("testComponent1", pollInterval=60, heartbeatTimeout=600)
    firstComp.registerComponent()
    # No worker thread registered yet, so there are no heartbeat rows.
    self.assertEqual(firstComp.getHeartbeatInfo(), [])

    firstComp.registerWorker("testWorker1")
    self.assertEqual(len(firstComp.getHeartbeatInfo()), 1)

    firstComp.registerWorker("testWorker2")
    self.assertEqual(len(firstComp.getHeartbeatInfo()), 2)

    # Second component: its own view is empty, the global view already
    # contains the first component's two workers.
    secondComp = HeartbeatAPI("testComponent2", pollInterval=30, heartbeatTimeout=300)
    secondComp.registerComponent()
    self.assertEqual(secondComp.getHeartbeatInfo(), [])
    self.assertEqual(len(secondComp.getAllHeartbeatInfo()), 2)

    secondComp.registerWorker("testWorker21")
    self.assertEqual(len(secondComp.getHeartbeatInfo()), 1)
    self.assertEqual(len(secondComp.getAllHeartbeatInfo()), 3)

    # Flag every worker as running; the global count is visible from both.
    firstComp.updateWorkerHeartbeat("testWorker1", "Running")
    firstComp.updateWorkerHeartbeat("testWorker2", "Running")
    secondComp.updateWorkerHeartbeat("testWorker21", "Running")
    self.assertEqual(len(firstComp.getAllHeartbeatInfo()), 3)
    self.assertEqual(len(secondComp.getAllHeartbeatInfo()), 3)

    firstRows = firstComp.getHeartbeatInfo()
    secondRows = secondComp.getHeartbeatInfo()
    self.assertEqual(len(firstRows), 2)
    self.assertEqual(len(secondRows), 1)

    # Field-by-field verification of the first component's two workers.
    firstExpected = {"name": ["testComponent1", "testComponent1"],
                     "worker_name": ["testWorker1", "testWorker2"],
                     "state": ["Running", "Running"],
                     "poll_interval": [60, 60],
                     "update_threshold": [600, 600]}
    for field, expected in firstExpected.items():
        self.assertItemsEqual([item[field] for item in firstRows], expected)

    # And of the second component's single worker.
    secondExpected = {"name": ["testComponent2"],
                      "worker_name": ["testWorker21"],
                      "state": ["Running"],
                      "poll_interval": [30],
                      "update_threshold": [300]}
    for field, expected in secondExpected.items():
        self.assertItemsEqual([item[field] for item in secondRows], expected)
def testHeartbeat(self): testComponent = HeartbeatAPI("testComponent") testComponent.registerComponent() self.assertEqual(testComponent.getHeartbeatInfo(), []) testComponent.updateWorkerHeartbeat("testWorker") result = testComponent.getHeartbeatInfo() self.assertEqual(len(result), 1) self.assertEqual(result[0]['worker_name'], "testWorker") time.sleep(1) testComponent.updateWorkerHeartbeat("testWorker2") result = testComponent.getHeartbeatInfo() self.assertEqual(len(result), 1) self.assertEqual(result[0]['worker_name'], "testWorker2") time.sleep(1) testComponent.updateWorkerHeartbeat("testWorker") result = testComponent.getHeartbeatInfo() self.assertEqual(len(result), 1) self.assertEqual(result[0]['worker_name'], "testWorker") testComponent = HeartbeatAPI("test2Component") testComponent.registerComponent() time.sleep(1) testComponent.updateWorkerHeartbeat("test2Worker") result = testComponent.getHeartbeatInfo() self.assertEqual(len(result), 2) self.assertEqual(result[0]['worker_name'], "testWorker") self.assertEqual(result[1]['worker_name'], "test2Worker") time.sleep(1) testComponent.updateWorkerHeartbeat("test2Worker2") result = testComponent.getHeartbeatInfo() self.assertEqual(len(result), 2) self.assertEqual(result[0]['worker_name'], "testWorker") self.assertEqual(result[1]['worker_name'], "test2Worker2") time.sleep(1) testComponent.updateWorkerHeartbeat("test2Worker") result = testComponent.getHeartbeatInfo() self.assertEqual(len(result), 2) self.assertEqual(result[0]['worker_name'], "testWorker") self.assertEqual(result[1]['worker_name'], "test2Worker") testComponent.updateWorkerError("test2Worker", "Error1") result = testComponent.getHeartbeatInfo() self.assertEqual(result[1]['error_message'], "Error1")
def testUpdateWorkers(self): """ _testUpdateWorkers_ Create a couple of components and workers and test the update methods """ comp1 = HeartbeatAPI("testComponent1", pollInterval=60, heartbeatTimeout=600) comp1.registerComponent() comp1.registerWorker("testWorker1") comp1.registerWorker("testWorker2") comp2 = HeartbeatAPI("testComponent2", pollInterval=30, heartbeatTimeout=300) comp2.registerComponent() comp2.registerWorker("testWorker21") comp1.updateWorkerCycle("testWorker1", 1.001, None) comp2.updateWorkerCycle("testWorker21", 1234.1, 100) hb1 = comp1.getHeartbeatInfo() hb2 = comp2.getHeartbeatInfo() for worker in hb1: if worker['worker_name'] == 'testWorker1': self.assertTrue(worker["cycle_time"] > 1.0) else: self.assertEqual(worker["cycle_time"], 0) self.assertItemsEqual([item["outcome"] for item in hb1], [None, None]) self.assertItemsEqual([item["error_message"] for item in hb1], [None, None]) self.assertEqual(round(hb2[0]["cycle_time"], 1), 1234.1) self.assertEqual(hb2[0]["outcome"], '100') self.assertEqual(hb2[0]["error_message"], None) # time to update workers with an error comp1.updateWorkerError("testWorker2", "BAD JOB!!!") hb1 = comp1.getHeartbeatInfo() for worker in hb1: if worker['worker_name'] == 'testWorker2': self.assertTrue(worker["last_error"] > int(time.time() - 10)) self.assertEqual(worker["state"], "Error") self.assertEqual(worker["error_message"], "BAD JOB!!!")
class JobCreatorTest(unittest.TestCase):
    """
    Integration tests for the JobCreator component.

    Exercises JobCreatorPoller against a real WMBS/ResourceControl schema and
    CouchDB instance; setUp builds the schema, locations and thresholds, and
    tearDown drops everything again.
    """
    # Sites registered as locations and in ResourceControl for every test.
    sites = ['T2_US_Florida', 'T2_US_UCSD', 'T2_TW_Taiwan', 'T1_CH_CERN']

    def setUp(self):
        """
        _setUp_

        Setup the database and logging connection. Try to create all of the
        WMBS tables. Also, create some dummy locations.
        """
        # NOTE(review): this assignment is shadowed by the identical one
        # below and is effectively unused.
        myThread = threading.currentThread()

        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        #self.tearDown()
        self.testInit.setSchema(customModules=[
            'WMCore.WMBS', 'WMCore.ResourceControl', 'WMCore.Agent.Database'
        ], useDefault=False)
        self.couchdbname = "jobcreator_t"
        self.testInit.setupCouch("%s/jobs" % self.couchdbname, "JobDump")
        self.testInit.setupCouch("%s/fwjrs" % self.couchdbname, "FWJRDump")
        self.configFile = EmulatorSetup.setupWMAgentConfig()

        myThread = threading.currentThread()
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)

        # Register every test site as a WMBS location.
        locationAction = self.daoFactory(classname="Locations.New")
        for site in self.sites:
            locationAction.execute(siteName=site, seName=site)

        # Create sites in ResourceControl with effectively unlimited slots
        # so thresholds never block job creation in these tests.
        resourceControl = ResourceControl()
        for site in self.sites:
            resourceControl.insertSite(siteName=site, seName=site, ceName=site)
            resourceControl.insertThreshold(siteName = site, taskType = 'Processing',
                                            maxSlots = 10000, pendingSlots = 10000)

        self.resourceControl = resourceControl

        self._setup = True
        self._teardown = False

        self.testDir = self.testInit.generateWorkDir()
        self.cwd = os.getcwd()

        # Set heartbeat
        self.componentName = 'JobCreator'
        self.heartbeatAPI = HeartbeatAPI(self.componentName)
        self.heartbeatAPI.registerComponent()

        return

    def tearDown(self):
        """
        _tearDown_

        Drop all the WMBS tables, the work directory, the Couch databases
        and the emulated agent configuration.
        """
        myThread = threading.currentThread()

        self.testInit.clearDatabase(modules=[
            'WMCore.WMBS', 'WMCore.ResourceControl', 'WMCore.Agent.Database'
        ])

        self.testInit.delWorkDir()

        self._teardown = True

        self.testInit.tearDownCouch()
        EmulatorSetup.deleteConfig(self.configFile)

        return

    def createJobCollection(self, name, nSubs, nFiles, workflowURL='test'):
        """
        _createJobCollection_

        Create nSubs subscriptions (one fileset each, nFiles files per
        fileset) against a single workflow, as input for the JobCreator.
        Each file is placed at a randomly chosen test site.
        """
        myThread = threading.currentThread()

        testWorkflow = Workflow(spec=workflowURL, owner="mnorman",
                                name=name, task="/TestWorkload/ReReco")
        testWorkflow.create()

        for sub in range(nSubs):
            nameStr = '%s-%i' % (name, sub)

            # One transaction per subscription keeps the inserts atomic.
            myThread.transaction.begin()

            testFileset = Fileset(name=nameStr)
            testFileset.create()

            for f in range(nFiles):
                # pick a random site
                site = random.choice(self.sites)
                testFile = File(lfn="/lfn/%s/%i" % (nameStr, f),
                                size=1024, events=10)
                testFile.setLocation(site)
                testFile.create()
                testFileset.addFile(testFile)

            testFileset.commit()
            testSubscription = Subscription(fileset=testFileset,
                                            workflow=testWorkflow,
                                            type="Processing",
                                            split_algo="FileBased")
            testSubscription.create()

            myThread.transaction.commit()

        return

    def createWorkload(self, workloadName='Test', emulator=True, priority=1):
        """
        _createTestWorkload_

        Creates a test workload for us to run on, hold the basic necessities.
        The PresetSeeder generator is attached so that generated jobs carry
        deterministic seeds (checked in testA).

        NOTE(review): workloadName, emulator and priority are accepted but
        not used; the workload is always built from "Tier1ReReco".
        """
        workload = testWorkload("Tier1ReReco")
        rereco = workload.getTask("ReReco")
        seederDict = {"generator.initialSeed": 1001,
                      "evtgenproducer.initialSeed": 1001}
        rereco.addGenerator("PresetSeeder", **seederDict)

        taskMaker = TaskMaker(workload, os.path.join(self.testDir, 'workloadTest'))
        taskMaker.skipSubscription = True
        taskMaker.processWorkload()

        return workload

    def getConfig(self):
        """
        _getConfig_

        Creates a common config for the JobCreator/JobMaker components and
        the JobStateMachine, pointing at the test work dir and Couch DBs.
        """
        myThread = threading.currentThread()

        config = self.testInit.getConfiguration()
        self.testInit.generateWorkDir(config)

        # First the general stuff
        config.section_("General")
        config.General.workDir = os.getenv("TESTDIR", os.getcwd())

        config.section_("Agent")
        config.Agent.componentName = self.componentName

        # Now the CoreDatabase information
        # This should be the dialect, dburl, etc
        config.section_("CoreDatabase")
        config.CoreDatabase.connectUrl = os.getenv("DATABASE")
        config.CoreDatabase.socket = os.getenv("DBSOCK")

        config.component_("JobCreator")
        config.JobCreator.namespace = 'WMComponent.JobCreator.JobCreator'
        # The log level of the component.
        #config.JobCreator.logLevel = 'SQLDEBUG'
        config.JobCreator.logLevel = 'INFO'
        # maximum number of threads we want to deal
        # with messages per pool.
        config.JobCreator.maxThreads = 1
        config.JobCreator.UpdateFromResourceControl = True
        config.JobCreator.pollInterval = 10
        #config.JobCreator.jobCacheDir = self.testDir
        config.JobCreator.defaultJobType = 'processing'  # Type of jobs that we run, used for resource control
        config.JobCreator.workerThreads = 4
        config.JobCreator.componentDir = self.testDir
        config.JobCreator.useWorkQueue = True
        config.JobCreator.WorkQueueParams = {'emulateDBSReader': True}

        # We now call the JobMaker from here
        config.component_('JobMaker')
        config.JobMaker.logLevel = 'INFO'
        config.JobMaker.namespace = 'WMCore.WMSpec.Makers.JobMaker'
        config.JobMaker.maxThreads = 1
        config.JobMaker.makeJobsHandler = 'WMCore.WMSpec.Makers.Handlers.MakeJobs'

        # JobStateMachine
        config.component_('JobStateMachine')
        config.JobStateMachine.couchurl = os.getenv('COUCHURL',
                                                    'cmssrv52.fnal.gov:5984')
        config.JobStateMachine.couchDBName = self.couchdbname

        return config

    def testA_VerySimpleTest(self):
        """
        _VerySimpleTest_

        Just test that everything works...more or less: run one JobCreator
        cycle over 5 subscriptions x 10 files and check job counts, acquired
        files, the on-disk job cache layout and the pickled job contents.
        """
        #return
        myThread = threading.currentThread()
        config = self.getConfig()

        name = makeUUID()
        nSubs = 5
        nFiles = 10
        workloadName = 'TestWorkload'

        workload = self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', 'TestWorkload',
                                    'WMSandbox', 'WMWorkload.pkl')

        self.createJobCollection(name=name, nSubs=nSubs, nFiles=nFiles,
                                 workflowURL=workloadPath)

        testJobCreator = JobCreatorPoller(config=config)

        # First, can we run once without everything crashing?
        testJobCreator.algorithm()

        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='Created', jobType="Processing")

        # One job per input file is expected with FileBased splitting.
        self.assertEqual(len(result), nSubs * nFiles)

        # Count database objects
        result = myThread.dbi.processData('SELECT * FROM wmbs_sub_files_acquired')[0].fetchall()
        self.assertEqual(len(result), nSubs * nFiles)

        # Find the test directory
        testDirectory = os.path.join(self.testDir, 'jobCacheDir',
                                     'TestWorkload', 'ReReco')
        # It should have at least one jobGroup
        self.assertTrue('JobCollection_1_0' in os.listdir(testDirectory))
        # But no more then twenty
        self.assertTrue(len(os.listdir(testDirectory)) <= 20)

        groupDirectory = os.path.join(testDirectory, 'JobCollection_1_0')

        # First job should be in here
        listOfDirs = []
        for tmpDirectory in os.listdir(testDirectory):
            listOfDirs.extend(os.listdir(os.path.join(testDirectory, tmpDirectory)))
        self.assertTrue('job_1' in listOfDirs)
        self.assertTrue('job_2' in listOfDirs)
        self.assertTrue('job_3' in listOfDirs)

        jobDir = os.listdir(groupDirectory)[0]
        jobFile = os.path.join(groupDirectory, jobDir, 'job.pkl')
        self.assertTrue(os.path.isfile(jobFile))
        # NOTE(review): text-mode 'r' works for pickles on Python 2 only;
        # Python 3 would need 'rb' here — confirm intended interpreter.
        f = open(jobFile, 'r')
        job = cPickle.load(f)
        f.close()

        # Seeds injected by the PresetSeeder in createWorkload().
        self.assertEqual(job.baggage.PresetSeeder.generator.initialSeed, 1001)
        self.assertEqual(job.baggage.PresetSeeder.evtgenproducer.initialSeed, 1001)

        self.assertEqual(job['workflow'], name)
        self.assertEqual(len(job['input_files']), 1)
        self.assertEqual(os.path.basename(job['sandbox']),
                         'TestWorkload-Sandbox.tar.bz2')

        return

    @attr('performance')
    def testB_ProfilePoller(self):
        """
        Profile your performance

        You shouldn't be running this normally because it doesn't do anything.
        Remove the leading return to profile a full JobCreatorPoller cycle
        over 5 x 1500 files with cProfile.
        """
        # Deliberately disabled; everything below is unreachable until the
        # return is removed by hand.
        return
        myThread = threading.currentThread()

        name = makeUUID()
        nSubs = 5
        nFiles = 1500
        workloadName = 'TestWorkload'

        workload = self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', 'TestWorkload',
                                    'WMSandbox', 'WMWorkload.pkl')

        self.createJobCollection(name=name, nSubs=nSubs, nFiles=nFiles,
                                 workflowURL=workloadPath)

        config = self.getConfig()

        testJobCreator = JobCreatorPoller(config=config)
        cProfile.runctx("testJobCreator.algorithm()", globals(), locals(),
                        filename="testStats.stat")

        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='Created', jobType="Processing")

        time.sleep(10)

        self.assertEqual(len(result), nSubs * nFiles)

        p = pstats.Stats('testStats.stat')
        p.sort_stats('cumulative')
        p.print_stats(.2)

        return

    def testC_ProfileWorker(self):
        """
        Profile where the work actually gets done

        You shouldn't be running this one either, since it doesn't test
        anything; remove the leading return to profile the worker path.
        """
        # Deliberately disabled; everything below is unreachable until the
        # return is removed by hand.
        return
        myThread = threading.currentThread()

        name = makeUUID()
        nSubs = 5
        nFiles = 500
        workloadName = 'TestWorkload'

        workload = self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', 'TestWorkload',
                                    'WMSandbox', 'WMWorkload.pkl')

        self.createJobCollection(name=name, nSubs=nSubs, nFiles=nFiles,
                                 workflowURL=workloadPath)

        config = self.getConfig()

        configDict = {"couchURL": config.JobStateMachine.couchurl,
                      "couchDBName": config.JobStateMachine.couchDBName,
                      'jobCacheDir': config.JobCreator.jobCacheDir,
                      'defaultJobType': config.JobCreator.defaultJobType}

        # NOTE(review): shadows the builtin 'input'; harmless here since the
        # code is unreachable, but rename if ever re-enabled.
        input = [{"subscription": 1}, {"subscription": 2},
                 {"subscription": 3}, {"subscription": 4},
                 {"subscription": 5}]

        testJobCreator = JobCreatorPoller(**configDict)
        cProfile.runctx("testJobCreator.algorithm(parameters = input)",
                        globals(), locals(), filename="workStats.stat")

        p = pstats.Stats('workStats.stat')
        p.sort_stats('cumulative')
        p.print_stats(.2)

        return

    def testD_HugeTest(self):
        """
        Don't run this one either

        Disabled stress test: remove the leading return to time a single
        JobCreator cycle over 10 x 5000 files.
        """
        # Deliberately disabled; everything below is unreachable until the
        # return is removed by hand.
        return
        myThread = threading.currentThread()

        config = self.getConfig()

        name = makeUUID()
        nSubs = 10
        nFiles = 5000
        workloadName = 'Tier1ReReco'

        workload = self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', 'TestWorkload',
                                    'WMSandbox', 'WMWorkload.pkl')

        self.createJobCollection(name=name, nSubs=nSubs, nFiles=nFiles,
                                 workflowURL=workloadPath)

        testJobCreator = JobCreatorPoller(config=config)

        # First, can we run once without everything crashing?
        startTime = time.time()
        testJobCreator.algorithm()
        stopTime = time.time()

        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='Created', jobType="Processing")

        self.assertEqual(len(result), nSubs * nFiles)

        print("Job took %f seconds to run" % (stopTime - startTime))

        # Count database objects
        result = myThread.dbi.processData('SELECT * FROM wmbs_sub_files_acquired')[0].fetchall()
        self.assertEqual(len(result), nSubs * nFiles)

        return

    def stuffWMBS(self, workflowURL, name):
        """
        _stuffWMBS_

        Insert some dummy jobs, jobgroups, filesets, files and subscriptions
        into WMBS to test job creation.  Three completed job groups each
        containing several files are injected.  Another incomplete job group
        is also injected.  Also files are added to the "Mergeable"
        subscription as well as to the output fileset for their jobgroups.
        """
        locationAction = self.daoFactory(classname="Locations.New")
        locationAction.execute(siteName="s1", seName="somese.cern.ch")

        # NOTE(review): changeStateDAO is created but never used below.
        changeStateDAO = self.daoFactory(classname="Jobs.ChangeState")

        mergeFileset = Fileset(name="mergeFileset")
        mergeFileset.create()
        bogusFileset = Fileset(name="bogusFileset")
        bogusFileset.create()

        mergeWorkflow = Workflow(spec=workflowURL, owner="mnorman",
                                 name=name, task="/TestWorkload/ReReco")
        mergeWorkflow.create()

        mergeSubscription = Subscription(fileset=mergeFileset,
                                         workflow=mergeWorkflow,
                                         split_algo="ParentlessMergeBySize")
        mergeSubscription.create()
        # NOTE(review): bogusSubscription is never create()'d — presumably
        # intentional (the bogus fileset only holds duplicate file entries),
        # but confirm against the merge-splitting expectations in testE.
        bogusSubscription = Subscription(fileset=bogusFileset,
                                         workflow=mergeWorkflow,
                                         split_algo="ParentlessMergeBySize")

        # Run 1: four contiguous 1024-event files.
        file1 = File(lfn="file1", size=1024, events=1024, first_event=0,
                     locations=set(["somese.cern.ch"]))
        file1.addRun(Run(1, *[45]))
        file1.create()
        file2 = File(lfn="file2", size=1024, events=1024, first_event=1024,
                     locations=set(["somese.cern.ch"]))
        file2.addRun(Run(1, *[45]))
        file2.create()
        file3 = File(lfn="file3", size=1024, events=1024, first_event=2048,
                     locations=set(["somese.cern.ch"]))
        file3.addRun(Run(1, *[45]))
        file3.create()
        file4 = File(lfn="file4", size=1024, events=1024, first_event=3072,
                     locations=set(["somese.cern.ch"]))
        file4.addRun(Run(1, *[45]))
        file4.create()

        # Run 1, lumi 46: three more contiguous files.
        fileA = File(lfn="fileA", size=1024, events=1024, first_event=0,
                     locations=set(["somese.cern.ch"]))
        fileA.addRun(Run(1, *[46]))
        fileA.create()
        fileB = File(lfn="fileB", size=1024, events=1024, first_event=1024,
                     locations=set(["somese.cern.ch"]))
        fileB.addRun(Run(1, *[46]))
        fileB.create()
        fileC = File(lfn="fileC", size=1024, events=1024, first_event=2048,
                     locations=set(["somese.cern.ch"]))
        fileC.addRun(Run(1, *[46]))
        fileC.create()

        # Run 2: mixed sizes/event counts (fileIII is event-heavy, fileIV is
        # size-heavy) to exercise the merge-size/merge-event thresholds.
        fileI = File(lfn="fileI", size=1024, events=1024, first_event=0,
                     locations=set(["somese.cern.ch"]))
        fileI.addRun(Run(2, *[46]))
        fileI.create()
        fileII = File(lfn="fileII", size=1024, events=1024, first_event=1024,
                      locations=set(["somese.cern.ch"]))
        fileII.addRun(Run(2, *[46]))
        fileII.create()
        fileIII = File(lfn="fileIII", size=1024, events=102400, first_event=2048,
                       locations=set(["somese.cern.ch"]))
        fileIII.addRun(Run(2, *[46]))
        fileIII.create()
        fileIV = File(lfn="fileIV", size=102400, events=1024, first_event=3072,
                      locations=set(["somese.cern.ch"]))
        fileIV.addRun(Run(2, *[46]))
        fileIV.create()

        # Every file goes into both filesets.
        for file in [file1, file2, file3, file4, fileA, fileB, fileC,
                     fileI, fileII, fileIII, fileIV]:
            mergeFileset.addFile(file)
            bogusFileset.addFile(file)

        mergeFileset.commit()
        bogusFileset.commit()

        return

    def testE_TestNonProxySplitting(self):
        """
        _TestNonProxySplitting_

        Test and see if we can split things without a proxy: switch the
        ReReco task to ParentlessMergeBySize, stuff WMBS by hand and verify
        that exactly one Processing job and no Merge jobs are created.
        """
        myThread = threading.currentThread()
        config = self.getConfig()
        config.JobCreator.workerThreads = 1

        name = makeUUID()
        workloadName = 'TestWorkload'

        workload = self.createWorkload(workloadName=workloadName)

        # Change the file splitting algo
        procTask = workload.getTask("ReReco")
        procTask.setSplittingAlgorithm("ParentlessMergeBySize",
                                       min_merge_size=1,
                                       max_merge_size=100000,
                                       max_merge_events=200000)

        workloadPath = os.path.join(self.testDir, 'workloadTest', 'TestWorkload',
                                    'WMSandbox', 'WMWorkload.pkl')

        self.stuffWMBS(workflowURL=workloadPath, name=name)

        testJobCreator = JobCreatorPoller(config=config)

        testJobCreator.algorithm()

        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), 1)

        result = getJobsAction.execute(state='Created', jobType="Merge")
        self.assertEqual(len(result), 0)

        return
class JobSubmitterTest(unittest.TestCase): """ _JobSubmitterTest_ Test class for the JobSubmitterPoller """ def setUp(self): """ _setUp_ Standard setup: Now with 100% more couch """ self.testInit = TestInit(__file__) self.testInit.setLogging() self.testInit.setDatabaseConnection() self.testInit.setSchema(customModules = ["WMCore.WMBS", "WMCore.BossAir", "WMCore.ResourceControl", "WMCore.Agent.Database"]) self.testInit.setupCouch("jobsubmitter_t/jobs", "JobDump") self.testInit.setupCouch("jobsubmitter_t/fwjrs", "FWJRDump") self.testInit.setupCouch("wmagent_summary_t", "WMStats") myThread = threading.currentThread() self.daoFactory = DAOFactory(package = "WMCore.WMBS", logger = myThread.logger, dbinterface = myThread.dbi) self.baDaoFactory = DAOFactory(package = "WMCore.BossAir", logger = myThread.logger, dbinterface = myThread.dbi) self.testDir = self.testInit.generateWorkDir() # Set heartbeat self.componentName = 'JobSubmitter' self.heartbeatAPI = HeartbeatAPI(self.componentName) self.heartbeatAPI.registerComponent() return def tearDown(self): """ _tearDown_ Standard tearDown """ self.testInit.clearDatabase() self.testInit.delWorkDir() self.testInit.tearDownCouch() return def setResourceThresholds(self, site, **options): """ _setResourceThresholds_ Utility to set resource thresholds """ if not options: options = {'state' : 'Normal', 'runningSlots' : 10, 'pendingSlots' : 5, 'tasks' : ['Processing', 'Merge'], 'Processing' : {'pendingSlots' : 5, 'runningSlots' : 10}, 'Merge' : {'pendingSlots' : 2, 'runningSlots' : 5}} resourceControl = ResourceControl() resourceControl.insertSite(siteName = site, seName = 'se.%s' % (site), ceName = site, plugin = "MockPlugin", pendingSlots = options['pendingSlots'], runningSlots = options['runningSlots'], cmsName = site) for task in options['tasks']: resourceControl.insertThreshold(siteName = site, taskType = task, maxSlots = options[task]['runningSlots'], pendingSlots = options[task]['pendingSlots']) if options.get('state'): 
resourceControl.changeSiteState(site, options.get('state')) return def createJobGroups(self, nSubs, nJobs, task, workloadSpec, site, bl = [], wl = [], taskType = 'Processing', name = None): """ _createJobGroups_ Creates a series of jobGroups for submissions """ jobGroupList = [] if name is None: name = makeUUID() testWorkflow = Workflow(spec = workloadSpec, owner = "mnorman", name = name, task = "basicWorkload/Production") testWorkflow.create() # Create subscriptions for _ in range(nSubs): name = makeUUID() # Create Fileset, Subscription, jobGroup testFileset = Fileset(name = name) testFileset.create() testSubscription = Subscription(fileset = testFileset, workflow = testWorkflow, type = taskType, split_algo = "FileBased") testSubscription.create() testJobGroup = JobGroup(subscription = testSubscription) testJobGroup.create() # Create jobs self.makeNJobs(name = name, task = task, nJobs = nJobs, jobGroup = testJobGroup, fileset = testFileset, sub = testSubscription.exists(), site = site, bl = bl, wl = wl) testFileset.commit() testJobGroup.commit() jobGroupList.append(testJobGroup) return jobGroupList def makeNJobs(self, name, task, nJobs, jobGroup, fileset, sub, site, bl = [], wl = []): """ _makeNJobs_ Make and return a WMBS Job and File This handles all those damn add-ons """ # Set the CacheDir cacheDir = os.path.join(self.testDir, 'CacheDir') for n in range(nJobs): # First make a file #site = self.sites[0] testFile = File(lfn = "/singleLfn/%s/%s" % (name, n), size = 1024, events = 10) if type(site) == list: for singleSite in site: testFile.setLocation(singleSite) else: testFile.setLocation(site) testFile.create() fileset.addFile(testFile) fileset.commit() index = 0 for f in fileset.files: index += 1 testJob = Job(name = '%s-%i' % (name, index)) testJob.addFile(f) testJob["location"] = f.getLocations()[0] testJob['task'] = task.getPathName() testJob['sandbox'] = task.data.input.sandbox testJob['spec'] = os.path.join(self.testDir, 'basicWorkload.pcl') 
testJob['mask']['FirstEvent'] = 101 testJob["siteBlacklist"] = bl testJob["siteWhitelist"] = wl testJob['priority'] = 101 jobCache = os.path.join(cacheDir, 'Sub_%i' % (sub), 'Job_%i' % (index)) os.makedirs(jobCache) testJob.create(jobGroup) testJob['cache_dir'] = jobCache testJob.save() jobGroup.add(testJob) output = open(os.path.join(jobCache, 'job.pkl'), 'w') pickle.dump(testJob, output) output.close() return testJob, testFile def getConfig(self): """ _getConfig_ Gets a basic config from default location """ config = Configuration() config.component_("Agent") config.Agent.WMSpecDirectory = self.testDir config.Agent.agentName = 'testAgent' config.Agent.componentName = self.componentName config.Agent.useHeartbeat = False #First the general stuff config.section_("General") config.General.workDir = os.getenv("TESTDIR", self.testDir) #Now the CoreDatabase information config.section_("CoreDatabase") config.CoreDatabase.connectUrl = os.getenv("DATABASE") config.CoreDatabase.socket = os.getenv("DBSOCK") # BossAir and MockPlugin configuration config.section_("BossAir") config.BossAir.pluginNames = ['MockPlugin'] config.BossAir.pluginDir = 'WMCore.BossAir.Plugins' config.BossAir.multicoreTaskTypes = ['MultiProcessing', 'MultiProduction'] config.BossAir.nCondorProcesses = 1 config.BossAir.section_("MockPlugin") config.BossAir.MockPlugin.fakeReport = os.path.join(getTestBase(), 'WMComponent_t/JobSubmitter_t', "submit.sh") # JobSubmitter configuration config.component_("JobSubmitter") config.JobSubmitter.logLevel = 'DEBUG' config.JobSubmitter.maxThreads = 1 config.JobSubmitter.pollInterval = 10 config.JobSubmitter.submitScript = os.path.join(getTestBase(), 'WMComponent_t/JobSubmitter_t', 'submit.sh') config.JobSubmitter.componentDir = os.path.join(self.testDir, 'Components') config.JobSubmitter.workerThreads = 2 config.JobSubmitter.jobsPerWorker = 200 #JobStateMachine config.component_('JobStateMachine') config.JobStateMachine.couchurl = os.getenv('COUCHURL') 
config.JobStateMachine.couchDBName = "jobsubmitter_t" config.JobStateMachine.jobSummaryDBName = 'wmagent_summary_t' # Needed, because this is a test os.makedirs(config.JobSubmitter.componentDir) return config def createTestWorkload(self, workloadName = 'Tier1ReReco'): """ _createTestWorkload_ Creates a test workload for us to run on, hold the basic necessities. """ workload = testWorkload(workloadName) taskMaker = TaskMaker(workload, os.path.join(self.testDir, 'workloadTest')) taskMaker.skipSubscription = True taskMaker.processWorkload() return workload def testA_BasicTest(self): """ Use the MockPlugin to create a simple test Check to see that all the jobs were "submitted", don't care about thresholds """ workloadName = "basicWorkload" workload = self.createTestWorkload() config = self.getConfig() changeState = ChangeState(config) nSubs = 2 nJobs = 20 site = 'T2_US_UCSD' self.setResourceThresholds(site, pendingSlots = 50, runningSlots = 100, tasks = ['Processing', 'Merge'], Processing = {'pendingSlots' : 50, 'runningSlots' : 100}, Merge = {'pendingSlots' : 50, 'runningSlots' : 100}) jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs, task = workload.getTask("ReReco"), workloadSpec = os.path.join(self.testDir, 'workloadTest', workloadName), site = 'se.%s' % site) for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') # Do pre-submit check getJobsAction = self.daoFactory(classname = "Jobs.GetAllJobs") result = getJobsAction.execute(state = 'Created', jobType = "Processing") self.assertEqual(len(result), nSubs * nJobs) jobSubmitter = JobSubmitterPoller(config = config) jobSubmitter.algorithm() # Check that jobs are in the right state result = getJobsAction.execute(state = 'Created', jobType = "Processing") self.assertEqual(len(result), 0) result = getJobsAction.execute(state = 'Executing', jobType = "Processing") self.assertEqual(len(result), nSubs * nJobs) # Check assigned locations getLocationAction = self.daoFactory(classname = 
"Jobs.GetLocation") for jobId in result: loc = getLocationAction.execute(jobid = jobId) self.assertEqual(loc, [['T2_US_UCSD']]) # Run another cycle, it shouldn't submit anything. There isn't anything to submit jobSubmitter.algorithm() result = getJobsAction.execute(state = 'Created', jobType = "Processing") self.assertEqual(len(result), 0) result = getJobsAction.execute(state = 'Executing', jobType = "Processing") self.assertEqual(len(result), nSubs * nJobs) nSubs = 1 nJobs = 10 # Submit another 10 jobs jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs, task = workload.getTask("ReReco"), workloadSpec = os.path.join(self.testDir, 'workloadTest', workloadName), site = 'se.%s' % site, taskType = "Merge") for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') # Check that the jobs are available for submission and run another cycle result = getJobsAction.execute(state = 'Created', jobType = "Merge") self.assertEqual(len(result), nSubs * nJobs) jobSubmitter.algorithm() #Check that the last 10 jobs were submitted as well. result = getJobsAction.execute(state = 'Created', jobType = "Merge") self.assertEqual(len(result), 0) result = getJobsAction.execute(state = 'Executing', jobType = "Merge") self.assertEqual(len(result), nSubs * nJobs) return def testB_thresholdTest(self): """ _testB_thresholdTest_ Check that the threshold management is working, this requires checks on pending/running jobs globally at a site and per task/site """ workloadName = "basicWorkload" workload = self.createTestWorkload() config = self.getConfig() changeState = ChangeState(config) nSubs = 5 nJobs = 10 sites = ['T1_US_FNAL'] for site in sites: self.setResourceThresholds(site, pendingSlots = 50, runningSlots = 200, tasks = ['Processing', 'Merge'], Processing = {'pendingSlots' : 45, 'runningSlots' :-1}, Merge = {'pendingSlots' : 10, 'runningSlots' : 20, 'priority' : 5}) # Always initialize the submitter after setting the sites, flaky! 
jobSubmitter = JobSubmitterPoller(config = config) jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs, task = workload.getTask("ReReco"), workloadSpec = os.path.join(self.testDir, 'workloadTest', workloadName), site = 'se.%s' % 'T1_US_FNAL') for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') # Do pre-submit check getJobsAction = self.daoFactory(classname = "Jobs.GetAllJobs") result = getJobsAction.execute(state = 'Created', jobType = "Processing") self.assertEqual(len(result), nSubs * nJobs) jobSubmitter.algorithm() # Check that jobs are in the right state, # here we are limited by the pending threshold for the Processing task (45) result = getJobsAction.execute(state = 'Created', jobType = "Processing") self.assertEqual(len(result), 5) result = getJobsAction.execute(state = 'Executing', jobType = "Processing") self.assertEqual(len(result), 45) # Check assigned locations getLocationAction = self.daoFactory(classname = "Jobs.GetLocation") for jobId in result: loc = getLocationAction.execute(jobid = jobId) self.assertEqual(loc, [['T1_US_FNAL']]) # Run another cycle, it shouldn't submit anything. 
Jobs are still in pending jobSubmitter.algorithm() result = getJobsAction.execute(state = 'Created', jobType = "Processing") self.assertEqual(len(result), 5) result = getJobsAction.execute(state = 'Executing', jobType = "Processing") self.assertEqual(len(result), 45) # Now put 10 Merge jobs, only 5 can be submitted, there we hit the global pending threshold for the site nSubs = 1 nJobs = 10 jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs, task = workload.getTask("ReReco"), workloadSpec = os.path.join(self.testDir, 'workloadTest', workloadName), site = 'se.%s' % 'T1_US_FNAL', taskType = 'Merge') for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') jobSubmitter.algorithm() result = getJobsAction.execute(state = 'Created', jobType = "Merge") self.assertEqual(len(result), 5) result = getJobsAction.execute(state = 'Executing', jobType = "Merge") self.assertEqual(len(result), 5) result = getJobsAction.execute(state = 'Created', jobType = "Processing") self.assertEqual(len(result), 5) result = getJobsAction.execute(state = 'Executing', jobType = "Processing") self.assertEqual(len(result), 45) # Now let's test running thresholds # The scenario will be setup as follows: Move all current jobs as running # Create 300 Processing jobs and 300 merge jobs # Run 5 polling cycles, moving all pending jobs to running in between # Result is, merge is left at 25 running 0 pending and processing is left at 215 running 0 pending # Processing has 135 jobs in queue and Merge 285 # This tests all threshold dynamics including the prioritization of merge over processing nSubs = 1 nJobs = 300 jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs, task = workload.getTask("ReReco"), workloadSpec = os.path.join(self.testDir, 'workloadTest', workloadName), site = 'se.%s' % 'T1_US_FNAL') jobGroupList.extend(self.createJobGroups(nSubs = nSubs, nJobs = nJobs, task = workload.getTask("ReReco"), workloadSpec = os.path.join(self.testDir, 
'workloadTest', workloadName), site = 'se.%s' % 'T1_US_FNAL', taskType = 'Merge')) for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') getRunJobID = self.baDaoFactory(classname = "LoadByWMBSID") setRunJobStatus = self.baDaoFactory(classname = "SetStatus") for _ in range(5): result = getJobsAction.execute(state = 'Executing') binds = [] for jobId in result: binds.append({'id' : jobId, 'retry_count' : 0}) runJobIds = getRunJobID.execute(binds) setRunJobStatus.execute([x['id'] for x in runJobIds], 'Running') jobSubmitter.algorithm() result = getJobsAction.execute(state = 'Executing', jobType = 'Processing') self.assertEqual(len(result), 215) result = getJobsAction.execute(state = 'Created', jobType = 'Processing') self.assertEqual(len(result), 135) result = getJobsAction.execute(state = 'Executing', jobType = 'Merge') self.assertEqual(len(result), 25) result = getJobsAction.execute(state = 'Created', jobType = 'Merge') self.assertEqual(len(result), 285) return def testC_prioritization(self): """ _testC_prioritization_ Check that jobs are prioritized by job type and by oldest workflow """ workloadName = "basicWorkload" workload = self.createTestWorkload() config = self.getConfig() changeState = ChangeState(config) nSubs = 1 nJobs = 10 sites = ['T1_US_FNAL'] for site in sites: self.setResourceThresholds(site, pendingSlots = 10, runningSlots = -1, tasks = ['Processing', 'Merge'], Processing = {'pendingSlots' : 50, 'runningSlots' :-1}, Merge = {'pendingSlots' : 10, 'runningSlots' :-1, 'priority' : 5}) # Always initialize the submitter after setting the sites, flaky! 
jobSubmitter = JobSubmitterPoller(config = config) jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs, task = workload.getTask("ReReco"), workloadSpec = os.path.join(self.testDir, 'workloadTest', workloadName), site = 'se.%s' % 'T1_US_FNAL', name = 'OldestWorkflow') jobGroupList.extend(self.createJobGroups(nSubs = nSubs, nJobs = nJobs, task = workload.getTask("ReReco"), workloadSpec = os.path.join(self.testDir, 'workloadTest', workloadName), site = 'se.%s' % 'T1_US_FNAL', taskType = 'Merge')) for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') jobSubmitter.algorithm() # Merge goes first getJobsAction = self.daoFactory(classname = "Jobs.GetAllJobs") result = getJobsAction.execute(state = 'Created', jobType = "Merge") self.assertEqual(len(result), 0) result = getJobsAction.execute(state = 'Executing', jobType = "Merge") self.assertEqual(len(result), 10) result = getJobsAction.execute(state = 'Created', jobType = "Processing") self.assertEqual(len(result), 10) result = getJobsAction.execute(state = 'Executing', jobType = "Processing") self.assertEqual(len(result), 0) # Create a newer workflow processing, and after some new jobs for an old workflow jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs, task = workload.getTask("ReReco"), workloadSpec = os.path.join(self.testDir, 'workloadTest', workloadName), site = 'se.%s' % 'T1_US_FNAL', name = 'NewestWorkflow') jobGroupList.extend(self.createJobGroups(nSubs = nSubs, nJobs = nJobs, task = workload.getTask("ReReco"), workloadSpec = os.path.join(self.testDir, 'workloadTest', workloadName), site = 'se.%s' % 'T1_US_FNAL', name = 'OldestWorkflow')) for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') # Move pending jobs to running getRunJobID = self.baDaoFactory(classname = "LoadByWMBSID") setRunJobStatus = self.baDaoFactory(classname = "SetStatus") for idx in range(2): result = getJobsAction.execute(state = 'Executing') binds = [] for jobId in 
result: binds.append({'id' : jobId, 'retry_count' : 0}) runJobIds = getRunJobID.execute(binds) setRunJobStatus.execute([x['id'] for x in runJobIds], 'Running') # Run again on created workflows jobSubmitter.algorithm() result = getJobsAction.execute(state = 'Created', jobType = "Merge") self.assertEqual(len(result), 0) result = getJobsAction.execute(state = 'Executing', jobType = "Merge") self.assertEqual(len(result), 10) result = getJobsAction.execute(state = 'Created', jobType = "Processing") self.assertEqual(len(result), 30 - (idx + 1) * 10) result = getJobsAction.execute(state = 'Executing', jobType = "Processing") self.assertEqual(len(result), (idx + 1) * 10) # Check that older workflow goes first even with newer jobs getWorkflowAction = self.daoFactory(classname = "Jobs.GetWorkflowTask") workflows = getWorkflowAction.execute(result) for workflow in workflows: self.assertEqual(workflow['name'], 'OldestWorkflow') return def testD_WhiteListBlackList(self): """ _testD_WhiteListBlackList_ Test the whitelist/blacklist implementation Trust the jobCreator to get this in the job right """ workloadName = "basicWorkload" workload = self.createTestWorkload() config = self.getConfig() changeState = ChangeState(config) nSubs = 2 nJobs = 10 sites = ['T2_US_Florida', 'T2_TW_Taiwan', 'T2_CH_CERN', 'T3_CO_Uniandes'] for site in sites: self.setResourceThresholds(site, pendingSlots = 1000, runningSlots = -1, tasks = ['Processing', 'Merge'], Processing = {'pendingSlots' : 5000, 'runningSlots' :-1}, Merge = {'pendingSlots' : 1000, 'runningSlots' :-1, 'priority' : 5}) jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs, site = 'se.%s' % sites[-1], task = workload.getTask("ReReco"), workloadSpec = os.path.join(self.testDir, 'workloadTest', workloadName), bl = sites[:-1]) for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') jobSubmitter = JobSubmitterPoller(config = config) # Actually run it jobSubmitter.algorithm() getJobsAction = 
self.daoFactory(classname = "Jobs.GetAllJobs") result = getJobsAction.execute(state = 'Executing', jobType = "Processing") self.assertEqual(len(result), nSubs * nJobs) # All jobs should be at T3_CO_Uniandes # Check assigned locations getLocationAction = self.daoFactory(classname = "Jobs.GetLocation") locationDict = getLocationAction.execute([{'jobid' : x} for x in result]) for entry in locationDict: loc = entry['site_name'] self.assertEqual(loc, 'T3_CO_Uniandes') # Run again and test the whiteList jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs, task = workload.getTask("ReReco"), site = 'se.%s' % 'T2_CH_CERN', workloadSpec = os.path.join(self.testDir, 'workloadTest', workloadName), wl = ['T2_CH_CERN']) for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') # Run it jobSubmitter.algorithm() # You'll have jobs from the previous run still in the database result = getJobsAction.execute(state = 'Executing', jobType = "Processing") self.assertEqual(len(result), nSubs * nJobs * 2) # All jobs should be at CERN or Uniandes locationDict = getLocationAction.execute([{'jobid' : x} for x in result]) for entry in locationDict[nSubs * nJobs:]: loc = entry['site_name'] self.assertEqual(loc, 'T2_CH_CERN') # Run again with an invalid whitelist # After this point, the original two sets of jobs will be executing # The rest of the jobs should move to submitFailed jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs, task = workload.getTask("ReReco"), site = 'se.%s' % 'T2_CH_CERN', workloadSpec = os.path.join(self.testDir, 'workloadTest', workloadName), wl = ['T2_US_Namibia']) for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') jobSubmitter.algorithm() # Jobs should be gone getJobsAction = self.daoFactory(classname = "Jobs.GetAllJobs") result = getJobsAction.execute(state = 'Executing', jobType = "Processing") self.assertEqual(len(result), nSubs * nJobs * 2) result = getJobsAction.execute(state = 
'SubmitFailed', jobType = "Processing") self.assertEqual(len(result), nSubs * nJobs) # Run again with all sites blacklisted jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs, task = workload.getTask("ReReco"), site = ['se.%s' % x for x in sites], workloadSpec = os.path.join(self.testDir, 'workloadTest', workloadName), bl = sites) for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') jobSubmitter.algorithm() # Jobs should go to submit failed getJobsAction = self.daoFactory(classname = "Jobs.GetAllJobs") result = getJobsAction.execute(state = 'Executing', jobType = "Processing") self.assertEqual(len(result), nSubs * nJobs * 2) result = getJobsAction.execute(state = 'SubmitFailed', jobType = "Processing") self.assertEqual(len(result), nSubs * nJobs * 2) return def testE_SiteModesTest(self): """ _testE_SiteModesTest_ Test the behavior of the submitter in response to the different states of the sites """ workloadName = "basicWorkload" workload = self.createTestWorkload() config = self.getConfig() changeState = ChangeState(config) nSubs = 1 nJobs = 20 sites = ['T2_US_Florida', 'T2_TW_Taiwan', 'T3_CO_Uniandes', 'T1_US_FNAL'] for site in sites: self.setResourceThresholds(site, pendingSlots = 10, runningSlots = -1, tasks = ['Processing', 'Merge'], Processing = {'pendingSlots' : 10, 'runningSlots' :-1}, Merge = {'pendingSlots' : 10, 'runningSlots' :-1, 'priority' : 5}) myResourceControl = ResourceControl() myResourceControl.changeSiteState('T2_US_Florida', 'Draining') # First test that we prefer Normal over drain, and T1 over T2/T3 jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs, site = ['se.%s' % x for x in sites], task = workload.getTask("ReReco"), workloadSpec = os.path.join(self.testDir, 'workloadTest', workloadName)) for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') jobSubmitter = JobSubmitterPoller(config = config) # Actually run it jobSubmitter.algorithm() getJobsAction = 
self.daoFactory(classname = "Jobs.GetAllJobs") result = getJobsAction.execute(state = 'Executing', jobType = "Processing") self.assertEqual(len(result), nSubs * nJobs) # All jobs should be at either FNAL, Taiwan or Uniandes. It's a random selection # Check assigned locations getLocationAction = self.daoFactory(classname = "Jobs.GetLocation") locationDict = getLocationAction.execute([{'jobid' : x} for x in result]) for entry in locationDict: loc = entry['site_name'] self.assertNotEqual(loc, 'T2_US_Florida') # Now set everything to down, check we don't submit anything for site in sites: myResourceControl.changeSiteState(site, 'Down') jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs, site = ['se.%s' % x for x in sites], task = workload.getTask("ReReco"), workloadSpec = os.path.join(self.testDir, 'workloadTest', workloadName)) for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') jobSubmitter.algorithm() # Nothing is submitted despite the empty slots at Uniandes and Florida result = getJobsAction.execute(state = 'Executing', jobType = "Processing") self.assertEqual(len(result), nSubs * nJobs) # Now set everything to Aborted, and create Merge jobs. 
Those should fail # since the can only run at one place for site in sites: myResourceControl.changeSiteState(site, 'Aborted') nSubsMerge = 1 nJobsMerge = 5 jobGroupList = self.createJobGroups(nSubs = nSubsMerge, nJobs = nJobsMerge, site = ['se.%s' % x for x in sites], task = workload.getTask("ReReco"), workloadSpec = os.path.join(self.testDir, 'workloadTest', workloadName), taskType = 'Merge') for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') jobSubmitter.algorithm() result = getJobsAction.execute(state = 'SubmitFailed', jobType = 'Merge') self.assertEqual(len(result), nSubsMerge * nJobsMerge) result = getJobsAction.execute(state = 'Executing', jobType = 'Processing') self.assertEqual(len(result), nSubs * nJobs) return @attr('performance') def testF_PollerProfileTest(self): """ _testF_PollerProfileTest_ Submit a lot of jobs and test how long it takes for them to actually be submitted """ workloadName = "basicWorkload" workload = self.createTestWorkload() config = self.getConfig() changeState = ChangeState(config) nSubs = 100 nJobs = 100 sites = ['T1_US_FNAL'] for site in sites: self.setResourceThresholds(site, pendingSlots = 20000, runningSlots = -1, tasks = ['Processing', 'Merge'], Processing = {'pendingSlots' : 10000, 'runningSlots' :-1}, Merge = {'pendingSlots' : 10000, 'runningSlots' :-1, 'priority' : 5}) # Always initialize the submitter after setting the sites, flaky! 
jobSubmitter = JobSubmitterPoller(config = config) jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs, task = workload.getTask("ReReco"), workloadSpec = os.path.join(self.testDir, 'workloadTest', workloadName), site = 'se.%s' % 'T1_US_FNAL') jobGroupList.extend(self.createJobGroups(nSubs = nSubs, nJobs = nJobs, task = workload.getTask("ReReco"), workloadSpec = os.path.join(self.testDir, 'workloadTest', workloadName), site = 'se.%s' % 'T1_US_FNAL', taskType = 'Merge')) for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') # Actually run it startTime = time.time() cProfile.runctx("jobSubmitter.algorithm()", globals(), locals(), filename = "testStats.stat") stopTime = time.time() print "Job took %f seconds to complete" % (stopTime - startTime) p = pstats.Stats('testStats.stat') p.sort_stats('cumulative') p.print_stats() return
class BossAirTest(unittest.TestCase):
    """
    Tests for the BossAir prototype

    NOTE(review): this entire class is dead code - a second class with the
    exact same name ``BossAirTest`` is defined later in this module and
    shadows this one at import time, so none of these tests ever run.
    Consider deleting it or renaming one of the two.
    """

    # Candidate execution sites used throughout the tests
    sites = ["T2_US_Florida", "T2_US_UCSD", "T2_TW_Taiwan", "T1_CH_CERN", "malpaquet"]

    def setUp(self):
        """
        setup for test.

        Builds the WMBS/BossAir/ResourceControl schema, two Couch databases,
        resource-control entries for every test site plus three special
        plugin-specific sites, a dummy user and two heartbeat components.
        """
        myThread = threading.currentThread()

        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        # self.tearDown()
        self.testInit.setSchema(
            customModules=["WMCore.WMBS", "WMCore.BossAir", "WMCore.ResourceControl", "WMCore.Agent.Database"],
            useDefault=False,
        )
        self.testInit.setupCouch("bossair_t/jobs", "JobDump")
        self.testInit.setupCouch("bossair_t/fwjrs", "FWJRDump")

        self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger, dbinterface=myThread.dbi)
        self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")

        locationAction = self.daoFactory(classname="Locations.New")
        # NOTE(review): locationAction and locationSlots are created but never
        # executed in this setUp - presumably leftovers; confirm and remove.
        locationSlots = self.daoFactory(classname="Locations.SetJobSlots")

        # Create sites in resourceControl
        resourceControl = ResourceControl()
        for site in self.sites:
            resourceControl.insertSite(
                siteName=site, seName="se.%s" % (site), ceName=site, plugin="CondorPlugin", jobSlots=1000
            )
            resourceControl.insertThreshold(siteName=site, taskType="Processing", maxSlots=1000)

        # Special site exercising the TestPlugin
        resourceControl.insertSite(siteName="Xanadu", seName="se.Xanadu", ceName="Xanadu", plugin="TestPlugin")
        resourceControl.insertThreshold(siteName="Xanadu", taskType="Processing", maxSlots=10000)

        # Special site exercising the ARCPlugin
        resourceControl.insertSite(
            siteName="jade-cms.hip.fi", seName="madhatter.csc.fi", ceName="jade-cms.hip.fi", plugin="ARCPlugin"
        )
        resourceControl.insertThreshold(siteName="jade-cms.hip.fi", taskType="Processing", maxSlots=100)

        # using this for glite submissions
        resourceControl.insertSite(
            siteName="grid-ce-01.ba.infn.it",
            seName="storm-se-01.ba.infn.it",
            ceName="grid-ce-01.ba.infn.it",
            plugin="gLitePlugin",
        )
        resourceControl.insertThreshold(siteName="grid-ce-01.ba.infn.it", taskType="Processing", maxSlots=50)

        # Create user
        newuser = self.daoFactory(classname="Users.New")
        newuser.execute(dn="moron")

        # We actually need the user name
        self.user = getpass.getuser()

        self.testDir = self.testInit.generateWorkDir()

        # Set heartbeat
        componentName = "test"
        self.heartbeatAPI = HeartbeatAPI(componentName)
        self.heartbeatAPI.registerComponent()
        componentName = "JobTracker"
        self.heartbeatAPI2 = HeartbeatAPI(componentName)
        self.heartbeatAPI2.registerComponent()

        return

    def tearDown(self):
        """
        Database deletion

        Drops every schema created in setUp, removes the work dir and the
        Couch databases.
        """
        self.testInit.clearDatabase(
            modules=["WMCore.WMBS", "WMCore.BossAir", "WMCore.ResourceControl", "WMCore.Agent.Database"]
        )
        self.testInit.delWorkDir()
        self.testInit.tearDownCouch()
        return

    def getConfig(self):
        """
        _getConfig_

        Build a basic BossAir config
        """
        config = Configuration()

        config.section_("Agent")
        config.Agent.agentName = "testAgent"
        config.Agent.componentName = "test"
        config.Agent.useHeartbeat = False

        config.section_("CoreDatabase")
        config.CoreDatabase.connectUrl = os.getenv("DATABASE")
        config.CoreDatabase.socket = os.getenv("DBSOCK")

        config.section_("BossAir")
        config.BossAir.pluginNames = ["TestPlugin", "CondorPlugin"]
        config.BossAir.pluginDir = "WMCore.BossAir.Plugins"

        config.component_("JobSubmitter")
        config.JobSubmitter.logLevel = "INFO"
        config.JobSubmitter.pollInterval = 1
        config.JobSubmitter.pluginName = "AirPlugin"
        config.JobSubmitter.pluginDir = "JobSubmitter.Plugins"
        config.JobSubmitter.submitDir = os.path.join(self.testDir, "submit")
        config.JobSubmitter.submitNode = os.getenv("HOSTNAME", "badtest.fnal.gov")
        config.JobSubmitter.submitScript = os.path.join(
            WMCore.WMInit.getWMBASE(), "test/python/WMComponent_t/JobSubmitter_t", "submit.sh"
        )
        config.JobSubmitter.componentDir = os.path.join(os.getcwd(), "Components")
        config.JobSubmitter.workerThreads = 2
        config.JobSubmitter.jobsPerWorker = 200
        config.JobSubmitter.gLiteConf = os.path.join(os.getcwd(), "config.cfg")

        # JobTracker
        config.component_("JobTracker")
        config.JobTracker.logLevel = "INFO"
        config.JobTracker.pollInterval = 1

        # JobStateMachine
        config.component_("JobStateMachine")
        config.JobStateMachine.couchurl = os.getenv("COUCHURL")
        config.JobStateMachine.couchDBName = "bossair_t"

        # JobStatusLite
        config.component_("JobStatusLite")
        config.JobStatusLite.componentDir = os.path.join(os.getcwd(), "Components")
        config.JobStatusLite.stateTimeouts = {"Pending": 10, "Running": 86400}
        config.JobStatusLite.pollInterval = 1

        return config

    def createTestWorkload(self, workloadName="Test", emulator=True):
        """
        _createTestWorkload_

        Creates a test workload for us to run on, hold the basic necessities.
        """
        workload = testWorkload("Tier1ReReco")
        rereco = workload.getTask("ReReco")

        taskMaker = TaskMaker(workload, os.path.join(self.testDir, "workloadTest"))
        taskMaker.skipSubscription = True
        taskMaker.processWorkload()

        workload.save(workloadName)

        return workload

    # NOTE(review): bl=[] / wl=[] are mutable default arguments - shared
    # between calls; should be None sentinels.
    def createJobGroups(self, nSubs, nJobs, task, workloadSpec, site=None, bl=[], wl=[]):
        """
        Creates a series of jobGroups for submissions

        Builds one workflow and, per subscription, a fileset/subscription/
        jobGroup populated via makeNJobs.
        """
        jobGroupList = []

        testWorkflow = Workflow(spec=workloadSpec, owner="mnorman", name=makeUUID(), task="basicWorkload/Production")
        testWorkflow.create()

        # Create subscriptions
        for i in range(nSubs):
            name = makeUUID()

            # Create Fileset, Subscription, jobGroup
            testFileset = Fileset(name=name)
            testFileset.create()
            testSubscription = Subscription(
                fileset=testFileset, workflow=testWorkflow, type="Processing", split_algo="FileBased"
            )
            testSubscription.create()

            testJobGroup = JobGroup(subscription=testSubscription)
            testJobGroup.create()

            # Create jobs
            self.makeNJobs(
                name=name,
                task=task,
                nJobs=nJobs,
                jobGroup=testJobGroup,
                fileset=testFileset,
                sub=testSubscription.exists(),
                site=site,
                bl=bl,
                wl=wl,
            )

            testFileset.commit()
            testJobGroup.commit()
            jobGroupList.append(testJobGroup)

        return jobGroupList

    # NOTE(review): bl=[] / wl=[] are mutable default arguments here too.
    def makeNJobs(self, name, task, nJobs, jobGroup, fileset, sub, site=None, bl=[], wl=[]):
        """
        _makeNJobs_

        Make and return a WMBS Job and File
        This handles all those damn add-ons
        """
        # Set the CacheDir
        cacheDir = os.path.join(self.testDir, "CacheDir")

        for n in range(nJobs):
            # First make a file
            # site = self.sites[0]
            testFile = File(lfn="/singleLfn/%s/%s" % (name, n), size=1024, events=10)
            if site:
                testFile.setLocation(site)
            else:
                for tmpSite in self.sites:
                    testFile.setLocation("se.%s" % (tmpSite))
            testFile.create()
            fileset.addFile(testFile)

        fileset.commit()

        index = 0
        for f in fileset.files:
            index += 1
            testJob = Job(name="%s-%i" % (name, index))
            testJob.addFile(f)
            testJob["location"] = f.getLocations()[0]
            testJob["custom"]["location"] = f.getLocations()[0]
            testJob["task"] = task.getPathName()
            testJob["sandbox"] = task.data.input.sandbox
            testJob["spec"] = os.path.join(self.testDir, "basicWorkload.pcl")
            testJob["mask"]["FirstEvent"] = 101
            testJob["owner"] = "mnorman"
            testJob["siteBlacklist"] = bl
            testJob["siteWhitelist"] = wl
            testJob["ownerDN"] = "mnorman"
            jobCache = os.path.join(cacheDir, "Sub_%i" % (sub), "Job_%i" % (index))
            os.makedirs(jobCache)
            testJob.create(jobGroup)
            testJob["cache_dir"] = jobCache
            testJob.save()
            jobGroup.add(testJob)
            # NOTE(review): text-mode "w" + pickle.dump fails on Python 3;
            # should be "wb" (the later, live copy of this class has the
            # same issue).
            output = open(os.path.join(jobCache, "job.pkl"), "w")
            pickle.dump(testJob, output)
            output.close()

        return testJob, testFile

    def createDummyJobs(self, nJobs, location=None):
        """
        _createDummyJobs_

        Create some dummy jobs
        """
        if not location:
            location = self.sites[0]

        nameStr = makeUUID()

        testWorkflow = Workflow(spec=nameStr, owner="mnorman", name=nameStr, task="basicWorkload/Production")
        testWorkflow.create()

        testFileset = Fileset(name=nameStr)
        testFileset.create()

        testSubscription = Subscription(
            fileset=testFileset, workflow=testWorkflow, type="Processing", split_algo="FileBased"
        )
        testSubscription.create()

        testJobGroup = JobGroup(subscription=testSubscription)
        testJobGroup.create()

        jobList = []
        for i in range(nJobs):
            testJob = Job(name="%s-%i" % (nameStr, i))
            testJob["location"] = location
            testJob["custom"]["location"] = location
            testJob["userdn"] = "moron"
            testJob["owner"] = "moron"
            testJob.create(testJobGroup)
            jobList.append(testJob)

        return jobList

    @attr("integration")
    def testA_APITest(self):
        """
        _APITest_

        This is a commissioning test that has very little to do with anything
        except loading the code.
        """
        # return
        myThread = threading.currentThread()
        config = self.getConfig()

        baAPI = BossAirAPI(config=config)

        # We should have loaded a plugin
        self.assertTrue("TestPlugin" in baAPI.plugins.keys())

        result = myThread.dbi.processData("SELECT name FROM bl_status")[0].fetchall()
        statusList = []
        for i in result:
            # NOTE(review): i.values()[0] is Python-2-only; dict views are
            # not indexable on Python 3 (needs list(i.values())[0]).
            statusList.append(i.values()[0])

        # We should have the plugin states in the database
        # NOTE(review): list.sort() returns None, so this compares
        # None == None and always passes - a vacuous assertion.
        self.assertEqual(statusList.sort(), ["New", "Dead", "Gone"].sort())

        # Create some jobs
        nJobs = 10

        jobDummies = self.createDummyJobs(nJobs=nJobs)

        baAPI.createNewJobs(wmbsJobs=jobDummies)

        runningJobs = baAPI._listRunJobs()
        self.assertEqual(len(runningJobs), nJobs)

        newJobs = baAPI._loadByStatus(status="New")
        self.assertEqual(len(newJobs), nJobs)
        deadJobs = baAPI._loadByStatus(status="Dead")
        self.assertEqual(len(deadJobs), 0)

        raisesException = False
        try:
            baAPI._loadByStatus(status="FalseStatus")
        except BossAirException:
            # It should raise an error if we try loading a
            # non-existant status
            raisesException = True
        self.assertTrue(raisesException)

        # Change the job status and update it
        for job in newJobs:
            job["status"] = "Dead"
        baAPI._updateJobs(jobs=newJobs)

        # Test whether we see the job status as updated
        newJobs = baAPI._loadByStatus(status="New")
        self.assertEqual(len(newJobs), 0)
        deadJobs = baAPI._loadByStatus(status="Dead")
        self.assertEqual(len(deadJobs), nJobs)

        # Can we load by BossAir ID?
        loadedJobs = baAPI._loadByID(jobs=deadJobs)
        self.assertEqual(len(loadedJobs), nJobs)

        # Can we load via WMBS?
        loadedJobs = baAPI.loadByWMBS(wmbsJobs=jobDummies)
        self.assertEqual(len(loadedJobs), nJobs)

        # See if we can delete jobs
        baAPI._deleteJobs(jobs=deadJobs)

        # Confirm that they're gone
        deadJobs = baAPI._loadByStatus(status="Dead")
        self.assertEqual(len(deadJobs), 0)

        self.assertEqual(len(baAPI.jobs), 0)

        return

    @attr("integration")
    def testB_PluginTest(self):
        """
        _PluginTest_

        Now check that these functions worked if called through plugins
        Instead of directly.
        There are only three plugin
        """
        # return
        myThread = threading.currentThread()

        config = self.getConfig()

        baAPI = BossAirAPI(config=config)

        # Create some jobs
        nJobs = 10

        jobDummies = self.createDummyJobs(nJobs=nJobs, location="Xanadu")
        changeState = ChangeState(config)
        changeState.propagate(jobDummies, "created", "new")
        changeState.propagate(jobDummies, "executing", "created")

        # Prior to building the job, each job must have a plugin
        # and user assigned
        for job in jobDummies:
            job["plugin"] = "TestPlugin"
            job["owner"] = "mnorman"

        baAPI.submit(jobs=jobDummies)

        newJobs = baAPI._loadByStatus(status="New")
        self.assertEqual(len(newJobs), nJobs)

        # Should be no more running jobs
        runningJobs = baAPI._listRunJobs()
        self.assertEqual(len(runningJobs), nJobs)

        # Test Plugin should complete all jobs
        baAPI.track()

        # Should be no more running jobs
        runningJobs = baAPI._listRunJobs()
        self.assertEqual(len(runningJobs), 0)

        # Check if they're complete
        completeJobs = baAPI.getComplete()
        self.assertEqual(len(completeJobs), nJobs)

        # Do this test because BossAir is specifically built
        # to keep it from finding completed jobs
        result = myThread.dbi.processData("SELECT id FROM bl_runjob")[0].fetchall()
        self.assertEqual(len(result), nJobs)

        baAPI.removeComplete(jobs=jobDummies)

        result = myThread.dbi.processData("SELECT id FROM bl_runjob")[0].fetchall()
        self.assertEqual(len(result), 0)

        return

    def testG_monitoringDAO(self):
        """
        _monitoringDAO_

        Because I need a test for the monitoring DAO
        """
        # Early return deliberately disables this test; the code below is
        # currently unreachable.
        return

        myThread = threading.currentThread()

        config = self.getConfig()

        changeState = ChangeState(config)

        baAPI = BossAirAPI(config=config)

        # Create some jobs
        nJobs = 10

        jobDummies = self.createDummyJobs(nJobs=nJobs)

        # Prior to building the job, each job must have a plugin
        # and user assigned
        for job in jobDummies:
            job["plugin"] = "TestPlugin"
            job["owner"] = "mnorman"
            job["location"] = "T2_US_UCSD"
            job.save()

        baAPI.submit(jobs=jobDummies)

        results = baAPI.monitor()

        self.assertEqual(len(results), nJobs)
        for job in results:
            self.assertEqual(job["plugin"], "CondorPlugin")

        return
class BossAirTest(unittest.TestCase):
    """
    Tests for the BossAir prototype

    This definition shadows the earlier class of the same name in this
    module; this is the copy that actually runs.
    """

    # Candidate execution sites used throughout the tests
    sites = ['T2_US_UCSD', 'T2_TW_Taiwan', 'T1_CH_CERN', 'T2_US_Florida']

    def setUp(self):
        """
        setup for test.

        Builds the WMBS/BossAir/ResourceControl schema, two Couch databases,
        resource-control entries for every test site plus three special
        plugin-specific sites, a dummy user and two heartbeat components.
        """
        myThread = threading.currentThread()

        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        # Clean leftovers from a previous run before building the schema
        self.tearDown()
        self.testInit.setSchema(customModules=["WMCore.WMBS", "WMCore.BossAir",
                                               "WMCore.ResourceControl", "WMCore.Agent.Database"],
                                useDefault=False)
        self.testInit.setupCouch("bossair_t/jobs", "JobDump")
        self.testInit.setupCouch("bossair_t/fwjrs", "FWJRDump")

        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)
        self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")

        # Create sites in resourceControl
        resourceControl = ResourceControl()
        for site in self.sites:
            resourceControl.insertSite(siteName=site, pnn='se.%s' % (site),
                                       cmsName=site, ceName=site,
                                       plugin="CondorPlugin",
                                       pendingSlots=1000, runningSlots=2000)
            resourceControl.insertThreshold(siteName=site, taskType='Processing',
                                            maxSlots=1000, pendingSlots=1000)

        # Special site exercising the TestPlugin.
        # NOTE(review): cmsName=site reuses the last value of the loop
        # variable above for this and the next two inserts - looks
        # accidental; confirm the intended cmsName.
        resourceControl.insertSite(siteName='Xanadu', pnn='se.Xanadu',
                                   cmsName=site, ceName='Xanadu',
                                   plugin="TestPlugin")
        resourceControl.insertThreshold(siteName='Xanadu', taskType='Processing',
                                        maxSlots=10000, pendingSlots=10000)

        # Special site exercising the ARCPlugin
        resourceControl.insertSite(siteName='jade-cms.hip.fi', pnn='madhatter.csc.fi',
                                   cmsName=site, ceName='jade-cms.hip.fi',
                                   plugin="ARCPlugin")
        resourceControl.insertThreshold(siteName='jade-cms.hip.fi', taskType='Processing',
                                        maxSlots=100, pendingSlots=100)

        # using this for glite submissions
        resourceControl.insertSite(siteName='grid-ce-01.ba.infn.it',
                                   pnn='storm-se-01.ba.infn.it',
                                   cmsName=site, ceName='grid-ce-01.ba.infn.it',
                                   plugin='gLitePlugin')
        resourceControl.insertThreshold(siteName='grid-ce-01.ba.infn.it', taskType='Processing',
                                        maxSlots=50, pendingSlots=50)

        # Create user
        newuser = self.daoFactory(classname="Users.New")
        newuser.execute(dn="tapas", group_name="phgroup", role_name="cmsrole")

        # We actually need the user name
        self.user = getpass.getuser()

        # Change this to the working dir to keep track of error and log files from condor
        self.testDir = self.testInit.generateWorkDir()

        # Set heartbeat
        componentName = 'test'
        self.heartbeatAPI = HeartbeatAPI(componentName)
        self.heartbeatAPI.registerComponent()
        componentName = 'JobTracker'
        self.heartbeatAPI2 = HeartbeatAPI(componentName)
        self.heartbeatAPI2.registerComponent()

        return

    def tearDown(self):
        """
        Database deletion

        The database itself is deliberately NOT cleared here (setUp calls
        tearDown first and then rebuilds the schema); only the work dir and
        the Couch databases are removed.
        """
        # self.testInit.clearDatabase(modules = ["WMCore.WMBS", "WMCore.BossAir", "WMCore.ResourceControl", "WMCore.Agent.Database"])
        self.testInit.delWorkDir()
        self.testInit.tearDownCouch()
        return

    def getConfig(self):
        """
        _getConfig_

        Build a basic BossAir config
        """
        config = self.testInit.getConfiguration()

        config.section_("Agent")
        config.Agent.agentName = 'testAgent'
        config.Agent.componentName = 'test'
        config.Agent.useHeartbeat = False

        config.section_("CoreDatabase")
        config.CoreDatabase.connectUrl = os.getenv("DATABASE")
        config.CoreDatabase.socket = os.getenv("DBSOCK")

        config.section_("BossAir")
        config.BossAir.pluginNames = ['TestPlugin', 'CondorPlugin']
        config.BossAir.pluginDir = 'WMCore.BossAir.Plugins'
        config.BossAir.UISetupScript = '/afs/cern.ch/cms/LCG/LCG-2/UI/cms_ui_env.sh'

        config.component_("JobSubmitter")
        config.JobSubmitter.logLevel = 'INFO'
        config.JobSubmitter.pollInterval = 1
        config.JobSubmitter.pluginName = 'AirPlugin'
        config.JobSubmitter.pluginDir = 'JobSubmitter.Plugins'
        config.JobSubmitter.submitDir = os.path.join(self.testDir, 'submit')
        config.JobSubmitter.submitNode = os.getenv("HOSTNAME", 'stevia.hep.wisc.edu')
        config.JobSubmitter.submitScript = os.path.join(WMCore.WMInit.getWMBASE(),
                                                        'test/python/WMComponent_t/JobSubmitter_t',
                                                        'submit.sh')
        config.JobSubmitter.componentDir = os.path.join(os.getcwd(), 'Components')
        config.JobSubmitter.workerThreads = 2
        config.JobSubmitter.jobsPerWorker = 200
        config.JobSubmitter.gLiteConf = os.path.join(os.getcwd(), 'config.cfg')

        # JobTracker
        config.component_("JobTracker")
        config.JobTracker.logLevel = 'INFO'
        config.JobTracker.pollInterval = 1

        # JobStateMachine
        config.component_('JobStateMachine')
        config.JobStateMachine.couchurl = os.getenv('COUCHURL')
        config.JobStateMachine.couchDBName = "bossair_t"

        # JobStatusLite
        config.component_('JobStatusLite')
        config.JobStatusLite.componentDir = os.path.join(os.getcwd(), 'Components')
        config.JobStatusLite.stateTimeouts = {'Pending': 10, 'Running': 86400}
        config.JobStatusLite.pollInterval = 1

        return config

    def createTestWorkload(self, workloadName='Test', emulator=True):
        """
        _createTestWorkload_

        Creates a test workload for us to run on, hold the basic necessities.
        """
        workload = testWorkload("Tier1ReReco")
        rereco = workload.getTask("ReReco")

        taskMaker = TaskMaker(workload, os.path.join(self.testDir, 'workloadTest'))
        taskMaker.skipSubscription = True
        taskMaker.processWorkload()

        workload.save(workloadName)

        return workload

    def createJobGroups(self, nSubs, nJobs, task, workloadSpec, site=None, bl=None, wl=None):
        """
        Creates a series of jobGroups for submissions

        Builds one workflow and, per subscription, a fileset/subscription/
        jobGroup populated via makeNJobs.

        bl/wl default to empty site black/white lists.  (Fixed: previously
        mutable default arguments shared across calls.)
        """
        bl = [] if bl is None else bl
        wl = [] if wl is None else wl

        jobGroupList = []

        testWorkflow = Workflow(spec=workloadSpec, owner="tapas",
                                name=makeUUID(), task="basicWorkload/Production",
                                owner_vogroup='phgroup', owner_vorole='cmsrole')
        testWorkflow.create()

        # Create subscriptions
        for i in range(nSubs):
            name = makeUUID()

            # Create Fileset, Subscription, jobGroup
            testFileset = Fileset(name=name)
            testFileset.create()
            testSubscription = Subscription(fileset=testFileset,
                                            workflow=testWorkflow,
                                            type="Processing",
                                            split_algo="FileBased")
            testSubscription.create()

            testJobGroup = JobGroup(subscription=testSubscription)
            testJobGroup.create()

            # Create jobs
            self.makeNJobs(name=name, task=task, nJobs=nJobs,
                           jobGroup=testJobGroup, fileset=testFileset,
                           sub=testSubscription.exists(),
                           site=site, bl=bl, wl=wl)

            testFileset.commit()
            testJobGroup.commit()
            jobGroupList.append(testJobGroup)

        return jobGroupList

    def makeNJobs(self, name, task, nJobs, jobGroup, fileset, sub, site=None, bl=None, wl=None):
        """
        _makeNJobs_

        Make and return a WMBS Job and File
        This handles all those damn add-ons

        (Fixed: bl/wl were mutable default arguments; job.pkl is now written
        in binary mode, which pickle requires on Python 3.)
        """
        bl = [] if bl is None else bl
        wl = [] if wl is None else wl

        # Set the CacheDir
        cacheDir = os.path.join(self.testDir, 'CacheDir')

        for n in range(nJobs):
            # First make a file
            # site = self.sites[0]
            testFile = File(lfn="/singleLfn/%s/%s" % (name, n),
                            size=1024, events=10)
            if site:
                testFile.setLocation(site)
            else:
                for tmpSite in self.sites:
                    testFile.setLocation('se.%s' % (tmpSite))
            testFile.create()
            fileset.addFile(testFile)

        fileset.commit()

        index = 0
        for f in fileset.files:
            index += 1
            testJob = Job(name='%s-%i' % (name, index))
            testJob.addFile(f)
            testJob["location"] = f.getLocations()[0]
            testJob['custom']['location'] = f.getLocations()[0]
            testJob['task'] = task.getPathName()
            testJob['sandbox'] = task.data.input.sandbox
            testJob['spec'] = os.path.join(self.testDir, 'basicWorkload.pcl')
            testJob['mask']['FirstEvent'] = 101
            testJob['owner'] = 'tapas'
            testJob["siteBlacklist"] = bl
            testJob["siteWhitelist"] = wl
            testJob['ownerDN'] = 'tapas'
            testJob['ownerRole'] = 'cmsrole'
            testJob['ownerGroup'] = 'phgroup'
            jobCache = os.path.join(cacheDir, 'Sub_%i' % (sub), 'Job_%i' % (index))
            os.makedirs(jobCache)
            testJob.create(jobGroup)
            testJob['cache_dir'] = jobCache
            testJob.save()
            jobGroup.add(testJob)
            # Binary mode + context manager (was text-mode open without close
            # on error, which breaks pickle under Python 3)
            with open(os.path.join(jobCache, 'job.pkl'), 'wb') as output:
                pickle.dump(testJob, output)

        return testJob, testFile

    def createDummyJobs(self, nJobs, location=None):
        """
        _createDummyJobs_

        Create some dummy jobs
        """
        if not location:
            location = self.sites[0]

        nameStr = makeUUID()

        testWorkflow = Workflow(spec=nameStr, owner="tapas",
                                name=nameStr, task="basicWorkload/Production",
                                owner_vogroup='phgroup', owner_vorole='cmsrole')
        testWorkflow.create()

        testFileset = Fileset(name=nameStr)
        testFileset.create()

        testSubscription = Subscription(fileset=testFileset,
                                        workflow=testWorkflow,
                                        type="Processing",
                                        split_algo="FileBased")
        testSubscription.create()

        testJobGroup = JobGroup(subscription=testSubscription)
        testJobGroup.create()

        jobList = []
        for i in range(nJobs):
            testJob = Job(name='%s-%i' % (nameStr, i))
            testJob['location'] = location
            testJob['custom']['location'] = location
            testJob['userdn'] = 'tapas'
            testJob['owner'] = 'tapas'
            testJob['userrole'] = 'cmsrole'
            testJob['usergroup'] = 'phgroup'
            testJob.create(testJobGroup)
            jobList.append(testJob)

        return jobList

    @attr('integration')
    def testA_APITest(self):
        """
        _APITest_

        This is a commissioning test that has very little to do with anything
        except loading the code.
        """
        # return
        myThread = threading.currentThread()
        config = self.getConfig()

        baAPI = BossAirAPI(config=config)

        # We should have loaded a plugin
        self.assertTrue('TestPlugin' in baAPI.plugins.keys())

        result = myThread.dbi.processData("SELECT name FROM bl_status")[0].fetchall()
        statusList = []
        for i in result:
            # list(...) needed: dict views are not indexable on Python 3
            statusList.append(list(i.values())[0])

        # We should have the plugin states in the database.
        # Fixed: previously compared the None returns of list.sort(), which
        # made the assertion vacuous; compare the sorted contents instead.
        self.assertEqual(sorted(statusList), sorted(['New', 'Dead', 'Gone']))

        # Create some jobs
        nJobs = 10

        jobDummies = self.createDummyJobs(nJobs=nJobs)
        print(jobDummies)

        baAPI.createNewJobs(wmbsJobs=jobDummies)

        runningJobs = baAPI._listRunJobs()
        self.assertEqual(len(runningJobs), nJobs)

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), nJobs)
        deadJobs = baAPI._loadByStatus(status='Dead')
        self.assertEqual(len(deadJobs), 0)

        # Loading a non-existent status must raise
        self.assertRaises(BossAirException,
                          baAPI._loadByStatus, status='FalseStatus')

        # Change the job status and update it
        for job in newJobs:
            job['status'] = 'Dead'
        baAPI._updateJobs(jobs=newJobs)

        # Test whether we see the job status as updated
        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), 0)
        deadJobs = baAPI._loadByStatus(status='Dead')
        self.assertEqual(len(deadJobs), nJobs)

        # Can we load by BossAir ID?
        loadedJobs = baAPI._loadByID(jobs=deadJobs)
        self.assertEqual(len(loadedJobs), nJobs)

        # Can we load via WMBS?
        loadedJobs = baAPI.loadByWMBS(wmbsJobs=jobDummies)
        self.assertEqual(len(loadedJobs), nJobs)

        # See if we can delete jobs
        baAPI._deleteJobs(jobs=deadJobs)

        # Confirm that they're gone
        deadJobs = baAPI._loadByStatus(status='Dead')
        self.assertEqual(len(deadJobs), 0)

        self.assertEqual(len(baAPI.jobs), 0)

        return

    @attr('integration')
    def testB_PluginTest(self):
        """
        _PluginTest_

        Now check that these functions worked if called through plugins
        Instead of directly.
        There are only three plugin
        """
        # return
        myThread = threading.currentThread()

        config = self.getConfig()

        baAPI = BossAirAPI(config=config)

        # Create some jobs
        nJobs = 10

        jobDummies = self.createDummyJobs(nJobs=nJobs, location='Xanadu')
        changeState = ChangeState(config)
        changeState.propagate(jobDummies, 'created', 'new')
        changeState.propagate(jobDummies, 'executing', 'created')

        # Prior to building the job, each job must have a plugin
        # and user assigned
        for job in jobDummies:
            job['plugin'] = 'TestPlugin'
            job['owner'] = 'tapas'

        baAPI.submit(jobs=jobDummies)

        newJobs = baAPI._loadByStatus(status='New')
        self.assertEqual(len(newJobs), nJobs)

        # Should be no more running jobs
        runningJobs = baAPI._listRunJobs()
        self.assertEqual(len(runningJobs), nJobs)

        # Test Plugin should complete all jobs
        baAPI.track()

        # Should be no more running jobs
        runningJobs = baAPI._listRunJobs()
        self.assertEqual(len(runningJobs), 0)

        # Check if they're complete
        completeJobs = baAPI.getComplete()
        self.assertEqual(len(completeJobs), nJobs)

        # Do this test because BossAir is specifically built
        # to keep it from finding completed jobs
        result = myThread.dbi.processData("SELECT id FROM bl_runjob")[0].fetchall()
        self.assertEqual(len(result), nJobs)

        baAPI.removeComplete(jobs=jobDummies)

        result = myThread.dbi.processData("SELECT id FROM bl_runjob")[0].fetchall()
        self.assertEqual(len(result), 0)

        return

    def testG_monitoringDAO(self):
        """
        _monitoringDAO_

        Because I need a test for the monitoring DAO
        """
        # Early return deliberately disables this test; the code below is
        # currently unreachable.
        return

        myThread = threading.currentThread()

        config = self.getConfig()

        changeState = ChangeState(config)

        baAPI = BossAirAPI(config=config)

        # Create some jobs
        nJobs = 10

        jobDummies = self.createDummyJobs(nJobs=nJobs)

        # Prior to building the job, each job must have a plugin
        # and user assigned
        for job in jobDummies:
            job['plugin'] = 'TestPlugin'
            job['owner'] = 'tapas'
            job['location'] = 'T2_US_UCSD'
            job.save()

        baAPI.submit(jobs=jobDummies)

        results = baAPI.monitor()

        self.assertEqual(len(results), nJobs)
        for job in results:
            self.assertEqual(job['plugin'], 'CondorPlugin')

        return
class JobCreatorTest(unittest.TestCase):
    """
    Test case for the JobCreator component.

    Exercises job splitting/creation through JobCreatorPoller against a real
    WMBS schema, a Couch instance for the JobStateMachine, and dummy sites
    registered in ResourceControl.
    """
    # Sites used both as WMBS locations and ResourceControl entries.
    sites = ['T2_US_Florida', 'T2_US_UCSD', 'T2_TW_Taiwan', 'T1_CH_CERN']

    def setUp(self):
        """
        _setUp_

        Setup the database and logging connection. Try to create all of the
        WMBS tables. Also, create some dummy locations and register them in
        ResourceControl with generous thresholds.
        """
        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        self.testInit.setSchema(customModules=['WMCore.WMBS',
                                               'WMCore.ResourceControl',
                                               'WMCore.Agent.Database'],
                                useDefault=False)
        self.couchdbname = "jobcreator_t"
        self.testInit.setupCouch("%s/jobs" % self.couchdbname, "JobDump")
        self.testInit.setupCouch("%s/fwjrs" % self.couchdbname, "FWJRDump")
        self.configFile = EmulatorSetup.setupWMAgentConfig()

        myThread = threading.currentThread()
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)

        locationAction = self.daoFactory(classname="Locations.New")
        for site in self.sites:
            locationAction.execute(siteName=site, pnn=site)

        # Create sites in resourceControl
        resourceControl = ResourceControl()
        for site in self.sites:
            resourceControl.insertSite(siteName=site, pnn=site, ceName=site)
            resourceControl.insertThreshold(siteName=site, taskType='Processing',
                                            maxSlots=10000, pendingSlots=10000)
        self.resourceControl = resourceControl

        self._setup = True
        self._teardown = False

        self.testDir = self.testInit.generateWorkDir()
        self.cwd = os.getcwd()

        # Set heartbeat
        self.componentName = 'JobCreator'
        self.heartbeatAPI = HeartbeatAPI(self.componentName)
        self.heartbeatAPI.registerComponent()
        return

    def tearDown(self):
        """
        _tearDown_

        Drop all the WMBS tables, remove the work dir, tear down Couch and
        delete the emulator config file.
        """
        self.testInit.clearDatabase(modules=['WMCore.WMBS',
                                             'WMCore.ResourceControl',
                                             'WMCore.Agent.Database'])
        self.testInit.delWorkDir()
        self._teardown = True
        self.testInit.tearDownCouch()
        EmulatorSetup.deleteConfig(self.configFile)
        return

    def createJobCollection(self, name, nSubs, nFiles, workflowURL='test'):
        """
        _createJobCollection_

        Create a collection of jobs: one workflow, nSubs filesets each with
        nFiles files located at a random site, each fileset subscribed with
        FileBased splitting.
        """
        myThread = threading.currentThread()

        testWorkflow = Workflow(spec=workflowURL, owner="mnorman",
                                name=name, task="/TestWorkload/ReReco")
        testWorkflow.create()

        for sub in range(nSubs):
            nameStr = '%s-%i' % (name, sub)

            myThread.transaction.begin()

            testFileset = Fileset(name=nameStr)
            testFileset.create()

            for f in range(nFiles):
                # pick a random site
                site = random.choice(self.sites)
                testFile = File(lfn="/lfn/%s/%i" % (nameStr, f),
                                size=1024, events=10)
                testFile.setLocation(site)
                testFile.create()
                testFileset.addFile(testFile)

            testFileset.commit()
            testSubscription = Subscription(fileset=testFileset,
                                            workflow=testWorkflow,
                                            type="Processing",
                                            split_algo="FileBased")
            testSubscription.create()

            myThread.transaction.commit()
        return

    def createWorkload(self, workloadName='Test', emulator=True, priority=1):
        """
        _createTestWorkload_

        Creates a test workload for us to run on, holds the basic necessities.
        Attaches a PresetSeeder generator and materializes the workload under
        self.testDir/workloadTest via TaskMaker.
        """
        workload = testWorkload("Tier1ReReco")
        rereco = workload.getTask("ReReco")
        seederDict = {"generator.initialSeed": 1001,
                      "evtgenproducer.initialSeed": 1001}
        rereco.addGenerator("PresetSeeder", **seederDict)

        taskMaker = TaskMaker(workload,
                              os.path.join(self.testDir, 'workloadTest'))
        taskMaker.skipSubscription = True
        taskMaker.processWorkload()

        return workload

    def getConfig(self):
        """
        _getConfig_

        Creates a common config for the JobCreator component tests.
        """
        config = self.testInit.getConfiguration()
        self.testInit.generateWorkDir(config)

        # First the general stuff
        config.section_("General")
        config.General.workDir = os.getenv("TESTDIR", os.getcwd())

        config.section_("Agent")
        config.Agent.componentName = self.componentName

        # Now the CoreDatabase information
        # This should be the dialect, dburl, etc
        config.section_("CoreDatabase")
        config.CoreDatabase.connectUrl = os.getenv("DATABASE")
        config.CoreDatabase.socket = os.getenv("DBSOCK")

        config.component_("JobCreator")
        config.JobCreator.namespace = 'WMComponent.JobCreator.JobCreator'
        # The log level of the component.
        # config.JobCreator.logLevel = 'SQLDEBUG'
        config.JobCreator.logLevel = 'INFO'
        # maximum number of threads we want to deal
        # with messages per pool.
        config.JobCreator.maxThreads = 1
        config.JobCreator.UpdateFromResourceControl = True
        config.JobCreator.pollInterval = 10
        # config.JobCreator.jobCacheDir = self.testDir
        # Type of jobs that we run, used for resource control
        config.JobCreator.defaultJobType = 'processing'
        config.JobCreator.workerThreads = 4
        config.JobCreator.componentDir = self.testDir
        config.JobCreator.useWorkQueue = True
        config.JobCreator.WorkQueueParams = {'emulateDBSReader': True}

        # We now call the JobMaker from here
        config.component_('JobMaker')
        config.JobMaker.logLevel = 'INFO'
        config.JobMaker.namespace = 'WMCore.WMSpec.Makers.JobMaker'
        config.JobMaker.maxThreads = 1
        config.JobMaker.makeJobsHandler = 'WMCore.WMSpec.Makers.Handlers.MakeJobs'

        # JobStateMachine
        config.component_('JobStateMachine')
        config.JobStateMachine.couchurl = os.getenv('COUCHURL',
                                                    'cmssrv52.fnal.gov:5984')
        config.JobStateMachine.couchDBName = self.couchdbname

        return config

    def testVerySimpleTest(self):
        """
        _VerySimpleTest_

        Just test that everything works...more or less: run the poller once,
        check job counts in WMBS, the job-cache directory layout, and the
        pickled job content (PresetSeeder baggage, workflow, sandbox name).
        """
        myThread = threading.currentThread()
        config = self.getConfig()
        name = makeUUID()
        nSubs = 5
        nFiles = 10
        workloadName = 'TestWorkload'

        dummyWorkload = self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest',
                                    'TestWorkload', 'WMSandbox',
                                    'WMWorkload.pkl')

        self.createJobCollection(name=name, nSubs=nSubs, nFiles=nFiles,
                                 workflowURL=workloadPath)

        testJobCreator = JobCreatorPoller(config=config)

        # First, can we run once without everything crashing?
        testJobCreator.algorithm()

        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), nSubs * nFiles)

        # Count database objects
        result = myThread.dbi.processData(
            'SELECT * FROM wmbs_sub_files_acquired')[0].fetchall()
        self.assertEqual(len(result), nSubs * nFiles)

        # Find the test directory
        testDirectory = os.path.join(self.testDir, 'jobCacheDir',
                                     'TestWorkload', 'ReReco')
        # It should have at least one jobGroup
        self.assertTrue('JobCollection_1_0' in os.listdir(testDirectory))
        # But no more then twenty
        self.assertTrue(len(os.listdir(testDirectory)) <= 20)

        groupDirectory = os.path.join(testDirectory, 'JobCollection_1_0')

        # First job should be in here
        listOfDirs = []
        for tmpDirectory in os.listdir(testDirectory):
            listOfDirs.extend(os.listdir(os.path.join(testDirectory,
                                                      tmpDirectory)))
        self.assertTrue('job_1' in listOfDirs)
        self.assertTrue('job_2' in listOfDirs)
        self.assertTrue('job_3' in listOfDirs)

        jobDir = os.listdir(groupDirectory)[0]
        jobFile = os.path.join(groupDirectory, jobDir, 'job.pkl')
        self.assertTrue(os.path.isfile(jobFile))
        # FIX: pickle files must be opened in binary mode ('rb'); text mode
        # fails outright on Python 3 and can corrupt data on Windows.
        # Also use a context manager so the handle is always closed.
        with open(jobFile, 'rb') as f:
            job = pickle.load(f)
        self.assertEqual(job.baggage.PresetSeeder.generator.initialSeed, 1001)
        self.assertEqual(job.baggage.PresetSeeder.evtgenproducer.initialSeed, 1001)

        self.assertEqual(job['workflow'], name)
        self.assertEqual(len(job['input_files']), 1)
        self.assertEqual(os.path.basename(job['sandbox']),
                         'TestWorkload-Sandbox.tar.bz2')
        return

    @attr('performance', 'integration')
    def testProfilePoller(self):
        """
        Profile your performance.

        You shouldn't be running this normally because it doesn't do anything
        beyond profiling JobCreatorPoller.algorithm with cProfile.
        """
        name = makeUUID()
        nSubs = 5
        nFiles = 1500
        workloadName = 'TestWorkload'

        dummyWorkload = self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest',
                                    'TestWorkload', 'WMSandbox',
                                    'WMWorkload.pkl')

        self.createJobCollection(name=name, nSubs=nSubs, nFiles=nFiles,
                                 workflowURL=workloadPath)

        config = self.getConfig()

        testJobCreator = JobCreatorPoller(config=config)
        cProfile.runctx("testJobCreator.algorithm()", globals(), locals(),
                        filename="testStats.stat")

        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='Created', jobType="Processing")

        time.sleep(10)

        self.assertEqual(len(result), nSubs * nFiles)

        p = pstats.Stats('testStats.stat')
        p.sort_stats('cumulative')
        p.print_stats(.2)

        return

    @attr('integration')
    def testProfileWorker(self):
        """
        Profile where the work actually gets done.

        You shouldn't be running this one either, since it doesn't test
        anything.
        """
        name = makeUUID()
        nSubs = 5
        nFiles = 500
        workloadName = 'TestWorkload'

        dummyWorkload = self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest',
                                    'TestWorkload', 'WMSandbox',
                                    'WMWorkload.pkl')

        self.createJobCollection(name=name, nSubs=nSubs, nFiles=nFiles,
                                 workflowURL=workloadPath)

        config = self.getConfig()

        # NOTE(review): config.JobCreator.jobCacheDir is commented out in
        # getConfig(), so this lookup likely fails — confirm before running.
        configDict = {"couchURL": config.JobStateMachine.couchurl,
                      "couchDBName": config.JobStateMachine.couchDBName,
                      'jobCacheDir': config.JobCreator.jobCacheDir,
                      'defaultJobType': config.JobCreator.defaultJobType}

        # NOTE(review): 'subs' appears intended as the 'input' referenced in
        # the profiled statement below, which is otherwise unbound.
        subs = [{"subscription": 1}, {"subscription": 2},
                {"subscription": 3}, {"subscription": 4},
                {"subscription": 5}]

        testJobCreator = JobCreatorPoller(**configDict)
        cProfile.runctx("testJobCreator.algorithm(parameters = input)",
                        globals(), locals(), filename="workStats.stat")

        p = pstats.Stats('workStats.stat')
        p.sort_stats('cumulative')
        p.print_stats(.2)

        return

    @attr('integration')
    def testHugeTest(self):
        """
        Don't run this one either.

        Large-scale smoke test: 10 subscriptions x 5000 files, timed.
        """
        myThread = threading.currentThread()

        config = self.getConfig()

        name = makeUUID()
        nSubs = 10
        nFiles = 5000
        workloadName = 'Tier1ReReco'

        dummyWorkload = self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest',
                                    'TestWorkload', 'WMSandbox',
                                    'WMWorkload.pkl')

        self.createJobCollection(name=name, nSubs=nSubs, nFiles=nFiles,
                                 workflowURL=workloadPath)

        testJobCreator = JobCreatorPoller(config=config)

        # First, can we run once without everything crashing?
        startTime = time.time()
        testJobCreator.algorithm()
        stopTime = time.time()

        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), nSubs * nFiles)

        print("Job took %f seconds to run" % (stopTime - startTime))

        # Count database objects
        result = myThread.dbi.processData(
            'SELECT * FROM wmbs_sub_files_acquired')[0].fetchall()
        self.assertEqual(len(result), nSubs * nFiles)

        return

    def stuffWMBS(self, workflowURL, name):
        """
        _stuffWMBS_

        Insert some dummy jobs, jobgroups, filesets, files and subscriptions
        into WMBS to test job creation. Three completed job groups each
        containing several files are injected. Another incomplete job group
        is also injected. Also files are added to the "Mergeable"
        subscription as well as to the output fileset for their jobgroups.
        """
        locationAction = self.daoFactory(classname="Locations.New")
        locationAction.execute(siteName="s1", pnn="somese.cern.ch")

        mergeFileset = Fileset(name="mergeFileset")
        mergeFileset.create()
        bogusFileset = Fileset(name="bogusFileset")
        bogusFileset.create()

        mergeWorkflow = Workflow(spec=workflowURL, owner="mnorman",
                                 name=name, task="/TestWorkload/ReReco")
        mergeWorkflow.create()

        mergeSubscription = Subscription(fileset=mergeFileset,
                                         workflow=mergeWorkflow,
                                         split_algo="ParentlessMergeBySize")
        mergeSubscription.create()
        # Deliberately never persisted (no .create()) — mirrors the original.
        dummySubscription = Subscription(fileset=bogusFileset,
                                         workflow=mergeWorkflow,
                                         split_algo="ParentlessMergeBySize")

        # (lfn, size, first_event, run, lumi) — all files carry 1024 events
        # and live at somese.cern.ch; fileIV is oversized to stay unmerged.
        fileSpecs = [("file1", 1024, 0, 1, 45),
                     ("file2", 1024, 1024, 1, 45),
                     ("file3", 1024, 2048, 1, 45),
                     ("file4", 1024, 3072, 1, 45),
                     ("fileA", 1024, 0, 1, 46),
                     ("fileB", 1024, 1024, 1, 46),
                     ("fileC", 1024, 2048, 1, 46),
                     ("fileI", 1024, 0, 2, 46),
                     ("fileII", 1024, 1024, 2, 46),
                     ("fileIII", 1024, 2048, 2, 46),
                     ("fileIV", 1024 * 1000000, 3072, 2, 46)]

        for lfn, size, firstEvent, run, lumi in fileSpecs:
            fileObj = File(lfn=lfn, size=size, events=1024,
                           first_event=firstEvent,
                           locations={"somese.cern.ch"})
            fileObj.addRun(Run(run, lumi))
            fileObj.create()
            mergeFileset.addFile(fileObj)
            bogusFileset.addFile(fileObj)

        mergeFileset.commit()
        bogusFileset.commit()

        return

    def testTestNonProxySplitting(self):
        """
        _TestNonProxySplitting_

        Test and see if we can split things without a proxy.
        """
        config = self.getConfig()
        config.JobCreator.workerThreads = 1

        name = makeUUID()
        workloadName = 'TestWorkload'

        dummyWorkload = self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest',
                                    'TestWorkload', 'WMSandbox',
                                    'WMWorkload.pkl')

        self.stuffWMBS(workflowURL=workloadPath, name=name)

        testJobCreator = JobCreatorPoller(config=config)

        testJobCreator.algorithm()

        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), 1)

        result = getJobsAction.execute(state='Created', jobType="Merge")
        self.assertEqual(len(result), 0)

        return
def setUp(self):
    """
    _setUp_

    Standard test fixture setup: build the WMBS/BossAir/ResourceControl/Agent
    schema, point the JobStateMachine at fresh Couch databases, register a
    set of test sites (each with its own submission plugin) in
    ResourceControl, create a test user, and register heartbeats for the
    components under test.
    """
    myThread = threading.currentThread()

    self.testInit = TestInit(__file__)
    self.testInit.setLogging()
    self.testInit.setDatabaseConnection()
    #self.tearDown()
    self.testInit.setSchema(customModules = ["WMCore.WMBS", "WMCore.BossAir",
                                             "WMCore.ResourceControl",
                                             "WMCore.Agent.Database"],
                            useDefault = False)
    self.testInit.setupCouch("bossair_t/jobs", "JobDump")
    self.testInit.setupCouch("bossair_t/fwjrs", "FWJRDump")

    self.daoFactory = DAOFactory(package = "WMCore.WMBS",
                                 logger = myThread.logger,
                                 dbinterface = myThread.dbi)
    self.getJobs = self.daoFactory(classname = "Jobs.GetAllJobs")

    # NOTE(review): these two DAO handles are created but never executed
    # in this fixture — possibly leftovers from an older setup.
    locationAction = self.daoFactory(classname = "Locations.New")
    locationSlots = self.daoFactory(classname = "Locations.SetJobSlots")

    #Create sites in resourceControl
    resourceControl = ResourceControl()
    for site in self.sites:
        # Each generic test site submits through the Condor plugin.
        resourceControl.insertSite(siteName = site, seName = 'se.%s' % (site),
                                   ceName = site, plugin = "CondorPlugin",
                                   jobSlots = 1000)
        resourceControl.insertThreshold(siteName = site,
                                        taskType = 'Processing',
                                        maxSlots = 1000)

    # Xanadu uses the in-process TestPlugin (no external batch system).
    resourceControl.insertSite(siteName = 'Xanadu', seName = 'se.Xanadu',
                               ceName = 'Xanadu', plugin = "TestPlugin")
    resourceControl.insertThreshold(siteName = 'Xanadu',
                                    taskType = 'Processing',
                                    maxSlots = 10000)

    # ARC-submission test site.
    resourceControl.insertSite(siteName = 'jade-cms.hip.fi',
                               seName = 'madhatter.csc.fi',
                               ceName = 'jade-cms.hip.fi',
                               plugin = "ARCPlugin")
    resourceControl.insertThreshold(siteName = 'jade-cms.hip.fi',
                                    taskType = 'Processing',
                                    maxSlots = 100)

    # using this for glite submissions
    resourceControl.insertSite(siteName = 'grid-ce-01.ba.infn.it',
                               seName = 'storm-se-01.ba.infn.it',
                               ceName = 'grid-ce-01.ba.infn.it',
                               plugin = 'gLitePlugin')
    resourceControl.insertThreshold(siteName = 'grid-ce-01.ba.infn.it',
                                    taskType = 'Processing',
                                    maxSlots = 50)

    # Create user
    newuser = self.daoFactory(classname = "Users.New")
    newuser.execute(dn = "mnorman", group_name = "phgroup",
                    role_name = "cmsrole")

    # We actually need the user name
    self.user = getpass.getuser()

    self.testDir = self.testInit.generateWorkDir()

    # Set heartbeat
    componentName = 'test'
    self.heartbeatAPI = HeartbeatAPI(componentName)
    self.heartbeatAPI.registerComponent()
    componentName = 'JobTracker'
    self.heartbeatAPI2 = HeartbeatAPI(componentName)
    self.heartbeatAPI2.registerComponent()

    return
class Harness(object):
    """
    Harness class that wraps standard functionality used in all daemon
    components: config validation, logging bootstrap, thread-local database
    wiring, worker-thread management, heartbeat registration and
    start/stop/daemonize plumbing.
    """

    def __init__(self, config, compName=None):
        """
        init

        The constructor is empty as we have an initalization method that can
        be called inside new threads (we use thread local attributes at
        startup).

        Default intialization of the harness including setting some
        diagnostic messages. Validates that the component is declared in the
        config and ensures a componentDir exists.
        """
        self.config = config

        # component name is always the class name of child class
        if not compName:
            compName = self.__class__.__name__
        if compName not in (self.config.listComponents_() +
                            self.config.listWebapps_()):
            raise WMException(WMEXCEPTION['WMCORE-8'] + compName, 'WMCORE-8')
        if not hasattr(self.config, "Agent"):
            self.config.section_("Agent")
        self.config.Agent.componentName = compName
        compSect = getattr(self.config, compName, None)
        if compSect is None:
            # Then we have a major problem - there's no section with this name
            logging.error("Could not find section %s in config", compName)
            logging.error(
                "We are returning, and hoping you know what you're doing!")
            logging.debug("Config: %s", self.config)
            return
        # check if componentDir is set if not assign.
        if getattr(compSect, 'componentDir', None) is None:
            if not hasattr(self.config, "General"):
                # Don't do anything. Assume the user knows what they are doing.
                logging.error(
                    "Missing componentDir and General section in config")
                logging.error("Going to trust you to know what you're doing.")
                return
            compSect.componentDir = os.path.join(
                self.config.General.workDir,
                'Components',
                self.config.Agent.componentName)
        # we have name and location of the log files. Now make sure there
        # is a directory.
        try:
            if not os.path.isdir(compSect.componentDir):
                os.makedirs(compSect.componentDir)
        except Exception as ex:
            # Best-effort: directory creation failures are logged and ignored
            # here; later logging setup will fail loudly if the dir is unusable.
            logging.error(
                "Encountered exception while making componentDirs: %s",
                str(ex))
            logging.error("Ignoring")

        self.threadManagerName = ''
        self.heartbeatAPI = None
        self.messages = {}
        self.logMsg = {}

        return

    def initInThread(self):
        """
        Default intialization of the harness including setting some
        diagnostic messages. This method is called when we call
        'prepareToStart'.

        Sets up the rotating component log, the thread-local database
        factory/transaction, and the worker thread manager. Must run in the
        thread that will own these thread-local attributes.
        """
        try:
            self.messages = {}

            compName = self.config.Agent.componentName
            compSect = getattr(self.config, compName, None)
            if not hasattr(compSect, "logFile"):
                if not getattr(compSect, 'componentDir', None):
                    errorMessage = "No componentDir for log entries found!\n"
                    errorMessage += "Harness cannot run without componentDir.\n"
                    logging.error(errorMessage)
                    raise HarnessException(errorMessage)
                compSect.logFile = os.path.join(compSect.componentDir,
                                                "ComponentLog")
            print('Log file is: ' + compSect.logFile)

            # ~1 GB per log file, keeping 3 rotated backups.
            logHandler = RotatingFileHandler(compSect.logFile,
                                             "a", 1000000000, 3)
            logMsgFormat = getattr(
                compSect, "logMsgFormat",
                "%(asctime)s:%(thread)d:%(levelname)s:%(module)s:%(message)s")
            logFormatter = \
                logging.Formatter(logMsgFormat)
            logHandler.setFormatter(logFormatter)
            logLevelName = getattr(compSect, 'logLevel', 'INFO')
            logLevel = getattr(logging, logLevelName)
            logging.getLogger().addHandler(logHandler)
            logging.getLogger().setLevel(logLevel)
            # NOTE: logging.SQLDEBUG is a WMCore-added custom level
            # (registered by WMCore.WMLogging), not a stdlib one.
            self.logMsg = {'DEBUG': logging.DEBUG,
                           'ERROR': logging.ERROR,
                           'NOTSET': logging.NOTSET,
                           'CRITICAL': logging.CRITICAL,
                           'WARNING': logging.WARNING,
                           'INFO': logging.INFO,
                           'SQLDEBUG': logging.SQLDEBUG}
            if hasattr(compSect, "logLevel") and \
                    compSect.logLevel in self.logMsg:
                logging.getLogger().setLevel(self.logMsg[compSect.logLevel])

            WMLogging.sqldebug("wmcore level debug:")

            # If not previously set, force wmcore cache to current path
            if not os.environ.get('WMCORE_CACHE_DIR'):
                os.environ['WMCORE_CACHE_DIR'] = os.path.join(
                    compSect.componentDir, '.wmcore_cache')

            logging.info(">>>Starting: " + compName + '<<<')
            # check which backend to use: MySQL, Oracle, etc... for core
            # services.
            # we recognize there can be more than one database.
            # be we offer a default database that is used for core services.
            logging.info(">>>Initializing default database")
            logging.info(">>>Check if connection is through socket")
            myThread = threading.currentThread()
            myThread.logger = logging.getLogger()
            logging.info(">>>Setting config for thread: ")
            myThread.config = self.config

            logging.info(">>>Building database connection string")
            # check if there is a premade string if not build it yourself.
            dbConfig = ConfigDBMap(self.config)
            dbStr = dbConfig.getDBUrl()
            options = dbConfig.getOption()
            # we only want one DBFactory per database so we will need to
            # to pass this on in case we are using threads.
            myThread.dbFactory = DBFactory(myThread.logger, dbStr, options)
            myThread.sql_transaction = True
            if myThread.dbFactory.engine:
                myThread.dbi = myThread.dbFactory.connect()
                myThread.transaction = Transaction(myThread.dbi)
            else:
                # No engine: stash the raw URL and disable SQL transactions.
                myThread.dbi = myThread.config.CoreDatabase.connectUrl
                myThread.sql_transaction = False

            # Attach a worker manager object to the main thread
            if not hasattr(myThread, 'workerThreadManager'):
                myThread.workerThreadManager = WorkerThreadManager(self)
            else:
                myThread.workerThreadManager.terminateSlaves.clear()
                myThread.workerThreadManager.pauseWorkers()

            logging.info(">>>Initialize transaction dictionary")

            (connectDialect, dummy) = dbStr.split(":", 1)
            if connectDialect.lower() == 'mysql':
                myThread.dialect = 'MySQL'
            elif connectDialect.lower() == 'oracle':
                myThread.dialect = 'Oracle'

            logging.info("Harness part constructor finished")
        except Exception as ex:
            logging.critical("Problem instantiating " + str(ex))
            logging.error("Traceback: %s", str(traceback.format_exc()))
            raise

    def preInitialization(self):
        """
        _preInitialization_

        returns: nothing

        method that can be overloaded and will be called before the start
        component is called. (enables you to set message->handler mappings).
        You use the self.message dictionary of the base class to define the
        mappings.
        """
        pass

    def postInitialization(self):
        """
        _postInitialization_

        returns: nothing

        method that can be overloaded and will be called after the start
        component does the standard initialization, but before the wait
        (enables you to publish events when starting up)

        Define actions you want to execute before the actual message
        handling starts. E.g.: publishing some messages, or removing
        messages.
        """
        pass

    def logState(self):
        """
        _logState_

        returns: string

        method that can be overloaded to log additional state information
        (should return a string)
        """
        msg = 'No additional state information for ' + \
              self.config.Agent.componentName
        return msg

    def publishItem(self, items):
        """
        _publishItem_

        returns: nothing

        A method that publishes a (dictionary) set or 1 item to a monitoring
        service.
        """
        # FIXME: do we need this method. If so we need to agree
        # FIXME: on some default monitoring publication mechanism.
        pass

    def __call__(self, event, payload):
        """
        Once upon a time this was for doing the handling of diagnostic
        messages.

        With the test-deprecating of the MsgService based diagnostics, we've
        basically scratched this. I'm leaving this in so at least the
        framework is still there.

        -mnorman
        """
        return

    def initialization(self):
        """
        _initialization__

        Used the handle initializing the MsgService. The MsgService is no
        longer used.

        Removed but not deleted, since all sorts of things call it.
        """
        return

    def prepareToStart(self):
        """
        _prepareToStart_

        returns: Nothing

        Starts the initialization procedure. It is mainly an aggregation
        method so it can easily used in tests.
        """
        self.state = 'initialize'
        self.initInThread()
        # note: every component gets a (unique) name:
        # self.config.Agent.componentName
        logging.info(">>>Registering Component - %s",
                     self.config.Agent.componentName)
        if getattr(self.config.Agent, "useHeartbeat", True):
            self.heartbeatAPI = HeartbeatAPI(self.config.Agent.componentName)
            self.heartbeatAPI.registerComponent()

        logging.info('>>>Starting initialization')

        logging.info('>>>Setting default transaction')
        myThread = threading.currentThread()

        self.preInitialization()

        # pre/post initialization run inside one default transaction when
        # SQL transactions are available.
        if myThread.sql_transaction:
            myThread.transaction.begin()

        self.initialization()
        self.postInitialization()

        if myThread.sql_transaction:
            myThread.transaction.commit()

        logging.info('>>>Committing default transaction')

        logging.info(">>>Starting worker threads")
        myThread.workerThreadManager.resumeWorkers()

        logging.info(">>>Initialization finished!\n")
        # wait for messages
        self.state = 'active'

    def prepareToStop(self, wait=False, stopPayload=""):
        """
        _stopComponent

        Stops the component, including all worker threads. Allows call from
        test framework.

        :param wait: when True, block until the active thread count drops to
            the requested level.
        :param stopPayload: optional string holding the number of threads
            that may remain active while waiting.
        """
        # Stop all worker threads
        logging.info(">>>Terminating worker threads")
        myThread = threading.currentThread()
        try:
            myThread.workerThreadManager.terminateWorkers()
        except Exception:
            # We may not have a thread manager
            pass

        if wait:
            logging.info(
                ">>>Shut down of component while waiting for threads to finish"
            )
            # check if nr of threads is specified.
            activeThreads = 1
            if stopPayload != "":
                activeThreads = int(stopPayload)
                if activeThreads < 1:
                    activeThreads = 1
            while threading.activeCount() > activeThreads:
                logging.info('>>>Currently ' +
                             str(threading.activeCount()) +
                             ' threads active')
                logging.info('>>>Waiting for less than ' +
                             str(activeThreads) + ' to be active')
                time.sleep(5)

    def handleMessage(self, type='', payload=''):
        """
        __handleMessage_

        Formerly used to handle messages - now non-functional. Left here in
        case someone else is using it (i.e. PilotManager).
        """
        return

    def startDaemon(self, keepParent=False, compName=None):
        """
        Same result as start component, except that the comopnent is started
        as a daemon, after which you can close your xterm and the process
        will still run.

        The keepParent option enables us to keep the parent process which is
        used during testing.
        """
        msg = "Starting %s as a daemon " % self.config.Agent.componentName
        print(msg)
        if not compName:
            compName = self.__class__.__name__
        compSect = getattr(self.config, compName, None)
        msg = "Log will be in %s " % compSect.componentDir
        print(msg)
        # put the daemon config file in the work dir of this component.
        # FIXME: this file will be replaced by a database table.
        compSect = getattr(self.config, self.config.Agent.componentName, None)
        pid = createDaemon(compSect.componentDir, keepParent)
        # if this is not the parent start the component
        if pid == 0:
            self.startComponent()
        # if this is the parent return control to the testing environment.

    def startComponent(self):
        """
        _startComponent_

        returns: Nothing

        Start up the component, performs initialization and waits
        indefinitely. Calling this method results in the application running
        in the xterm (not in daemon mode).
        """
        myThread = threading.currentThread()
        try:
            msg = 'None'
            self.prepareToStart()
            # Sleep-loop forever; work happens in the worker threads.
            while True:
                time.sleep(360)
        except Exception as ex:
            if self.state == 'initialize':
                errormsg = """PostMortem: choked when initializing with error: %s\n""" % (
                    str(ex))
                stackTrace = traceback.format_tb(sys.exc_info()[2], None)
                for stackFrame in stackTrace:
                    errormsg += stackFrame
            else:
                errormsg = ""
                stackTrace = traceback.format_tb(sys.exc_info()[2], None)
                for stackFrame in stackTrace:
                    errormsg += stackFrame
                logging.error(errormsg)
                logging.error(
                    ">>>Fatal Error, Preparing to Rollback Transaction")
                if getattr(myThread, 'transaction', None) is not None:
                    myThread.transaction.rollback()
                self.prepareToStop(False)
                errormsg = """
PostMortem: choked while handling messages with error: %s
while trying to handle msg: %s
""" % (str(ex), str(msg))
            print(errormsg)
            logging.critical(errormsg)
            raise
        logging.info("System shutdown complete!")
        # this is to ensure exiting when in daemon mode.
        sys.exit()

    def __str__(self):
        """
        return: string

        String representation of the status of this component.
        """
        msg = 'Status of this component : \n'
        msg += '\n'
        msg += '>>Event Subscriptions --> Handlers<<\n'
        msg += '------------------------------------\n'
        for message in self.messages:
            msg += message + '-->' + str(self.messages[message]) + '\n'
        msg += '\n'
        msg += '\n'
        msg += '>>Parameters --> Values<<\n'
        msg += '-------------------------\n'
        msg += str(self.config)
        additionalMsg = self.logState()
        if additionalMsg != '':
            msg += '\n'
            msg += 'Additional state information\n'
            msg += '----------------------------\n'
            msg += '\n'
            msg += str(additionalMsg)
            msg += '\n'
        return msg
def __init__(self, slaveClassName, totalSlaves, componentDir,
             config, namespace = 'WMComponent', inPort = '5555',
             outPort = '5558'):
    """
    __init__

    Constructor for the process pool. The slave class name must be based
    inside the WMComponent namespace. For examples, the JobAccountant would
    pass in 'JobAccountant.AccountantWorker' to run the AccountantWorker
    class. All log files will be stored in the component directory that is
    passed in. Each slave will have its own log file.

    Note that the config is only used to determine database connection
    parameters. It is not passed to the slave class. The slaveInit
    parameter will be serialized and passed to the slave class's
    constructor.
    """
    self.enqueueIndex = 0
    self.dequeueIndex = 0
    self.runningWork = 0

    # Use the Services.Requests JSONizer, which handles __to_json__ calls
    self.jsonHandler = JSONRequests()

    # heartbeat should be registered at this point
    if getattr(config.Agent, "useHeartbeat", True):
        self.heartbeatAPI = HeartbeatAPI(getattr(config.Agent,
                                                 "componentName",
                                                 "ProcPoolSlave"))

    self.slaveClassName = slaveClassName
    self.componentDir = componentDir
    self.config = config

    # Grab the python version from the current version
    # Assume naming convention pythonA.B, i.e., python2.4 for v2.4.X
    majorVersion = sys.version_info[0]
    minorVersion = sys.version_info[1]
    if majorVersion and minorVersion:
        self.versionString = "python%i.%i" % (majorVersion, minorVersion)
    else:
        self.versionString = "python2.6"

    self.workers = []
    self.nSlaves = totalSlaves
    self.namespace = namespace
    self.inPort = inPort
    self.outPort = outPort

    # Pickle the config
    self.configPath = os.path.join(componentDir,
                                   '%s_config.pkl' % slaveClassName)
    if os.path.exists(self.configPath):
        # Then we note it and overwrite it
        msg = "Something's in the way of the ProcessPool config: %s" % self.configPath
        logging.error(msg)
    # FIX: pickle data is binary — open with 'wb' (text mode breaks on
    # Python 3 / Windows) and use a context manager to guarantee the
    # handle is closed even if dump() raises.
    with open(self.configPath, 'wb') as f:
        cPickle.dump(config, f)

    # Set up ZMQ
    try:
        context = zmq.Context()
        self.sender = context.socket(zmq.PUSH)
        self.sender.bind("tcp://*:%s" % inPort)
        self.sink = context.socket(zmq.PULL)
        self.sink.bind("tcp://*:%s" % outPort)
    except zmq.ZMQError:
        # Try this again in a moment to see
        # if it's just being held by something pre-existing
        import time
        time.sleep(1)
        logging.error("Blocked socket on startup: Attempting sleep to give it time to clear.")
        try:
            context = zmq.Context()
            self.sender = context.socket(zmq.PUSH)
            self.sender.bind("tcp://*:%s" % inPort)
            self.sink = context.socket(zmq.PULL)
            self.sink.bind("tcp://*:%s" % outPort)
        except Exception as ex:
            msg = "Error attempting to open TCP sockets\n"
            msg += str(ex)
            logging.error(msg)
            import traceback
            # FIX: was the Python-2-only statement `print traceback.format_exc()`
            # (a syntax error on Python 3); use the print function, consistent
            # with the rest of the file.
            print(traceback.format_exc())
            raise ProcessPoolException(msg)

    # Now actually create the slaves
    self.createSlaves()

    return
class BossAirTest(unittest.TestCase):
    """
    Tests for the BossAir prototype

    Exercises BossAirAPI job bookkeeping (create/load/update/delete), both
    directly and through the plugin layer, plus the monitoring DAO.
    """

    sites = ['T2_US_UCSD', 'T2_TW_Taiwan', 'T1_CH_CERN', 'T2_US_Florida']

    def setUp(self):
        """
        setup for test.

        Builds the WMBS/BossAir/ResourceControl/Agent schema and couch
        databases, registers resource-control entries for each test site
        (plus three special sites bound to the other plugins), creates a
        test user and two heartbeat components.
        """
        myThread = threading.currentThread()

        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        # Clear leftovers from any previous run before building the schema
        self.tearDown()
        self.testInit.setSchema(customModules = ["WMCore.WMBS", "WMCore.BossAir",
                                                 "WMCore.ResourceControl",
                                                 "WMCore.Agent.Database"],
                                useDefault = False)
        self.testInit.setupCouch("bossair_t/jobs", "JobDump")
        self.testInit.setupCouch("bossair_t/fwjrs", "FWJRDump")

        self.daoFactory = DAOFactory(package = "WMCore.WMBS",
                                     logger = myThread.logger,
                                     dbinterface = myThread.dbi)
        self.getJobs = self.daoFactory(classname = "Jobs.GetAllJobs")

        # Create sites in resourceControl
        resourceControl = ResourceControl()
        for site in self.sites:
            resourceControl.insertSite(siteName = site, seName = 'se.%s' % (site),
                                       cmsName = site, ceName = site,
                                       plugin = "CondorPlugin",
                                       pendingSlots = 1000, runningSlots = 2000)
            resourceControl.insertThreshold(siteName = site, taskType = 'Processing',
                                            maxSlots = 1000, pendingSlots = 1000)

        # Special-purpose sites bound to the other plugins under test.
        # NOTE(review): cmsName reuses the loop leftover `site` here, as in
        # the original — presumably unintentional, but preserved; confirm.
        resourceControl.insertSite(siteName = 'Xanadu', seName = 'se.Xanadu',
                                   cmsName = site, ceName = 'Xanadu',
                                   plugin = "TestPlugin")
        resourceControl.insertThreshold(siteName = 'Xanadu', taskType = 'Processing',
                                        maxSlots = 10000, pendingSlots = 10000)

        resourceControl.insertSite(siteName = 'jade-cms.hip.fi',
                                   seName = 'madhatter.csc.fi',
                                   cmsName = site, ceName = 'jade-cms.hip.fi',
                                   plugin = "ARCPlugin")
        resourceControl.insertThreshold(siteName = 'jade-cms.hip.fi',
                                        taskType = 'Processing',
                                        maxSlots = 100, pendingSlots = 100)

        # using this for glite submissions
        resourceControl.insertSite(siteName = 'grid-ce-01.ba.infn.it',
                                   seName = 'storm-se-01.ba.infn.it',
                                   cmsName = site,
                                   ceName = 'grid-ce-01.ba.infn.it',
                                   plugin = 'gLitePlugin')
        resourceControl.insertThreshold(siteName = 'grid-ce-01.ba.infn.it',
                                        taskType = 'Processing',
                                        maxSlots = 50, pendingSlots = 50)

        # Create user
        newuser = self.daoFactory(classname = "Users.New")
        newuser.execute(dn = "tapas", group_name = "phgroup", role_name = "cmsrole")

        # We actually need the user name
        self.user = getpass.getuser()

        # Change this to the working dir to keep track of error and log files from condor.
        # FIX: the original discarded the return value, leaving self.testDir
        # unset even though getConfig() reads it.
        self.testDir = self.testInit.generateWorkDir()

        # Set heartbeat
        componentName = 'test'
        self.heartbeatAPI = HeartbeatAPI(componentName)
        self.heartbeatAPI.registerComponent()
        componentName = 'JobTracker'
        self.heartbeatAPI2 = HeartbeatAPI(componentName)
        self.heartbeatAPI2.registerComponent()

        return

    def tearDown(self):
        """
        Database deletion
        """
        #self.testInit.clearDatabase(modules = ["WMCore.WMBS", "WMCore.BossAir", "WMCore.ResourceControl", "WMCore.Agent.Database"])
        self.testInit.delWorkDir()
        self.testInit.tearDownCouch()
        return

    def getConfig(self):
        """
        _getConfig_

        Build a basic BossAir config
        """
        config = self.testInit.getConfiguration()

        config.section_("Agent")
        config.Agent.agentName = 'testAgent'
        config.Agent.componentName = 'test'
        config.Agent.useHeartbeat = False

        config.section_("CoreDatabase")
        config.CoreDatabase.connectUrl = os.getenv("DATABASE")
        config.CoreDatabase.socket = os.getenv("DBSOCK")

        config.section_("BossAir")
        config.BossAir.pluginNames = ['TestPlugin', 'CondorPlugin']
        config.BossAir.pluginDir = 'WMCore.BossAir.Plugins'
        config.BossAir.UISetupScript = '/afs/cern.ch/cms/LCG/LCG-2/UI/cms_ui_env.sh'

        config.component_("JobSubmitter")
        config.JobSubmitter.logLevel = 'INFO'
        config.JobSubmitter.pollInterval = 1
        config.JobSubmitter.pluginName = 'AirPlugin'
        config.JobSubmitter.pluginDir = 'JobSubmitter.Plugins'
        # Requires self.testDir, set in setUp()
        config.JobSubmitter.submitDir = os.path.join(self.testDir, 'submit')
        config.JobSubmitter.submitNode = os.getenv("HOSTNAME", 'stevia.hep.wisc.edu')
        config.JobSubmitter.submitScript = os.path.join(WMCore.WMInit.getWMBASE(),
                                                        'test/python/WMComponent_t/JobSubmitter_t',
                                                        'submit.sh')
        config.JobSubmitter.componentDir = os.path.join(os.getcwd(), 'Components')
        config.JobSubmitter.workerThreads = 2
        config.JobSubmitter.jobsPerWorker = 200
        config.JobSubmitter.gLiteConf = os.path.join(os.getcwd(), 'config.cfg')

        # JobTracker
        config.component_("JobTracker")
        config.JobTracker.logLevel = 'INFO'
        config.JobTracker.pollInterval = 1

        # JobStateMachine
        config.component_('JobStateMachine')
        config.JobStateMachine.couchurl = os.getenv('COUCHURL')
        config.JobStateMachine.couchDBName = "bossair_t"

        # JobStatusLite
        config.component_('JobStatusLite')
        config.JobStatusLite.componentDir = os.path.join(os.getcwd(), 'Components')
        config.JobStatusLite.stateTimeouts = {'Pending': 10, 'Running': 86400}
        config.JobStatusLite.pollInterval = 1

        return config

    def createTestWorkload(self, workloadName = 'Test', emulator = True):
        """
        _createTestWorkload_

        Creates a test workload for us to run on, hold the basic necessities.
        """
        workload = testWorkload("Tier1ReReco")

        taskMaker = TaskMaker(workload, os.path.join(self.testDir, 'workloadTest'))
        taskMaker.skipSubscription = True
        taskMaker.processWorkload()

        workload.save(workloadName)

        return workload

    def createJobGroups(self, nSubs, nJobs, task, workloadSpec, site = None,
                        bl = None, wl = None):
        """
        Creates a series of jobGroups for submissions

        FIX: bl/wl previously used mutable list defaults, shared between
        calls; None sentinels keep the call signature compatible.
        """
        bl = [] if bl is None else bl
        wl = [] if wl is None else wl

        jobGroupList = []

        testWorkflow = Workflow(spec = workloadSpec, owner = "tapas",
                                name = makeUUID(), task = "basicWorkload/Production",
                                owner_vogroup = 'phgroup', owner_vorole = 'cmsrole')
        testWorkflow.create()

        # Create subscriptions
        for i in range(nSubs):
            name = makeUUID()

            # Create Fileset, Subscription, jobGroup
            testFileset = Fileset(name = name)
            testFileset.create()
            testSubscription = Subscription(fileset = testFileset,
                                            workflow = testWorkflow,
                                            type = "Processing",
                                            split_algo = "FileBased")
            testSubscription.create()

            testJobGroup = JobGroup(subscription = testSubscription)
            testJobGroup.create()

            # Create jobs
            self.makeNJobs(name = name, task = task, nJobs = nJobs,
                           jobGroup = testJobGroup, fileset = testFileset,
                           sub = testSubscription.exists(),
                           site = site, bl = bl, wl = wl)

            testFileset.commit()
            testJobGroup.commit()
            jobGroupList.append(testJobGroup)

        return jobGroupList

    def makeNJobs(self, name, task, nJobs, jobGroup, fileset, sub,
                  site = None, bl = None, wl = None):
        """
        _makeNJobs_

        Make and return a WMBS Job and File
        This handles all those damn add-ons

        FIX: bl/wl previously used mutable list defaults.
        """
        bl = [] if bl is None else bl
        wl = [] if wl is None else wl

        # Set the CacheDir
        cacheDir = os.path.join(self.testDir, 'CacheDir')

        for n in range(nJobs):
            # First make a file
            #site = self.sites[0]
            testFile = File(lfn = "/singleLfn/%s/%s" % (name, n),
                            size = 1024, events = 10)
            if site:
                testFile.setLocation(site)
            else:
                for tmpSite in self.sites:
                    testFile.setLocation('se.%s' % (tmpSite))
            testFile.create()
            fileset.addFile(testFile)

        fileset.commit()

        index = 0
        for f in fileset.files:
            index += 1
            testJob = Job(name = '%s-%i' % (name, index))
            testJob.addFile(f)
            testJob["location"] = f.getLocations()[0]
            testJob['custom']['location'] = f.getLocations()[0]
            testJob['task'] = task.getPathName()
            testJob['sandbox'] = task.data.input.sandbox
            testJob['spec'] = os.path.join(self.testDir, 'basicWorkload.pcl')
            testJob['mask']['FirstEvent'] = 101
            testJob['owner'] = 'tapas'
            testJob["siteBlacklist"] = bl
            testJob["siteWhitelist"] = wl
            testJob['ownerDN'] = 'tapas'
            testJob['ownerRole'] = 'cmsrole'
            testJob['ownerGroup'] = 'phgroup'

            jobCache = os.path.join(cacheDir, 'Sub_%i' % (sub), 'Job_%i' % (index))
            os.makedirs(jobCache)
            testJob.create(jobGroup)
            testJob['cache_dir'] = jobCache
            testJob.save()
            jobGroup.add(testJob)
            # FIX: write the pickle in binary mode with a context manager
            with open(os.path.join(jobCache, 'job.pkl'), 'wb') as output:
                pickle.dump(testJob, output)

        return testJob, testFile

    def createDummyJobs(self, nJobs, location = None):
        """
        _createDummyJobs_

        Create some dummy jobs
        """
        if not location:
            location = self.sites[0]

        nameStr = makeUUID()

        testWorkflow = Workflow(spec = nameStr, owner = "tapas",
                                name = nameStr, task = "basicWorkload/Production",
                                owner_vogroup = 'phgroup', owner_vorole = 'cmsrole')
        testWorkflow.create()

        testFileset = Fileset(name = nameStr)
        testFileset.create()

        testSubscription = Subscription(fileset = testFileset,
                                        workflow = testWorkflow,
                                        type = "Processing",
                                        split_algo = "FileBased")
        testSubscription.create()

        testJobGroup = JobGroup(subscription = testSubscription)
        testJobGroup.create()

        jobList = []

        for i in range(nJobs):
            testJob = Job(name = '%s-%i' % (nameStr, i))
            testJob['location'] = location
            testJob['custom']['location'] = location
            testJob['userdn'] = 'tapas'
            testJob['owner'] = 'tapas'
            testJob['userrole'] = 'cmsrole'
            testJob['usergroup'] = 'phgroup'

            testJob.create(testJobGroup)
            jobList.append(testJob)

        return jobList

    @attr('integration')
    def testA_APITest(self):
        """
        _APITest_

        This is a commissioning test that has very little to do with
        anything except loading the code.
        """
        #return
        myThread = threading.currentThread()
        config = self.getConfig()

        baAPI = BossAirAPI(config = config)

        # We should have loaded a plugin
        self.assertTrue('TestPlugin' in baAPI.plugins.keys())

        result = myThread.dbi.processData("SELECT name FROM bl_status")[0].fetchall()
        statusList = []
        for i in result:
            statusList.append(i.values()[0])

        # We should have the plugin states in the database.
        # FIX: the original compared the return of list.sort() on both
        # sides, which is always None == None and asserted nothing.
        # Check that each expected state is actually present instead.
        for expectedStatus in ['New', 'Dead', 'Gone']:
            self.assertTrue(expectedStatus in statusList)

        # Create some jobs
        nJobs = 10
        jobDummies = self.createDummyJobs(nJobs = nJobs)

        baAPI.createNewJobs(wmbsJobs = jobDummies)

        runningJobs = baAPI._listRunJobs()
        self.assertEqual(len(runningJobs), nJobs)

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), nJobs)
        deadJobs = baAPI._loadByStatus(status = 'Dead')
        self.assertEqual(len(deadJobs), 0)

        self.assertRaises(BossAirException,
                          baAPI._loadByStatus, status = 'FalseStatus')

        # Change the job status and update it
        for job in newJobs:
            job['status'] = 'Dead'
        baAPI._updateJobs(jobs = newJobs)

        # Test whether we see the job status as updated
        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), 0)
        deadJobs = baAPI._loadByStatus(status = 'Dead')
        self.assertEqual(len(deadJobs), nJobs)

        # Can we load by BossAir ID?
        loadedJobs = baAPI._loadByID(jobs = deadJobs)
        self.assertEqual(len(loadedJobs), nJobs)

        # Can we load via WMBS?
        loadedJobs = baAPI.loadByWMBS(wmbsJobs = jobDummies)
        self.assertEqual(len(loadedJobs), nJobs)

        # See if we can delete jobs
        baAPI._deleteJobs(jobs = deadJobs)

        # Confirm that they're gone
        deadJobs = baAPI._loadByStatus(status = 'Dead')
        self.assertEqual(len(deadJobs), 0)

        self.assertEqual(len(baAPI.jobs), 0)

        return

    @attr('integration')
    def testB_PluginTest(self):
        """
        _PluginTest_

        Now check that these functions worked if called through plugins
        instead of directly.  There are only three plugins.
        """
        #return
        myThread = threading.currentThread()
        config = self.getConfig()

        baAPI = BossAirAPI(config = config)

        # Create some jobs
        nJobs = 10
        jobDummies = self.createDummyJobs(nJobs = nJobs, location = 'Xanadu')
        changeState = ChangeState(config)
        changeState.propagate(jobDummies, 'created', 'new')
        changeState.propagate(jobDummies, 'executing', 'created')

        # Prior to building the job, each job must have a plugin
        # and user assigned
        for job in jobDummies:
            job['plugin'] = 'TestPlugin'
            job['owner'] = 'tapas'

        baAPI.submit(jobs = jobDummies)

        newJobs = baAPI._loadByStatus(status = 'New')
        self.assertEqual(len(newJobs), nJobs)

        # Should be no more running jobs
        runningJobs = baAPI._listRunJobs()
        self.assertEqual(len(runningJobs), nJobs)

        # Test Plugin should complete all jobs
        baAPI.track()

        # Should be no more running jobs
        runningJobs = baAPI._listRunJobs()
        self.assertEqual(len(runningJobs), 0)

        # Check if they're complete
        completeJobs = baAPI.getComplete()
        self.assertEqual(len(completeJobs), nJobs)

        # Do this test because BossAir is specifically built
        # to keep it from finding completed jobs
        result = myThread.dbi.processData("SELECT id FROM bl_runjob")[0].fetchall()
        self.assertEqual(len(result), nJobs)

        baAPI.removeComplete(jobs = jobDummies)

        result = myThread.dbi.processData("SELECT id FROM bl_runjob")[0].fetchall()
        self.assertEqual(len(result), 0)

        return

    def testG_monitoringDAO(self):
        """
        _monitoringDAO_

        Because I need a test for the monitoring DAO
        """
        # Deliberately disabled: everything below the early return is kept
        # for when the test is re-enabled.
        return

        myThread = threading.currentThread()
        config = self.getConfig()

        changeState = ChangeState(config)

        baAPI = BossAirAPI(config = config)

        # Create some jobs
        nJobs = 10
        jobDummies = self.createDummyJobs(nJobs = nJobs)

        # Prior to building the job, each job must have a plugin
        # and user assigned
        for job in jobDummies:
            job['plugin'] = 'TestPlugin'
            job['owner'] = 'tapas'
            job['location'] = 'T2_US_UCSD'
            job.save()

        baAPI.submit(jobs = jobDummies)

        results = baAPI.monitor()

        self.assertEqual(len(results), nJobs)
        for job in results:
            self.assertEqual(job['plugin'], 'CondorPlugin')

        return
class DBSUploadTest(unittest.TestCase):
    """
    TestCase for DBSUpload module

    Note:
      This fails if you use the in-memory syntax for sqlite
      i.e. (DATABASE = sqlite://)
    """
    # Kept from the original setup; presumably a message-count cap — confirm.
    _maxMessage = 10

    def setUp(self):
        """
        _setUp_

        setUp function for unittest: builds the DBSBuffer/Agent schema, a
        couch config cache, DAO factories, dummy locations, a heartbeat
        component and a saved dummy config document.
        """
        # Set constants
        self.couchDB = "config_test"
        self.configURL = "RANDOM;;URL;;NAME"
        self.configString = "This is a random string"

        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        self.testInit.setSchema(customModules = ["WMComponent.DBS3Buffer",
                                                 'WMCore.Agent.Database'],
                                useDefault = False)
        self.testInit.setupCouch(self.couchDB, "GroupUser", "ConfigCache")

        myThread = threading.currentThread()
        # NOTE(review): schema is built from WMComponent.DBS3Buffer but the
        # DAOs load from WMComponent.DBSBuffer.Database — confirm intended.
        self.bufferFactory = DAOFactory(package = "WMComponent.DBSBuffer.Database",
                                        logger = myThread.logger,
                                        dbinterface = myThread.dbi)

        locationAction = self.bufferFactory(classname = "DBSBufferFiles.AddLocation")
        locationAction.execute(siteName = "se1.cern.ch")
        locationAction.execute(siteName = "se1.fnal.gov")
        locationAction.execute(siteName = "malpaquet")

        # Set heartbeat
        self.componentName = 'JobSubmitter'
        self.heartbeatAPI = HeartbeatAPI(self.componentName)
        self.heartbeatAPI.registerComponent()

        # Set up a config cache
        configCache = ConfigCache(os.environ["COUCHURL"], couchDBName = self.couchDB)
        configCache.createUserGroup(groupname = "testGroup", username = '******')
        self.testDir = self.testInit.generateWorkDir()

        psetPath = os.path.join(self.testDir, "PSet.txt")
        # Context manager guarantees the handle is closed before the upload
        with open(psetPath, 'w') as f:
            f.write(self.configString)

        configCache.addConfig(newConfig = psetPath, psetHash = None)
        configCache.save()
        self.configURL = "%s;;%s;;%s" % (os.environ["COUCHURL"],
                                         self.couchDB,
                                         configCache.getCouchID())
        return

    def tearDown(self):
        """
        _tearDown_

        tearDown function for unittest
        """
        self.testInit.clearDatabase(modules = ["WMComponent.DBS3Buffer",
                                               'WMCore.Agent.Database'])

    def createConfig(self):
        """
        _createConfig_

        This creates the actual config file used by the component
        """
        config = Configuration()

        #First the general stuff
        config.section_("General")
        config.General.workDir = os.getenv("TESTDIR", os.getcwd())

        config.section_("Agent")
        config.Agent.componentName = 'DBSUpload'
        config.Agent.useHeartbeat = False

        #Now the CoreDatabase information
        #This should be the dialect, dburl, etc
        config.section_("CoreDatabase")
        config.CoreDatabase.connectUrl = os.getenv("DATABASE")
        config.CoreDatabase.socket = os.getenv("DBSOCK")

        config.component_("DBSUpload")
        config.DBSUpload.pollInterval = 10
        config.DBSUpload.logLevel = 'ERROR'
        config.DBSUpload.maxThreads = 1
        config.DBSUpload.namespace = 'WMComponent.DBSUpload.DBSUpload'
        config.DBSUpload.componentDir = os.path.join(os.getcwd(), 'Components')
        config.DBSUpload.workerThreads = 4

        config.section_("DBSInterface")
        config.DBSInterface.globalDBSUrl = 'http://vocms09.cern.ch:8880/cms_dbs_int_local_xx_writer/servlet/DBSServlet'
        config.DBSInterface.globalDBSVersion = 'DBS_2_0_9'
        config.DBSInterface.DBSUrl = 'http://vocms09.cern.ch:8880/cms_dbs_int_local_yy_writer/servlet/DBSServlet'
        config.DBSInterface.DBSVersion = 'DBS_2_0_9'
        config.DBSInterface.DBSBlockMaxFiles = 10
        config.DBSInterface.DBSBlockMaxSize = 9999999999
        config.DBSInterface.DBSBlockMaxTime = 10000
        config.DBSInterface.MaxFilesToCommit = 10

        # addition for Alerts messaging framework, work (alerts) and control
        # channel addresses to which the component will be sending alerts
        # these are destination addresses where AlertProcessor:Receiver listens
        config.section_("Alert")
        config.Alert.address = "tcp://127.0.0.1:5557"
        config.Alert.controlAddr = "tcp://127.0.0.1:5559"
        # configure threshold of DBS upload queue size alert threshold
        # reference: trac ticket #1628
        config.DBSUpload.alertUploadQueueSize = 2000

        return config

    def getFiles(self, name, tier, nFiles = 12, site = "malpaquet"):
        """
        Create some quick dummy test files

        Returns the list of parent DBSBufferFiles; a single child file in a
        second dataset is also created and linked to all parents.
        """
        files = []

        for f in range(0, nFiles):
            testFile = DBSBufferFile(lfn = '%s-%s-%i' % (name, site, f),
                                     size = 1024, events = 20,
                                     checksums = {'cksum': 1})
            testFile.setAlgorithm(appName = name, appVer = "CMSSW_3_1_1",
                                  appFam = "RECO", psetHash = "GIBBERISH",
                                  configContent = self.configURL)
            testFile.setDatasetPath("/%s/%s/%s" % (name, name, tier))
            testFile.addRun(Run(1, *[f]))
            testFile.setGlobalTag("aGlobalTag")
            testFile.create()
            testFile.setLocation(site)
            files.append(testFile)

        testFileChild = DBSBufferFile(lfn = '%s-%s-child' % (name, site),
                                      size = 1024, events = 10,
                                      checksums = {'cksum': 1})
        testFileChild.setAlgorithm(appName = name, appVer = "CMSSW_3_1_1",
                                   appFam = "RECO", psetHash = "GIBBERISH",
                                   configContent = self.configURL)
        testFileChild.setDatasetPath("/%s/%s_2/RECO" % (name, name))
        testFileChild.addRun(Run(1, *[45]))
        testFileChild.setGlobalTag("aGlobalTag")
        testFileChild.create()
        testFileChild.setLocation(site)

        testFileChild.addParents([x['lfn'] for x in files])

        return files

    @attr('integration')
    def testA_basicUploadTest(self):
        """
        _basicUploadTest_

        Do everything simply once
        Create dataset, algo, files, blocks,
        upload them, mark as done, finish them, migrate them
        Also check the timeout
        """
        myThread = threading.currentThread()
        config = self.createConfig()
        config.DBSInterface.DBSBlockMaxTime = 3
        config.DBSUpload.pollInterval = 4

        name = "ThisIsATest_%s" % (makeUUID())
        tier = "RECO"
        nFiles = 12
        files = self.getFiles(name = name, tier = tier, nFiles = nFiles)
        datasetPath = '/%s/%s/%s' % (name, name, tier)

        # Load components that are necessary to check status
        factory = WMFactory("dbsUpload", "WMComponent.DBSUpload.Database.Interface")
        dbinterface = factory.loadObject("UploadToDBS")

        dbsInterface = DBSInterface(config = config)
        localAPI = dbsInterface.getAPIRef()
        globeAPI = dbsInterface.getAPIRef(globalRef = True)

        # In the first round we should create blocks for the first dataset.
        # The child dataset should not be handled until the parent is uploaded.
        testDBSUpload = DBSUploadPoller(config = config)
        testDBSUpload.algorithm()

        # First, see if there are any blocks
        # One in DBS, one not in DBS
        result = myThread.dbi.processData("SELECT status FROM dbsbuffer_block")[0].fetchall()
        self.assertEqual(len(result), 2)
        self.assertEqual(result, [('InGlobalDBS',), ('Open',)])

        # Check to see if datasets and algos are in local DBS
        result = listAlgorithms(apiRef = localAPI, patternExe = name)
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0]['ExecutableName'], name)
        result = listPrimaryDatasets(apiRef = localAPI, match = name)
        self.assertEqual(result, [name])
        result = listProcessedDatasets(apiRef = localAPI, primary = name, dataTier = "*")

        # Then check and see that the closed block made it into local DBS
        affectedBlocks = listBlocks(apiRef = localAPI, datasetPath = datasetPath)
        if affectedBlocks[0]['OpenForWriting'] == '0':
            self.assertEqual(affectedBlocks[1]['OpenForWriting'], '1')
            self.assertEqual(affectedBlocks[0]['NumberOfFiles'], 10)
            self.assertEqual(affectedBlocks[1]['NumberOfFiles'], 2)
        else:
            self.assertEqual(affectedBlocks[0]['OpenForWriting'], '1')
            self.assertEqual(affectedBlocks[1]['NumberOfFiles'], 10)
            self.assertEqual(affectedBlocks[0]['NumberOfFiles'], 2)

        # Check to make sure all the files are in local
        result = listDatasetFiles(apiRef = localAPI, datasetPath = datasetPath)
        fileLFNs = [x['lfn'] for x in files]
        for lfn in fileLFNs:
            self.assertTrue(lfn in result)

        # Make sure the child files aren't there
        flag = False
        try:
            listDatasetFiles(apiRef = localAPI,
                             datasetPath = '/%s/%s_2/%s' % (name, name, tier))
        except Exception:
            flag = True
        self.assertTrue(flag)

        # There should be one block in global
        # It should have ten files and be closed
        result = listBlocks(apiRef = globeAPI, datasetPath = datasetPath)
        self.assertEqual(len(result), 1)
        for block in result:
            self.assertEqual(block['OpenForWriting'], '0')
            self.assertTrue(block['NumberOfFiles'] in [2, 10])

        # Okay, deep breath.  First round done.
        # In the second round, the second block of the parent fileset should
        # transfer.  Make sure that the timeout functions work.
        time.sleep(10)
        testDBSUpload.algorithm()

        result = myThread.dbi.processData("SELECT status FROM dbsbuffer_block")[0].fetchall()
        self.assertEqual(len(result), 2)
        self.assertEqual(result, [('InGlobalDBS',), ('InGlobalDBS',)])

        # Check to make sure all the files are in global
        result = listDatasetFiles(apiRef = globeAPI, datasetPath = datasetPath)
        for lfn in fileLFNs:
            self.assertTrue(lfn in result)

        # Make sure the child files aren't there
        flag = False
        try:
            listDatasetFiles(apiRef = localAPI,
                             datasetPath = '/%s/%s_2/%s' % (name, name, tier))
        except Exception:
            flag = True
        # FIX: the original set `flag` here but never asserted it, unlike
        # the identical check earlier in this test.
        self.assertTrue(flag)
class JobSubmitterTest(unittest.TestCase): """ Test class for the JobSubmitter """ sites = ["T2_US_Florida", "T2_US_UCSD", "T2_TW_Taiwan", "T1_CH_CERN"] def setUp(self): """ Standard setup: Now with 100% more couch """ self.testInit = TestInit(__file__) self.testInit.setLogging() self.testInit.setDatabaseConnection(destroyAllDatabase=True) self.testInit.setSchema( customModules=["WMCore.WMBS", "WMCore.BossAir", "WMCore.ResourceControl", "WMCore.Agent.Database"], useDefault=False, ) self.testInit.setupCouch("jobsubmitter_t/jobs", "JobDump") self.testInit.setupCouch("jobsubmitter_t/fwjrs", "FWJRDump") myThread = threading.currentThread() self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger, dbinterface=myThread.dbi) locationAction = self.daoFactory(classname="Locations.New") locationSlots = self.daoFactory(classname="Locations.SetJobSlots") # We actually need the user name self.user = getpass.getuser() self.ceName = "127.0.0.1" # Create sites in resourceControl resourceControl = ResourceControl() for site in self.sites: resourceControl.insertSite( siteName=site, seName="se.%s" % (site), ceName=site, plugin="CondorPlugin", pendingSlots=10000, runningSlots=20000, cmsName=site, ) resourceControl.insertThreshold(siteName=site, taskType="Processing", maxSlots=10000) self.testDir = self.testInit.generateWorkDir() # Set heartbeat self.componentName = "JobSubmitter" self.heartbeatAPI = HeartbeatAPI(self.componentName) self.heartbeatAPI.registerComponent() return def tearDown(self): """ Standard tearDown """ self.testInit.clearDatabase( modules=["WMCore.WMBS", "WMCore.ResourceControl", "WMCore.BossAir", "WMCore.Agent.Database"] ) self.testInit.delWorkDir() self.testInit.tearDownCouch() return def createJobGroups(self, nSubs, nJobs, task, workloadSpec, site=None, bl=[], wl=[], type="Processing"): """ Creates a series of jobGroups for submissions """ jobGroupList = [] testWorkflow = Workflow(spec=workloadSpec, owner="mnorman", name=makeUUID(), 
task="basicWorkload/Production") testWorkflow.create() # Create subscriptions for i in range(nSubs): name = makeUUID() # Create Fileset, Subscription, jobGroup testFileset = Fileset(name=name) testFileset.create() testSubscription = Subscription( fileset=testFileset, workflow=testWorkflow, type=type, split_algo="FileBased" ) testSubscription.create() testJobGroup = JobGroup(subscription=testSubscription) testJobGroup.create() # Create jobs self.makeNJobs( name=name, task=task, nJobs=nJobs, jobGroup=testJobGroup, fileset=testFileset, sub=testSubscription.exists(), site=site, bl=bl, wl=wl, ) testFileset.commit() testJobGroup.commit() jobGroupList.append(testJobGroup) return jobGroupList def makeNJobs(self, name, task, nJobs, jobGroup, fileset, sub, site=None, bl=[], wl=[]): """ _makeNJobs_ Make and return a WMBS Job and File This handles all those damn add-ons """ # Set the CacheDir cacheDir = os.path.join(self.testDir, "CacheDir") for n in range(nJobs): # First make a file # site = self.sites[0] testFile = File(lfn="/singleLfn/%s/%s" % (name, n), size=1024, events=10) if site: testFile.setLocation(site) else: for tmpSite in self.sites: testFile.setLocation("se.%s" % (tmpSite)) testFile.create() fileset.addFile(testFile) fileset.commit() index = 0 for f in fileset.files: index += 1 testJob = Job(name="%s-%i" % (name, index)) testJob.addFile(f) testJob["location"] = f.getLocations()[0] testJob["task"] = task.getPathName() testJob["sandbox"] = task.data.input.sandbox testJob["spec"] = os.path.join(self.testDir, "basicWorkload.pcl") testJob["mask"]["FirstEvent"] = 101 testJob["siteBlacklist"] = bl testJob["siteWhitelist"] = wl testJob["priority"] = 101 jobCache = os.path.join(cacheDir, "Sub_%i" % (sub), "Job_%i" % (index)) os.makedirs(jobCache) testJob.create(jobGroup) testJob["cache_dir"] = jobCache testJob.save() jobGroup.add(testJob) output = open(os.path.join(jobCache, "job.pkl"), "w") pickle.dump(testJob, output) output.close() return testJob, testFile def 
getConfig( self, configPath=os.path.join(WMCore.WMInit.getWMBASE(), "src/python/WMComponent/JobSubmitter/DefaultConfig.py") ): """ _getConfig_ Gets a basic config from default location """ myThread = threading.currentThread() config = Configuration() config.component_("Agent") config.Agent.WMSpecDirectory = self.testDir config.Agent.agentName = "testAgent" config.Agent.componentName = self.componentName config.Agent.useHeartbeat = False # First the general stuff config.section_("General") config.General.workDir = os.getenv("TESTDIR", self.testDir) # Now the CoreDatabase information # This should be the dialect, dburl, etc config.section_("CoreDatabase") config.CoreDatabase.connectUrl = os.getenv("DATABASE") config.CoreDatabase.socket = os.getenv("DBSOCK") config.section_("BossAir") config.BossAir.pluginNames = ["TestPlugin", "CondorPlugin"] config.BossAir.pluginDir = "WMCore.BossAir.Plugins" config.component_("JobSubmitter") config.JobSubmitter.logLevel = "INFO" config.JobSubmitter.maxThreads = 1 config.JobSubmitter.pollInterval = 10 config.JobSubmitter.pluginName = "CondorGlobusPlugin" config.JobSubmitter.pluginDir = "JobSubmitter.Plugins" config.JobSubmitter.submitNode = os.getenv("HOSTNAME", "badtest.fnal.gov") config.JobSubmitter.submitScript = os.path.join( WMCore.WMBase.getTestBase(), "WMComponent_t/JobSubmitter_t", "submit.sh" ) config.JobSubmitter.componentDir = os.path.join(self.testDir, "Components") config.JobSubmitter.workerThreads = 2 config.JobSubmitter.jobsPerWorker = 200 config.JobSubmitter.inputFile = os.path.join( WMCore.WMBase.getTestBase(), "WMComponent_t/JobSubmitter_t", "FrameworkJobReport-4540.xml" ) config.JobSubmitter.deleteJDLFiles = False # JobStateMachine config.component_("JobStateMachine") config.JobStateMachine.couchurl = os.getenv("COUCHURL") config.JobStateMachine.couchDBName = "jobsubmitter_t" # Needed, because this is a test os.makedirs(config.JobSubmitter.componentDir) return config def createTestWorkload(self, 
workloadName="Test", emulator=True): """ _createTestWorkload_ Creates a test workload for us to run on, hold the basic necessities. """ workload = testWorkload("Tier1ReReco") rereco = workload.getTask("ReReco") taskMaker = TaskMaker(workload, os.path.join(self.testDir, "workloadTest")) taskMaker.skipSubscription = True taskMaker.processWorkload() return workload def checkJDL(self, config, cacheDir, submitFile, site=None, indexFlag=False, noIndex=False): """ _checkJDL_ Check the basic JDL setup """ jobs, head = parseJDL(jdlLocation=os.path.join(config.JobSubmitter.submitDir, submitFile)) batch = 1 # Check each job entry in the JDL for job in jobs: # Check each key index = int(job.get("+WMAgent_JobID", 0)) self.assertTrue(index != 0) argValue = index - 1 if indexFlag: batch = index - 1 inputFileString = "%s, %s, %s" % ( os.path.join(self.testDir, "workloadTest/TestWorkload", "TestWorkload-Sandbox.tar.bz2"), os.path.join( self.testDir, "workloadTest/TestWorkload", "PackageCollection_0/batch_%i-0/JobPackage.pkl" % (batch) ), os.path.join(WMCore.WMInit.getWMBASE(), "src/python/WMCore", "WMRuntime/Unpacker.py"), ) if not noIndex: self.assertEqual(job.get("transfer_input_files", None), inputFileString) # Arguments use a list starting from 0 self.assertEqual(job.get("arguments", None), "TestWorkload-Sandbox.tar.bz2 %i" % (index)) if site: self.assertEqual(job.get("+DESIRED_Sites", None), '"%s"' % site) # Check the priority self.assertEqual(job.get("priority", None), "101") # Now handle the head self.assertEqual(head.get("should_transfer_files", None), "YES") self.assertEqual(head.get("Log", None), "condor.$(Cluster).$(Process).log") self.assertEqual(head.get("Error", None), "condor.$(Cluster).$(Process).err") self.assertEqual(head.get("Output", None), "condor.$(Cluster).$(Process).out") self.assertEqual(head.get("when_to_transfer_output", None), "ON_EXIT") self.assertEqual(head.get("Executable", None), config.JobSubmitter.submitScript) return @attr("integration") def 
testA_BasicTest(self): """ Use the CondorGlobusPlugin to create a very simple test Check to see that all the jobs were submitted Parse and test the JDL files See what condor says """ workloadName = "basicWorkload" myThread = threading.currentThread() workload = self.createTestWorkload() config = self.getConfig() changeState = ChangeState(config) nSubs = 1 nJobs = 10 cacheDir = os.path.join(self.testDir, "CacheDir") jobGroupList = self.createJobGroups( nSubs=nSubs, nJobs=nJobs, task=workload.getTask("ReReco"), workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName), site="se.T2_US_UCSD", ) for group in jobGroupList: changeState.propagate(group.jobs, "created", "new") # Do pre-submit check getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs") result = getJobsAction.execute(state="Created", jobType="Processing") self.assertEqual(len(result), nSubs * nJobs) nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning)) jobSubmitter = JobSubmitterPoller(config=config) jobSubmitter.algorithm() # Check that jobs are in the right state result = getJobsAction.execute(state="Created", jobType="Processing") self.assertEqual(len(result), 0) result = getJobsAction.execute(state="Executing", jobType="Processing") self.assertEqual(len(result), nSubs * nJobs) # Check assigned locations getLocationAction = self.daoFactory(classname="Jobs.GetLocation") for id in result: loc = getLocationAction.execute(jobid=id) self.assertEqual(loc, [["T2_US_UCSD"]]) # Check on the JDL submitFile = None for file in os.listdir(config.JobSubmitter.submitDir): if re.search("submit", file): submitFile = file self.assertTrue(submitFile != None) self.checkJDL(config=config, cacheDir=cacheDir, submitFile=submitFile, site="T2_US_UCSD") # if os.path.exists('CacheDir'): # shutil.rmtree('CacheDir') # shutil.copytree(self.testDir, 'CacheDir') # Check to make sure we have running jobs nRunning = 
getCondorRunningJobs(self.user) self.assertEqual(nRunning, nJobs * nSubs) # This should do nothing jobGroupList = self.createJobGroups( nSubs=nSubs, nJobs=nJobs, task=workload.getTask("ReReco"), workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName), site="se.T2_US_UCSD", ) for group in jobGroupList: changeState.propagate(group.jobs, "created", "new") jobSubmitter.algorithm() # Now clean-up command = ["condor_rm", self.user] pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False) pipe.communicate() del jobSubmitter return @attr("performance") def testB_TimeLongSubmission(self): """ _TimeLongSubmission_ Submit a lot of jobs and test how long it takes for them to actually be submitted """ return workloadName = "basicWorkload" myThread = threading.currentThread() workload = self.createTestWorkload() config = self.getConfig() changeState = ChangeState(config) nSubs = 5 nJobs = 300 cacheDir = os.path.join(self.testDir, "CacheDir") jobGroupList = self.createJobGroups( nSubs=nSubs, nJobs=nJobs, task=workload.getTask("ReReco"), workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName), ) for group in jobGroupList: changeState.propagate(group.jobs, "created", "new") nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, 0, "User currently has %i running jobs. 
Test will not continue" % (nRunning)) jobSubmitter = JobSubmitterPoller(config=config) # Actually run it startTime = time.time() cProfile.runctx("jobSubmitter.algorithm()", globals(), locals(), filename="testStats.stat") # jobSubmitter.algorithm() stopTime = time.time() if os.path.isdir("CacheDir"): shutil.rmtree("CacheDir") shutil.copytree("%s" % self.testDir, os.path.join(os.getcwd(), "CacheDir")) # Check to make sure we have running jobs nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, nJobs * nSubs) # Now clean-up command = ["condor_rm", self.user] pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False) pipe.communicate() print "Job took %f seconds to complete" % (stopTime - startTime) p = pstats.Stats("testStats.stat") p.sort_stats("cumulative") p.print_stats() return def testD_CreamCETest(self): """ _CreamCETest_ This is for submitting to Cream CEs. Don't use it. """ return nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, 0, "User currently has %i running jobs. 
Test will not continue" % (nRunning)) workloadName = "basicWorkload" myThread = threading.currentThread() workload = self.createTestWorkload() config = self.getConfig() config.JobSubmitter.pluginName = "CreamPlugin" changeState = ChangeState(config) nSubs = 1 nJobs = 10 cacheDir = os.path.join(self.testDir, "CacheDir") # Add a new site siteName = "creamSite" ceName = "https://cream-1-fzk.gridka.de:8443/ce-cream/services/CREAM2 pbs cmsXS" # ceName = "127.0.0.1" locationAction = self.daoFactory(classname="Locations.New") pendingSlots = self.daoFactory(classname="Locations.SetPendingSlots") locationAction.execute(siteName=siteName, seName=siteName, ceName=ceName) pendingSlots.execute(siteName=siteName, pendingSlots=1000) resourceControl = ResourceControl() resourceControl.insertSite(siteName=siteName, seName=siteName, ceName=ceName) resourceControl.insertThreshold(siteName=siteName, taskType="Processing", maxSlots=10000) jobGroupList = self.createJobGroups( nSubs=nSubs, nJobs=nJobs, task=workload.getTask("ReReco"), workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName), site=siteName, ) for group in jobGroupList: changeState.propagate(group.jobs, "created", "new") jobSubmitter = JobSubmitterPoller(config=config) jobSubmitter.algorithm() # Check that jobs are in the right state getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs") result = getJobsAction.execute(state="Created", jobType="Processing") self.assertEqual(len(result), 0) result = getJobsAction.execute(state="Executing", jobType="Processing") self.assertEqual(len(result), nSubs * nJobs) # Now clean-up command = ["condor_rm", self.user] pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False) pipe.communicate() if os.path.exists("CacheDir"): shutil.rmtree("CacheDir") shutil.copytree(self.testDir, "CacheDir") return @attr("integration") def testE_WhiteListBlackList(self): """ _WhiteListBlackList_ Test the whitelist/blacklist implementation Trust the jobCreator to get this in the job 
right """ nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning)) workloadName = "basicWorkload" myThread = threading.currentThread() workload = self.createTestWorkload() config = self.getConfig() changeState = ChangeState(config) nSubs = 2 nJobs = 10 cacheDir = os.path.join(self.testDir, "CacheDir") jobGroupList = self.createJobGroups( nSubs=nSubs, nJobs=nJobs, task=workload.getTask("ReReco"), workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName), bl=["T2_US_Florida", "T2_TW_Taiwan", "T1_CH_CERN"], ) for group in jobGroupList: changeState.propagate(group.jobs, "created", "new") jobSubmitter = JobSubmitterPoller(config=config) # Actually run it jobSubmitter.algorithm() if os.path.isdir("CacheDir"): shutil.rmtree("CacheDir") shutil.copytree("%s" % self.testDir, os.path.join(os.getcwd(), "CacheDir")) # Check to make sure we have running jobs nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, nJobs * nSubs) getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs") result = getJobsAction.execute(state="Executing", jobType="Processing") self.assertEqual(len(result), nSubs * nJobs) # All jobs should be at UCSD submitFile = None for file in os.listdir(config.JobSubmitter.submitDir): if re.search("submit", file): submitFile = file self.assertTrue(submitFile != None) # submitFile = os.listdir(config.JobSubmitter.submitDir)[0] self.checkJDL(config=config, cacheDir=cacheDir, submitFile=submitFile, site="T2_US_UCSD") # Now clean-up command = ["condor_rm", self.user] pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False) pipe.communicate() # Run again and test the whiteList jobGroupList = self.createJobGroups( nSubs=nSubs, nJobs=nJobs, task=workload.getTask("ReReco"), workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName), wl=["T2_US_UCSD"], ) for group in jobGroupList: changeState.propagate(group.jobs, "created", "new") nRunning 
= getCondorRunningJobs(self.user) self.assertEqual(nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning)) jobSubmitter = JobSubmitterPoller(config=config) # Actually run it jobSubmitter.algorithm() if os.path.isdir("CacheDir"): shutil.rmtree("CacheDir") shutil.copytree("%s" % self.testDir, os.path.join(os.getcwd(), "CacheDir")) # Check to make sure we have running jobs nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, nJobs * nSubs) # You'll have jobs from the previous run still in the database result = getJobsAction.execute(state="Executing", jobType="Processing") self.assertEqual(len(result), nSubs * nJobs * 2) # All jobs should be at UCSD submitFile = None for file in os.listdir(config.JobSubmitter.submitDir): if re.search("submit", file): submitFile = file self.assertTrue(submitFile != None) self.checkJDL(config=config, cacheDir=cacheDir, submitFile=submitFile, site="T2_US_UCSD", noIndex=True) # Now clean-up command = ["condor_rm", self.user] pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False) pipe.communicate() # Run again with an invalid whitelist # NOTE: After this point, the original two sets of jobs will be executing # The rest of the jobs should move to submitFailed jobGroupList = self.createJobGroups( nSubs=nSubs, nJobs=nJobs, task=workload.getTask("ReReco"), workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName), wl=["T2_US_Namibia"], ) for group in jobGroupList: changeState.propagate(group.jobs, "created", "new") nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, 0, "User currently has %i running jobs. 
Test will not continue" % (nRunning)) jobSubmitter = JobSubmitterPoller(config=config) # Actually run it jobSubmitter.algorithm() # Check to make sure we have running jobs # nRunning = getCondorRunningJobs(self.user) # self.assertEqual(nRunning, 0) # Jobs should be gone getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs") result = getJobsAction.execute(state="Executing", jobType="Processing") self.assertEqual(len(result), nSubs * nJobs * 2) result = getJobsAction.execute(state="SubmitFailed", jobType="Processing") self.assertEqual(len(result), nSubs * nJobs) # Now clean-up command = ["condor_rm", self.user] pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False) pipe.communicate() # Run again with all sites blacklisted jobGroupList = self.createJobGroups( nSubs=nSubs, nJobs=nJobs, task=workload.getTask("ReReco"), workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName), bl=self.sites, ) for group in jobGroupList: changeState.propagate(group.jobs, "created", "new") nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, 0, "User currently has %i running jobs. 
Test will not continue" % (nRunning)) jobSubmitter = JobSubmitterPoller(config=config) # Actually run it jobSubmitter.algorithm() # Check to make sure we have running jobs nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, 0) # Jobs should be gone getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs") result = getJobsAction.execute(state="Executing", jobType="Processing") self.assertEqual(len(result), nSubs * nJobs * 2) result = getJobsAction.execute(state="SubmitFailed", jobType="Processing") self.assertEqual(len(result), nSubs * nJobs * 2) # Now clean-up command = ["condor_rm", self.user] pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False) pipe.communicate() del jobSubmitter return @attr("integration") def testF_OverloadTest(self): """ _OverloadTest_ Test and see what happens if you put in more jobs Then the sites can handle """ resourceControl = ResourceControl() for site in self.sites: resourceControl.insertThreshold(siteName=site, taskType="Silly", maxSlots=1) nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, 0, "User currently has %i running jobs. 
Test will not continue" % (nRunning)) workloadName = "basicWorkload" myThread = threading.currentThread() workload = self.createTestWorkload() config = self.getConfig() changeState = ChangeState(config) nSubs = 2 nJobs = 10 cacheDir = os.path.join(self.testDir, "CacheDir") jobGroupList = self.createJobGroups( nSubs=nSubs, nJobs=nJobs, task=workload.getTask("ReReco"), workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName), type="Silly", ) for group in jobGroupList: changeState.propagate(group.jobs, "created", "new") jobSubmitter = JobSubmitterPoller(config=config) # Actually run it jobSubmitter.algorithm() # Should be one job for each site nSites = len(self.sites) nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, nSites) getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs") result = getJobsAction.execute(state="Executing", jobType="Silly") self.assertEqual(len(result), nSites) result = getJobsAction.execute(state="Created", jobType="Silly") self.assertEqual(len(result), nJobs * nSubs - nSites) # Now clean-up command = ["condor_rm", self.user] pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False) pipe.communicate() del jobSubmitter return @attr("integration") def testG_IndexErrorTest(self): """ _IndexErrorTest_ Check to see you get proper indexes for the jobPackages if you have more jobs then you normally run at once. 
""" workloadName = "basicWorkload" myThread = threading.currentThread() workload = self.createTestWorkload() config = self.getConfig() config.JobSubmitter.jobsPerWorker = 1 config.JobSubmitter.collectionSize = 1 changeState = ChangeState(config) nSubs = 1 nJobs = 10 cacheDir = os.path.join(self.testDir, "CacheDir") jobGroupList = self.createJobGroups( nSubs=nSubs, nJobs=nJobs, task=workload.getTask("ReReco"), workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName), site="se.T2_US_UCSD", ) for group in jobGroupList: changeState.propagate(group.jobs, "created", "new") # Do pre-submit check getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs") result = getJobsAction.execute(state="Created", jobType="Processing") self.assertEqual(len(result), nSubs * nJobs) nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning)) jobSubmitter = JobSubmitterPoller(config=config) jobSubmitter.algorithm() if os.path.exists("CacheDir"): shutil.rmtree("CacheDir") shutil.copytree(self.testDir, "CacheDir") # Check that jobs are in the right state result = getJobsAction.execute(state="Created", jobType="Processing") self.assertEqual(len(result), 0) result = getJobsAction.execute(state="Executing", jobType="Processing") self.assertEqual(len(result), nSubs * nJobs) # Check on the JDL submitFile = None for file in os.listdir(config.JobSubmitter.submitDir): if re.search("submit", file): submitFile = file self.assertTrue(submitFile != None) self.checkJDL(config=config, cacheDir=cacheDir, submitFile=submitFile, site="T2_US_UCSD", indexFlag=True) # Check to make sure we have running jobs nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, nJobs * nSubs) # Now clean-up command = ["condor_rm", self.user] pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False) pipe.communicate() del jobSubmitter return
class JobSubmitterTest(EmulatedUnitTestCase):
    """
    _JobSubmitterTest_

    Test class for the JobSubmitterPoller
    """

    def setUp(self):
        """
        _setUp_

        Standard setup: Now with 100% more couch
        """
        super(JobSubmitterTest, self).setUp()

        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        self.testInit.setSchema(customModules=["WMCore.WMBS", "WMCore.BossAir",
                                               "WMCore.ResourceControl", "WMCore.Agent.Database"])
        self.testInit.setupCouch("jobsubmitter_t/jobs", "JobDump")
        self.testInit.setupCouch("jobsubmitter_t/fwjrs", "FWJRDump")
        self.testInit.setupCouch("wmagent_summary_t", "WMStats")

        myThread = threading.currentThread()
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)
        self.baDaoFactory = DAOFactory(package="WMCore.BossAir",
                                       logger=myThread.logger,
                                       dbinterface=myThread.dbi)

        self.testDir = self.testInit.generateWorkDir()

        # Set heartbeat
        self.componentName = 'JobSubmitter'
        self.heartbeatAPI = HeartbeatAPI(self.componentName)
        self.heartbeatAPI.registerComponent()
        self.configFile = EmulatorSetup.setupWMAgentConfig()
        config = self.getConfig()
        myThread.logdbClient = MockLogDB(config.General.central_logdb_url,
                                         config.Agent.hostName, logger=None)
        return

    def tearDown(self):
        """
        _tearDown_

        Standard tearDown
        """
        myThread = threading.currentThread()
        self.testInit.clearDatabase()
        self.testInit.delWorkDir()
        self.testInit.tearDownCouch()
        EmulatorSetup.deleteConfig(self.configFile)
        myThread.logdbClient = None
        return

    def setResourceThresholds(self, site, **options):
        """
        _setResourceThresholds_

        Utility to set resource thresholds
        """
        if not options:
            options = {'state': 'Normal',
                       'runningSlots': 10,
                       'pendingSlots': 5,
                       'tasks': ['Processing', 'Merge'],
                       'Processing': {'pendingSlots': 5, 'runningSlots': 10},
                       'Merge': {'pendingSlots': 2, 'runningSlots': 5}}

        resourceControl = ResourceControl()
        resourceControl.insertSite(siteName=site, pnn='se.%s' % (site),
                                   ceName=site, plugin="MockPlugin",
                                   pendingSlots=options['pendingSlots'],
                                   runningSlots=options['runningSlots'],
                                   cmsName=site)
        for task in options['tasks']:
            resourceControl.insertThreshold(siteName=site, taskType=task,
                                            maxSlots=options[task]['runningSlots'],
                                            pendingSlots=options[task]['pendingSlots'])
        if options.get('state'):
            resourceControl.changeSiteState(site, options.get('state'))

        return

    def createJobGroups(self, nSubs, nJobs, task, workloadSpec, site,
                        taskType='Processing', name=None, wfPrio=1, changeState=None):
        """
        _createJobGroups_

        Creates a series of jobGroups for submissions
        changeState is an instance of the ChangeState class to make job status changes
        """
        jobGroupList = []

        if name is None:
            name = makeUUID()

        testWorkflow = Workflow(spec=workloadSpec, owner="tapas",
                                name=name, task="basicWorkload/Production",
                                priority=wfPrio)
        testWorkflow.create()

        # Create subscriptions
        for _ in range(nSubs):
            name = makeUUID()

            # Create Fileset, Subscription, jobGroup
            testFileset = Fileset(name=name)
            testFileset.create()
            testSubscription = Subscription(fileset=testFileset,
                                            workflow=testWorkflow,
                                            type=taskType,
                                            split_algo="FileBased")
            testSubscription.create()

            testJobGroup = JobGroup(subscription=testSubscription)
            testJobGroup.create()

            # Create jobs
            self.makeNJobs(name=name, task=task, nJobs=nJobs,
                           jobGroup=testJobGroup, fileset=testFileset,
                           sub=testSubscription.exists(), site=site)

            testFileset.commit()
            testJobGroup.commit()
            jobGroupList.append(testJobGroup)

        if changeState:
            for group in jobGroupList:
                changeState.propagate(group.jobs, 'created', 'new')

        return jobGroupList

    def makeNJobs(self, name, task, nJobs, jobGroup, fileset, sub, site):
        """
        _makeNJobs_

        Make and return a WMBS Job and File
        This handles all those damn add-ons
        """
        # Set the CacheDir
        cacheDir = os.path.join(self.testDir, 'CacheDir')

        for n in range(nJobs):
            # First make a file
            testFile = File(lfn="/singleLfn/%s/%s" % (name, n),
                            size=1024, events=10)
            fileset.addFile(testFile)
        fileset.commit()

        # A list of sites means "first entry is the location"; a scalar is
        # used directly.
        location = None
        if isinstance(site, list):
            if site:
                location = site[0]
        else:
            location = site

        index = 0
        for f in fileset.files:
            index += 1
            testJob = Job(name='%s-%i' % (name, index))
            testJob.addFile(f)
            testJob["location"] = location
            testJob["possiblePSN"] = set(site) if isinstance(site, list) else {site}
            testJob['task'] = task.getPathName()
            testJob['sandbox'] = task.data.input.sandbox
            testJob['spec'] = os.path.join(self.testDir, 'basicWorkload.pcl')
            testJob['mask']['FirstEvent'] = 101
            testJob['priority'] = 101
            testJob['numberOfCores'] = 1
            jobCache = os.path.join(cacheDir, 'Sub_%i' % (sub), 'Job_%i' % (index))
            os.makedirs(jobCache)
            testJob.create(jobGroup)
            testJob['cache_dir'] = jobCache
            testJob.save()
            jobGroup.add(testJob)
            # BUGFIX: pickle requires a binary-mode handle under Python 3
            # (was open(..., 'w')); the context manager also guarantees the
            # file is closed.
            with open(os.path.join(jobCache, 'job.pkl'), 'wb') as output:
                pickle.dump(testJob, output)

        return testJob, testFile

    def getConfig(self):
        """
        _getConfig_

        Gets a basic config from default location
        """
        config = self.testInit.getConfiguration()
        self.testInit.generateWorkDir(config)

        config.component_("Agent")
        config.Agent.WMSpecDirectory = self.testDir
        config.Agent.agentName = 'testAgent'
        config.Agent.hostName = 'testAgent'
        config.Agent.componentName = self.componentName
        config.Agent.useHeartbeat = False

        # First the general stuff
        config.section_("General")
        config.General.workDir = os.getenv("TESTDIR", self.testDir)
        config.General.central_logdb_url = "http://localhost/testlogdb"
        config.General.ReqMgr2ServiceURL = "http://localhost/reqmgr2"

        # Now the CoreDatabase information
        config.section_("CoreDatabase")
        config.CoreDatabase.connectUrl = os.getenv("DATABASE")
        config.CoreDatabase.socket = os.getenv("DBSOCK")

        # BossAir and MockPlugin configuration
        config.section_("BossAir")
        config.BossAir.pluginNames = ['MockPlugin']
        # Here Test the CondorPlugin instead of MockPlugin
        # config.BossAir.pluginNames = ['CondorPlugin']
        config.BossAir.pluginDir = 'WMCore.BossAir.Plugins'
        config.BossAir.nCondorProcesses = 1
        config.BossAir.section_("MockPlugin")
        config.BossAir.MockPlugin.fakeReport = os.path.join(getTestBase(),
                                                            'WMComponent_t/JobSubmitter_t',
                                                            "submit.sh")

        # JobSubmitter configuration
        config.component_("JobSubmitter")
        config.JobSubmitter.logLevel = 'DEBUG'
        config.JobSubmitter.maxThreads = 1
        config.JobSubmitter.pollInterval = 10
        config.JobSubmitter.submitScript = os.path.join(getTestBase(),
                                                        'WMComponent_t/JobSubmitter_t', 'submit.sh')
        config.JobSubmitter.componentDir = os.path.join(self.testDir, 'Components')
        config.JobSubmitter.workerThreads = 2
        config.JobSubmitter.jobsPerWorker = 200
        config.JobSubmitter.drainGraceTime = 2  # in seconds

        # JobStateMachine
        config.component_('JobStateMachine')
        config.JobStateMachine.couchurl = os.getenv('COUCHURL')
        config.JobStateMachine.couchDBName = "jobsubmitter_t"
        config.JobStateMachine.jobSummaryDBName = 'wmagent_summary_t'

        # Needed, because this is a test: the component dir may already exist
        # from a previous run, so creation is best-effort.
        try:
            os.makedirs(config.JobSubmitter.componentDir)
        except OSError:  # was a bare except; only swallow filesystem errors
            pass

        return config

    def createTestWorkload(self, name='workloadTest'):
        """
        _createTestWorkload_

        Creates a test workload for us to run on, hold the basic necessities.
        """
        workload = testWorkload()

        taskMaker = TaskMaker(workload, os.path.join(self.testDir, name))
        taskMaker.skipSubscription = True
        taskMaker.processWorkload()

        self.workloadSpecPath = os.path.join(self.testDir, name,
                                             "%s/WMSandbox/WMWorkload.pkl" % name)

        return workload

    def testA_BasicTest(self):
        """
        Use the MockPlugin to create a simple test
        Check to see that all the jobs were "submitted",
        don't care about thresholds
        """
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)

        nSubs = 2
        nJobs = 20
        site = "T2_US_UCSD"

        self.setResourceThresholds(site, pendingSlots=50, runningSlots=100,
                                   tasks=['Processing', 'Merge'],
                                   Processing={'pendingSlots': 50, 'runningSlots': 100},
                                   Merge={'pendingSlots': 50, 'runningSlots': 100})

        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # Do pre-submit check
        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        jobSubmitter = JobSubmitterPoller(config=config)
        jobSubmitter.algorithm()

        # Check that jobs are in the right state
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), 0)
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        # Check assigned locations
        getLocationAction = self.daoFactory(classname="Jobs.GetLocation")
        for jobId in result:
            loc = getLocationAction.execute(jobid=jobId)
            self.assertEqual(loc, [['T2_US_UCSD']])

        # Run another cycle, it shouldn't submit anything.
        # There isn't anything to submit
        jobSubmitter.algorithm()
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), 0)
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        nSubs = 1
        nJobs = 10

        # Submit another 10 jobs
        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site, taskType="Merge")
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # Check that the jobs are available for submission and run another cycle
        result = getJobsAction.execute(state='Created', jobType="Merge")
        self.assertEqual(len(result), nSubs * nJobs)
        jobSubmitter.algorithm()

        # Check that the last 10 jobs were submitted as well.
        result = getJobsAction.execute(state='Created', jobType="Merge")
        self.assertEqual(len(result), 0)
        result = getJobsAction.execute(state='Executing', jobType="Merge")
        self.assertEqual(len(result), nSubs * nJobs)

        return

    def testB_thresholdTest(self):
        """
        _testB_thresholdTest_

        Check that the threshold management is working,
        this requires checks on pending/running jobs globally
        at a site and per task/site
        """
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)

        nSubs = 5
        nJobs = 10

        site = "T1_US_FNAL"
        self.setResourceThresholds(site, pendingSlots=50, runningSlots=220,
                                   tasks=['Processing', 'Merge'],
                                   Processing={'pendingSlots': 45, 'runningSlots': 200},
                                   Merge={'pendingSlots': 10, 'runningSlots': 20, 'priority': 5})

        # Always initialize the submitter after setting the sites, flaky!
        jobSubmitter = JobSubmitterPoller(config=config)

        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # Do pre-submit check
        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        jobSubmitter.algorithm()

        # Check that jobs are in the right state,
        # here we are limited by the pending threshold for the Processing task (45)
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), 5)
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 45)

        # Check assigned locations
        getLocationAction = self.daoFactory(classname="Jobs.GetLocation")
        for jobId in result:
            loc = getLocationAction.execute(jobid=jobId)
            self.assertEqual(loc, [['T1_US_FNAL']])

        # Run another cycle, it shouldn't submit anything.
        # Jobs are still in pending
        jobSubmitter.algorithm()
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), 5)
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 45)

        # Now put 10 Merge jobs, only 5 can be submitted,
        # there we hit the global pending threshold for the site
        nSubs = 1
        nJobs = 10
        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site, taskType='Merge')
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter.algorithm()
        result = getJobsAction.execute(state='Created', jobType="Merge")
        self.assertEqual(len(result), 5)
        result = getJobsAction.execute(state='Executing', jobType="Merge")
        self.assertEqual(len(result), 5)
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), 5)
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 45)

        # Now let's test running thresholds
        # The scenario will be setup as follows: Move all current jobs as running
        # Create 300 Processing jobs and 300 merge jobs
        # Run 5 polling cycles, moving all pending jobs to running in between
        # Result is, merge is left at 30 running 0 pending and processing is left at 240 running 0 pending
        # Processing has 110 jobs in queue and Merge 280
        # This tests all threshold dynamics including the prioritization of merge over processing
        nSubs = 1
        nJobs = 300
        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site)
        jobGroupList.extend(self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                                 task=workload.getTask("ReReco"),
                                                 workloadSpec=self.workloadSpecPath,
                                                 site=site, taskType='Merge'))
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        getRunJobID = self.baDaoFactory(classname="LoadByWMBSID")
        setRunJobStatus = self.baDaoFactory(classname="SetStatus")

        for i in range(5):
            result = getJobsAction.execute(state='Executing')
            binds = []
            for jobId in result:
                binds.append({'id': jobId, 'retry_count': 0})
            runJobIds = getRunJobID.execute(binds)
            setRunJobStatus.execute([x['id'] for x in runJobIds], 'Running')
            jobSubmitter.algorithm()

        result = getJobsAction.execute(state='Executing', jobType='Processing')
        self.assertEqual(len(result), 240)
        result = getJobsAction.execute(state='Created', jobType='Processing')
        self.assertEqual(len(result), 110)
        result = getJobsAction.execute(state='Executing', jobType='Merge')
        self.assertEqual(len(result), 30)
        result = getJobsAction.execute(state='Created', jobType='Merge')
        self.assertEqual(len(result), 280)

        return

    def testC_prioTest(self):
        """
        _testC_prioTest_

        Test whether the correct job type, workflow and task id priorities
        are respected in the DAO
        """
        workload1 = self.createTestWorkload(name='testWorkload1')
        workload2 = self.createTestWorkload(name='testWorkload2')
        workload3 = self.createTestWorkload(name='testWorkload3')
        workload4 = self.createTestWorkload(name='testWorkload4')

        config = self.getConfig()
        changeState = ChangeState(config)
        getJobsAction = self.daoFactory(classname="Jobs.ListForSubmitter")

        site = "T1_US_FNAL"
        self.setResourceThresholds(site, pendingSlots=1000, runningSlots=1000,
                                   tasks=['Processing', 'Merge', 'Production', 'Harvesting', 'LogCollect'],
                                   Processing={'pendingSlots': 1000, 'runningSlots': 1000},
                                   Merge={'pendingSlots': 1000, 'runningSlots': 10000},
                                   Production={'pendingSlots': 1000, 'runningSlots': 1000},
                                   Harvesting={'pendingSlots': 1000, 'runningSlots': 1000},
                                   LogCollect={'pendingSlots': 1000, 'runningSlots': 1000})

        nSubs = 1
        nJobs = 5
        jobGroupList = []
        jobGroup = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload1.getTask("ReReco"),
                                        workloadSpec=self.workloadSpecPath,
                                        site=site, name='OldestWorkflow')  # task_id = 1
        jobGroupList.extend(jobGroup)
        jobGroup = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload1.getTask("ReReco"),
                                        workloadSpec=self.workloadSpecPath,
                                        site=site, taskType='Merge')  # task_id = 2
        jobGroupList.extend(jobGroup)
        jobGroup = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                        task=workload1.getTask("ReReco"),
                                        workloadSpec=self.workloadSpecPath,
                                        site=site, taskType='LogCollect')  # task_id = 3
        jobGroupList.extend(jobGroup)

        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # retrieve all 15 jobs created so far
        result = getJobsAction.execute(limitRows=100)
        self.assertItemsEqual([int(j['task_prio']) for j in result],
                              [4] * 5 + [2] * 5 + [0] * 5)
        self.assertItemsEqual([int(j['wf_priority']) for j in result],
                              [1] * 15)
        self.assertItemsEqual([int(j['task_id']) for j in result],
                              [2] * 5 + [3] * 5 + [1] * 5)

        # now retrieve only 6 jobs (5 Merge and 1 LogCollect), wf prio=1
        result = getJobsAction.execute(limitRows=6)
        self.assertItemsEqual([int(j['task_prio']) for j in result],
                              [4] * 5 + [2] * 1)

        jobGroupList = []
        jobGroup = self.createJobGroups(nSubs=nSubs, nJobs=nJobs, wfPrio=2,
                                        task=workload2.getTask("ReReco"),
                                        workloadSpec=self.workloadSpecPath,
                                        site=site, taskType='Merge')  # task_id = 4
        jobGroupList.extend(jobGroup)
        jobGroup = self.createJobGroups(nSubs=nSubs, nJobs=nJobs, wfPrio=3,
                                        task=workload3.getTask("ReReco"),
                                        workloadSpec=self.workloadSpecPath,
                                        site=site, taskType='Processing')  # task_id = 5
        jobGroupList.extend(jobGroup)
        jobGroup = self.createJobGroups(nSubs=nSubs, nJobs=nJobs, wfPrio=3,
                                        task=workload3.getTask("ReReco"),
                                        workloadSpec=self.workloadSpecPath,
                                        site=site, taskType='LogCollect')  # task_id = 6
        jobGroupList.extend(jobGroup)

        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # retrieve all 30 jobs created so far
        result = getJobsAction.execute(limitRows=100)
        self.assertItemsEqual([int(j['task_prio']) for j in result],
                              [4] * 10 + [2] * 10 + [0] * 10)
        # merge prio 2, merge prio 1, logCol prio 3, logCol prio 1, proc prio 3, proc prio 1
        self.assertItemsEqual([int(j['wf_priority']) for j in result],
                              [2] * 5 + [1] * 5 + [3] * 5 + [1] * 5 + [3] * 5 + [1] * 5)
        # merge id 4, merge id 2, logCol id 6, logCol id 3, proc id 5, proc id 1
        self.assertItemsEqual([int(j['task_id']) for j in result],
                              [4] * 5 + [2] * 5 + [6] * 5 + [3] * 5 + [5] * 5 + [1] * 5)

        jobGroupList = []
        jobGroup = self.createJobGroups(nSubs=nSubs, nJobs=nJobs, wfPrio=2,
                                        task=workload4.getTask("ReReco"),
                                        workloadSpec=self.workloadSpecPath,
                                        site=site, taskType='Merge')  # task_id = 7
        jobGroupList.extend(jobGroup)

        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # retrieve all 15 Merge jobs created so far
        result = getJobsAction.execute(limitRows=15)
        self.assertItemsEqual([int(j['task_prio']) for j in result], [4] * 15)
        # merge prio 2, merge prio 2, merge prio 1
        self.assertItemsEqual([int(j['wf_priority']) for j in result],
                              [2] * 10 + [1] * 5)
        # merge id 7, merge id 4, merge id 2
        self.assertItemsEqual([int(j['task_id']) for j in result],
                              [7] * 5 + [4] * 5 + [2] * 5)

    def testC_prioritization(self):
        """
        _testC_prioritization_

        Check that jobs are prioritized by job type and by oldest workflow
        """
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)

        nSubs = 1
        nJobs = 10

        site = "T1_US_FNAL"
        self.setResourceThresholds(site, pendingSlots=10, runningSlots=10000,
                                   tasks=['Processing', 'Merge'],
                                   Processing={'pendingSlots': 50, 'runningSlots': 10000},
                                   Merge={'pendingSlots': 10, 'runningSlots': 10000, 'priority': 5})

        # Always initialize the submitter after setting the sites, flaky!
        jobSubmitter = JobSubmitterPoller(config=config)

        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site, name='OldestWorkflow')
        jobGroupList.extend(self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                                 task=workload.getTask("ReReco"),
                                                 workloadSpec=self.workloadSpecPath,
                                                 site=site, taskType='Merge'))
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter.algorithm()

        # Merge goes first
        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='Created', jobType="Merge")
        self.assertEqual(len(result), 0)
        result = getJobsAction.execute(state='Executing', jobType="Merge")
        self.assertEqual(len(result), 10)
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), 10)
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 0)

        # Create a newer workflow processing, and after some new jobs for an old workflow
        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site, name='OldestWorkflow')
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site, name='NewestWorkflow')
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # Move pending jobs to running
        getRunJobID = self.baDaoFactory(classname="LoadByWMBSID")
        setRunJobStatus = self.baDaoFactory(classname="SetStatus")
        for idx in range(2):
            result = getJobsAction.execute(state='Executing')
            binds = []
            for jobId in result:
                binds.append({'id': jobId, 'retry_count': 0})
            runJobIds = getRunJobID.execute(binds)
            setRunJobStatus.execute([x['id'] for x in runJobIds], 'Running')

            # Run again on created workflows
            jobSubmitter.algorithm()

            result = getJobsAction.execute(state='Created', jobType="Merge")
            self.assertEqual(len(result), 0)
            result = getJobsAction.execute(state='Executing', jobType="Merge")
            self.assertEqual(len(result), 10)
            result = getJobsAction.execute(state='Created', jobType="Processing")
            self.assertEqual(len(result), 30 - (idx + 1) * 10)
            result = getJobsAction.execute(state='Executing', jobType="Processing")
            self.assertEqual(len(result), (idx + 1) * 10)

            # Check that older workflow goes first even with newer jobs
            getWorkflowAction = self.daoFactory(classname="Jobs.GetWorkflowTask")
            workflows = getWorkflowAction.execute(result)
            for workflow in workflows:
                self.assertEqual(workflow['name'], 'OldestWorkflow')

        return

    def testD_SubmitFailed(self):
        """
        _testD_SubmitFailed_

        Check if jobs without a possible site to run at go to SubmitFailed
        """
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)

        nSubs = 2
        nJobs = 10

        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            site=[],
                                            workloadSpec=self.workloadSpecPath)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter = JobSubmitterPoller(config=config)
        jobSubmitter.algorithm()

        # Jobs should go to submit failed
        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='SubmitFailed', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        return

    def testE_SiteModesTest(self):
        """
        _testE_SiteModesTest_

        Test the behavior of the submitter in response to the different
        states of the sites
        """
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)
        nSubs = 1
        nJobs = 20

        sites = ['T2_US_Florida', 'T2_RU_INR', 'T3_CO_Uniandes', 'T1_US_FNAL']
        for site in sites:
            self.setResourceThresholds(site, pendingSlots=10, runningSlots=999999,
                                       tasks=['Processing', 'Merge'],
                                       Processing={'pendingSlots': 10, 'runningSlots': 999999},
                                       Merge={'pendingSlots': 10, 'runningSlots': 999999, 'priority': 5})

        myResourceControl = ResourceControl(config)
        myResourceControl.changeSiteState('T2_US_Florida', 'Draining')
        # First test that we prefer Normal over drain, and T1 over T2/T3
        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            site=[x for x in sites],
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')
        jobSubmitter = JobSubmitterPoller(config=config)

        # Actually run it
        jobSubmitter.algorithm()

        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        # All jobs should be at either FNAL, Taiwan or Uniandes. It's a random selection
        # Check assigned locations
        getLocationAction = self.daoFactory(classname="Jobs.GetLocation")
        locationDict = getLocationAction.execute([{'jobid': x} for x in result])
        for entry in locationDict:
            loc = entry['site_name']
            self.assertNotEqual(loc, 'T2_US_Florida')

        # Now set everything to down, check we don't submit anything
        for site in sites:
            myResourceControl.changeSiteState(site, 'Down')
        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            site=[x for x in sites],
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')
        jobSubmitter.algorithm()

        # Nothing is submitted despite the empty slots at Uniandes and Florida
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        # Now set everything to Drain and create Merge jobs.
Those should be submitted for site in sites: myResourceControl.changeSiteState(site, 'Draining') nSubsMerge = 1 nJobsMerge = 5 jobGroupList = self.createJobGroups(nSubs=nSubsMerge, nJobs=nJobsMerge, site=[x for x in sites], task=workload.getTask("ReReco"), workloadSpec=self.workloadSpecPath, taskType='Merge') for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') jobSubmitter.algorithm() result = getJobsAction.execute(state='Executing', jobType='Merge') self.assertEqual(len(result), nSubsMerge * nJobsMerge) # Now set everything to Aborted, and create Merge jobs. Those should fail # since the can only run at one place for site in sites: myResourceControl.changeSiteState(site, 'Aborted') nSubsMerge = 1 nJobsMerge = 5 jobGroupList = self.createJobGroups(nSubs=nSubsMerge, nJobs=nJobsMerge, site=[x for x in sites], task=workload.getTask("ReReco"), workloadSpec=self.workloadSpecPath, taskType='Merge') for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') jobSubmitter.algorithm() result = getJobsAction.execute(state='SubmitFailed', jobType='Merge') self.assertEqual(len(result), nSubsMerge * nJobsMerge) result = getJobsAction.execute(state='Executing', jobType='Processing') self.assertEqual(len(result), nSubs * nJobs) return def testJobSiteDrain(self): """ _testJobSiteDrain_ Test the behavior of jobs pending to a single site that is in drain mode """ workload = self.createTestWorkload() config = self.getConfig() jobSubmitter = JobSubmitterPoller(config=config) myResourceControl = ResourceControl(config) changeState = ChangeState(config) getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs") nSubs = 1 nJobs = 30 site = 'T2_US_Nebraska' self.setResourceThresholds(site, pendingSlots=100, runningSlots=100, tasks=['Processing', 'Merge'], Processing={'pendingSlots': 10, 'runningSlots': 10}, Merge={'pendingSlots': 10, 'runningSlots': 10, 'priority': 5}) jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs, 
site=[site], task=workload.getTask("ReReco"), workloadSpec=self.workloadSpecPath) for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') # submit first 10 jobs jobSubmitter.algorithm() result = getJobsAction.execute(state='Executing', jobType="Processing") self.assertEqual(len(result), 10) myResourceControl.changeSiteState(site, 'Draining') # site is now in drain, so don't submit anything jobSubmitter.algorithm() # jobs were supposed to get killed, but I guess the MockPlugin doesnt do anything result = getJobsAction.execute(state='Executing', jobType="Processing") self.assertEqual(len(result), 10) result = getJobsAction.execute(state='created', jobType="Processing") self.assertEqual(len(result), 20) result = getJobsAction.execute(state='submitfailed', jobType="Processing") self.assertEqual(len(result), 0) # make sure the drain grace period expires... time.sleep(3) jobSubmitter.algorithm() result = getJobsAction.execute(state='Executing', jobType="Processing") self.assertEqual(len(result), 10) # the remaining jobs should have gone to submitfailed by now result = getJobsAction.execute(state='submitfailed', jobType="Processing") self.assertEqual(len(result), 20) result = getJobsAction.execute(state='created', jobType="Processing") self.assertEqual(len(result), 0) @attr('integration') def testF_PollerProfileTest(self): """ _testF_PollerProfileTest_ Submit a lot of jobs and test how long it takes for them to actually be submitted """ workload = self.createTestWorkload() config = self.getConfig() changeState = ChangeState(config) nSubs = 100 nJobs = 100 site = "T1_US_FNAL" self.setResourceThresholds(site, pendingSlots=20000, runningSlots=999999, tasks=['Processing', 'Merge'], Processing={'pendingSlots': 10000, 'runningSlots': 999999}, Merge={'pendingSlots': 10000, 'runningSlots': 999999, 'priority': 5}) # Always initialize the submitter after setting the sites, flaky! 
JobSubmitterPoller(config=config) jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs, task=workload.getTask("ReReco"), workloadSpec=self.workloadSpecPath, site=site) jobGroupList.extend(self.createJobGroups(nSubs=nSubs, nJobs=nJobs, task=workload.getTask("ReReco"), workloadSpec=self.workloadSpecPath, site=site, taskType='Merge')) for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') # Actually run it startTime = time.time() cProfile.runctx("JobSubmitterPoller(config=config).algorithm()", globals(), locals(), filename="testStats.stat") stopTime = time.time() print("Job took %f seconds to complete" % (stopTime - startTime)) p = pstats.Stats('testStats.stat') p.sort_stats('cumulative') p.print_stats() return @attr('integration') def testMemoryProfile(self): """ _testMemoryProfile_ Creates 20k jobs and keep refreshing the cache and submitting them between the components cycle Example using memory_profiler library, unfortunately the source code has to be updated with decorators. NOTE: Never run it on jenkins """ workload = self.createTestWorkload() config = self.getConfig() changeState = ChangeState(config) # myResourceControl = ResourceControl(config) nSubs = 20 nJobs = 100 sites = ['T2_US_Florida', 'T2_RU_INR', 'T3_CO_Uniandes', 'T1_US_FNAL'] allSites = CRIC().PSNtoPNNMap('*') for site in allSites: self.setResourceThresholds(site, pendingSlots=20000, runningSlots=999999, tasks=['Processing', 'Merge'], Processing={'pendingSlots': 10000, 'runningSlots': 999999}, Merge={'pendingSlots': 10000, 'runningSlots': 999999, 'priority': 5}) # Always initialize the submitter after setting the sites, flaky! 
jobSubmitter = JobSubmitterPoller(config=config) self.createJobGroups(nSubs=nSubs, nJobs=nJobs, wfPrio=10, task=workload.getTask("ReReco"), workloadSpec=self.workloadSpecPath, site=[x for x in sites], changeState=changeState) # Actually run it jobSubmitter.algorithm() # cycle 1 self.createJobGroups(nSubs=nSubs, nJobs=nJobs, wfPrio=10, task=workload.getTask("ReReco"), workloadSpec=self.workloadSpecPath, site=[x for x in sites], changeState=changeState) # myResourceControl.changeSiteState('T2_US_Florida', 'Draining') jobSubmitter.algorithm() # cycle 2 self.createJobGroups(nSubs=nSubs, nJobs=nJobs, wfPrio=10, task=workload.getTask("ReReco"), workloadSpec=self.workloadSpecPath, site=[x for x in sites], changeState=changeState) # myResourceControl.changeSiteState('T2_RU_INR', 'Draining') jobSubmitter.algorithm() # cycle 3 self.createJobGroups(nSubs=nSubs, nJobs=nJobs, wfPrio=10, task=workload.getTask("ReReco"), workloadSpec=self.workloadSpecPath, site=[x for x in sites], changeState=changeState) # myResourceControl.changeSiteState('T3_CO_Uniandes', 'Draining') jobSubmitter.algorithm() # cycle 4 # myResourceControl.changeSiteState('T2_RU_INR', 'Normal') jobSubmitter.algorithm() # cycle 5 # myResourceControl.changeSiteState('T2_US_Florida', 'Normal') jobSubmitter.algorithm() # cycle 6 # myResourceControl.changeSiteState('T2_RU_INR', 'Normal') jobSubmitter.algorithm() # cycle 7 # myResourceControl.changeSiteState('T3_CO_Uniandes', 'Normal') jobSubmitter.algorithm() # cycle 8 jobSubmitter.algorithm() # cycle 9, nothing to submit return
    def setUp(self):
        """
        setup for test.

        Creates the WMBS/BossAir/ResourceControl/Agent schema, the two
        Couch databases, a set of sites with thresholds in ResourceControl,
        a DB user and two heartbeat components.
        """
        myThread = threading.currentThread()

        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        # Clear out leftovers from any previous (aborted) run before
        # creating a fresh schema
        self.tearDown()
        self.testInit.setSchema(
            customModules=["WMCore.WMBS", "WMCore.BossAir", "WMCore.ResourceControl", "WMCore.Agent.Database"],
            useDefault=False,
        )
        self.testInit.setupCouch("bossair_t/jobs", "JobDump")
        self.testInit.setupCouch("bossair_t/fwjrs", "FWJRDump")

        self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger, dbinterface=myThread.dbi)
        self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")

        # Create sites in resourceControl
        resourceControl = ResourceControl()
        for site in self.sites:
            resourceControl.insertSite(
                siteName=site, pnn="se.%s" % (site), cmsName=site, ceName=site,
                plugin="CondorPlugin", pendingSlots=1000, runningSlots=2000,
            )
            resourceControl.insertThreshold(siteName=site, taskType="Processing", maxSlots=1000, pendingSlots=1000)
        # NOTE(review): the following inserts reuse `site`, i.e. the LAST value
        # of the loop variable above, as cmsName — looks unintentional; confirm.
        resourceControl.insertSite(
            siteName="Xanadu", pnn="se.Xanadu", cmsName=site, ceName="Xanadu", plugin="TestPlugin"
        )
        resourceControl.insertThreshold(siteName="Xanadu", taskType="Processing", maxSlots=10000, pendingSlots=10000)
        resourceControl.insertSite(
            siteName="jade-cms.hip.fi", pnn="madhatter.csc.fi", cmsName=site, ceName="jade-cms.hip.fi",
            plugin="ARCPlugin",
        )
        resourceControl.insertThreshold(
            siteName="jade-cms.hip.fi", taskType="Processing", maxSlots=100, pendingSlots=100
        )
        # using this for glite submissions
        resourceControl.insertSite(
            siteName="grid-ce-01.ba.infn.it", pnn="storm-se-01.ba.infn.it", cmsName=site,
            ceName="grid-ce-01.ba.infn.it", plugin="gLitePlugin",
        )
        resourceControl.insertThreshold(
            siteName="grid-ce-01.ba.infn.it", taskType="Processing", maxSlots=50, pendingSlots=50
        )

        # Create user
        newuser = self.daoFactory(classname="Users.New")
        newuser.execute(dn="tapas", group_name="phgroup", role_name="cmsrole")

        # We actually need the user name
        self.user = getpass.getuser()

        # Change this to the working dir to keep track of error and log files from condor
        self.testDir = self.testInit.generateWorkDir()

        # Set heartbeat: register both this test and a JobTracker component
        componentName = "test"
        self.heartbeatAPI = HeartbeatAPI(componentName)
        self.heartbeatAPI.registerComponent()
        componentName = "JobTracker"
        self.heartbeatAPI2 = HeartbeatAPI(componentName)
        self.heartbeatAPI2.registerComponent()

        return
class JobSubmitterTest(EmulatedUnitTestCase):
    """
    _JobSubmitterTest_

    Test class for the JobSubmitterPoller
    """

    def setUp(self):
        """
        _setUp_

        Standard setup: Now with 100% more couch
        """
        super(JobSubmitterTest, self).setUp()

        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        self.testInit.setSchema(customModules=["WMCore.WMBS", "WMCore.BossAir",
                                               "WMCore.ResourceControl", "WMCore.Agent.Database"])
        self.testInit.setupCouch("jobsubmitter_t/jobs", "JobDump")
        self.testInit.setupCouch("jobsubmitter_t/fwjrs", "FWJRDump")
        self.testInit.setupCouch("wmagent_summary_t", "WMStats")

        myThread = threading.currentThread()
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)
        self.baDaoFactory = DAOFactory(package="WMCore.BossAir",
                                       logger=myThread.logger,
                                       dbinterface=myThread.dbi)

        self.testDir = self.testInit.generateWorkDir()

        # Set heartbeat
        self.componentName = 'JobSubmitter'
        self.heartbeatAPI = HeartbeatAPI(self.componentName)
        self.heartbeatAPI.registerComponent()
        self.configFile = EmulatorSetup.setupWMAgentConfig()
        config = self.getConfig()
        # Mocked LogDB client so components can log without a real LogDB
        myThread.logdbClient = MockLogDB(config.General.central_logdb_url,
                                         config.Agent.hostName, logger=None)
        return

    def tearDown(self):
        """
        _tearDown_

        Standard tearDown
        """
        myThread = threading.currentThread()
        self.testInit.clearDatabase()
        self.testInit.delWorkDir()
        self.testInit.tearDownCouch()
        EmulatorSetup.deleteConfig(self.configFile)
        myThread.logdbClient = None
        return

    def setResourceThresholds(self, site, **options):
        """
        _setResourceThresholds_

        Utility to set resource thresholds.  Inserts `site` into
        ResourceControl with the given site-wide and per-task slots;
        falls back to a small default configuration when no options
        are given.
        """
        if not options:
            options = {'state': 'Normal',
                       'runningSlots': 10,
                       'pendingSlots': 5,
                       'tasks': ['Processing', 'Merge'],
                       'Processing': {'pendingSlots': 5,
                                      'runningSlots': 10},
                       'Merge': {'pendingSlots': 2,
                                 'runningSlots': 5}}

        resourceControl = ResourceControl()
        resourceControl.insertSite(siteName=site,
                                   pnn='se.%s' % (site),
                                   ceName=site,
                                   plugin="MockPlugin",
                                   pendingSlots=options['pendingSlots'],
                                   runningSlots=options['runningSlots'],
                                   cmsName=site)
        for task in options['tasks']:
            resourceControl.insertThreshold(siteName=site, taskType=task,
                                            maxSlots=options[task]['runningSlots'],
                                            pendingSlots=options[task]['pendingSlots'])
        if options.get('state'):
            resourceControl.changeSiteState(site, options.get('state'))

        return

    def createJobGroups(self, nSubs, nJobs, task, workloadSpec, site,
                        taskType='Processing', name=None):
        """
        _createJobGroups_

        Creates a series of jobGroups for submissions:
        one workflow, `nSubs` subscriptions each holding `nJobs` jobs.
        """
        jobGroupList = []

        if name is None:
            name = makeUUID()

        testWorkflow = Workflow(spec=workloadSpec, owner="tapas",
                                name=name, task="basicWorkload/Production")
        testWorkflow.create()

        # Create subscriptions
        for _ in range(nSubs):
            name = makeUUID()

            # Create Fileset, Subscription, jobGroup
            testFileset = Fileset(name=name)
            testFileset.create()
            testSubscription = Subscription(fileset=testFileset,
                                            workflow=testWorkflow,
                                            type=taskType,
                                            split_algo="FileBased")
            testSubscription.create()

            testJobGroup = JobGroup(subscription=testSubscription)
            testJobGroup.create()

            # Create jobs
            self.makeNJobs(name=name, task=task,
                           nJobs=nJobs,
                           jobGroup=testJobGroup,
                           fileset=testFileset,
                           sub=testSubscription.exists(),
                           site=site)

            testFileset.commit()
            testJobGroup.commit()
            jobGroupList.append(testJobGroup)

        return jobGroupList

    def makeNJobs(self, name, task, nJobs, jobGroup, fileset, sub, site):
        """
        _makeNJobs_

        Make and return a WMBS Job and File
        This handles all those damn add-ons
        """
        # Set the CacheDir
        cacheDir = os.path.join(self.testDir, 'CacheDir')

        for n in range(nJobs):
            # First make a file
            # site = self.sites[0]
            testFile = File(lfn="/singleLfn/%s/%s" % (name, n),
                            size=1024, events=10)
            fileset.addFile(testFile)

        fileset.commit()

        location = None
        if isinstance(site, list):
            if len(site) > 0:
                location = site[0]
        else:
            location = site

        index = 0
        for f in fileset.files:
            index += 1
            testJob = Job(name='%s-%i' % (name, index))
            testJob.addFile(f)
            testJob["location"] = location
            testJob["possiblePSN"] = set(site) if isinstance(site, list) else set([site])
            testJob['task'] = task.getPathName()
            testJob['sandbox'] = task.data.input.sandbox
            testJob['spec'] = os.path.join(self.testDir, 'basicWorkload.pcl')
            testJob['mask']['FirstEvent'] = 101
            testJob['priority'] = 101
            testJob['numberOfCores'] = 1
            jobCache = os.path.join(cacheDir, 'Sub_%i' % (sub), 'Job_%i' % (index))
            os.makedirs(jobCache)
            testJob.create(jobGroup)
            testJob['cache_dir'] = jobCache
            testJob.save()
            jobGroup.add(testJob)
            # FIX: pickle emits bytes on Python 3, so the cache file must be
            # opened in binary mode ('wb', not 'w'); the with-block also
            # guarantees the handle is closed even if dump() raises.
            with open(os.path.join(jobCache, 'job.pkl'), 'wb') as output:
                pickle.dump(testJob, output)

        return testJob, testFile

    def getConfig(self):
        """
        _getConfig_

        Gets a basic config from default location
        """
        config = self.testInit.getConfiguration()
        self.testInit.generateWorkDir(config)

        config.component_("Agent")
        config.Agent.WMSpecDirectory = self.testDir
        config.Agent.agentName = 'testAgent'
        config.Agent.hostName = 'testAgent'
        config.Agent.componentName = self.componentName
        config.Agent.useHeartbeat = False

        # First the general stuff
        config.section_("General")
        config.General.workDir = os.getenv("TESTDIR", self.testDir)
        config.General.central_logdb_url = "http://localhost/testlogdb"

        # Now the CoreDatabase information
        config.section_("CoreDatabase")
        config.CoreDatabase.connectUrl = os.getenv("DATABASE")
        config.CoreDatabase.socket = os.getenv("DBSOCK")

        # BossAir and MockPlugin configuration
        config.section_("BossAir")
        config.BossAir.pluginNames = ['MockPlugin']
        # Here Test the CondorPlugin instead of MockPlugin
        # config.BossAir.pluginNames = ['CondorPlugin']
        config.BossAir.pluginDir = 'WMCore.BossAir.Plugins'
        config.BossAir.nCondorProcesses = 1
        config.BossAir.section_("MockPlugin")
        config.BossAir.MockPlugin.fakeReport = os.path.join(getTestBase(),
                                                            'WMComponent_t/JobSubmitter_t',
                                                            "submit.sh")

        # JobSubmitter configuration
        config.component_("JobSubmitter")
        config.JobSubmitter.logLevel = 'DEBUG'
        config.JobSubmitter.maxThreads = 1
        config.JobSubmitter.pollInterval = 10
        config.JobSubmitter.submitScript = os.path.join(getTestBase(),
                                                        'WMComponent_t/JobSubmitter_t',
                                                        'submit.sh')
        config.JobSubmitter.componentDir = os.path.join(self.testDir, 'Components')
        config.JobSubmitter.workerThreads = 2
        config.JobSubmitter.jobsPerWorker = 200

        # JobStateMachine
        config.component_('JobStateMachine')
        config.JobStateMachine.couchurl = os.getenv('COUCHURL')
        config.JobStateMachine.couchDBName = "jobsubmitter_t"
        config.JobStateMachine.jobSummaryDBName = 'wmagent_summary_t'

        # TaskArchive setup (JobSubmitter needs this)
        config.component_("TaskArchiver")
        config.TaskArchiver.ReqMgr2ServiceURL = "https://cmsweb-dev.cern.ch/reqmgr2"

        # Needed, because this is a test
        # FIX: narrowed from a bare `except:` — only swallow the expected
        # filesystem error (dir already exists), not e.g. KeyboardInterrupt.
        try:
            os.makedirs(config.JobSubmitter.componentDir)
        except OSError:
            pass

        return config

    def createTestWorkload(self):
        """
        _createTestWorkload_

        Creates a test workload for us to run on, hold the basic necessities.
        """
        workload = testWorkload()

        taskMaker = TaskMaker(workload, os.path.join(self.testDir, 'workloadTest'))
        taskMaker.skipSubscription = True
        taskMaker.processWorkload()

        self.workloadSpecPath = os.path.join(self.testDir, 'workloadTest',
                                             "TestWorkload/WMSandbox/WMWorkload.pkl")

        return workload

    def testA_BasicTest(self):
        """
        Use the MockPlugin to create a simple test
        Check to see that all the jobs were "submitted",
        don't care about thresholds
        """
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)

        nSubs = 2
        nJobs = 20
        site = "T2_US_UCSD"

        self.setResourceThresholds(site, pendingSlots=50, runningSlots=100,
                                   tasks=['Processing', 'Merge'],
                                   Processing={'pendingSlots': 50, 'runningSlots': 100},
                                   Merge={'pendingSlots': 50, 'runningSlots': 100})

        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # Do pre-submit check
        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        jobSubmitter = JobSubmitterPoller(config=config)
        jobSubmitter.algorithm()

        # Check that jobs are in the right state
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), 0)
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        # Check assigned locations
        getLocationAction = self.daoFactory(classname="Jobs.GetLocation")
        for jobId in result:
            loc = getLocationAction.execute(jobid=jobId)
            self.assertEqual(loc, [['T2_US_UCSD']])

        # Run another cycle, it shouldn't submit anything. There isn't anything to submit
        jobSubmitter.algorithm()
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), 0)
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        nSubs = 1
        nJobs = 10

        # Submit another 10 jobs
        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site,
                                            taskType="Merge")
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # Check that the jobs are available for submission and run another cycle
        result = getJobsAction.execute(state='Created', jobType="Merge")
        self.assertEqual(len(result), nSubs * nJobs)
        jobSubmitter.algorithm()

        # Check that the last 10 jobs were submitted as well.
        result = getJobsAction.execute(state='Created', jobType="Merge")
        self.assertEqual(len(result), 0)
        result = getJobsAction.execute(state='Executing', jobType="Merge")
        self.assertEqual(len(result), nSubs * nJobs)

        return

    def testB_thresholdTest(self):
        """
        _testB_thresholdTest_

        Check that the threshold management is working,
        this requires checks on pending/running jobs globally
        at a site and per task/site
        """
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)

        nSubs = 5
        nJobs = 10
        site = "T1_US_FNAL"

        self.setResourceThresholds(site, pendingSlots=50, runningSlots=220,
                                   tasks=['Processing', 'Merge'],
                                   Processing={'pendingSlots': 45, 'runningSlots': 200},
                                   Merge={'pendingSlots': 10, 'runningSlots': 20, 'priority': 5})

        # Always initialize the submitter after setting the sites, flaky!
        jobSubmitter = JobSubmitterPoller(config=config)

        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # Do pre-submit check
        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        jobSubmitter.algorithm()

        # Check that jobs are in the right state,
        # here we are limited by the pending threshold for the Processing task (45)
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), 5)
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 45)

        # Check assigned locations
        getLocationAction = self.daoFactory(classname="Jobs.GetLocation")
        for jobId in result:
            loc = getLocationAction.execute(jobid=jobId)
            self.assertEqual(loc, [['T1_US_FNAL']])

        # Run another cycle, it shouldn't submit anything. Jobs are still in pending
        jobSubmitter.algorithm()
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), 5)
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 45)

        # Now put 10 Merge jobs, only 5 can be submitted, there we hit the global pending threshold for the site
        nSubs = 1
        nJobs = 10
        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site,
                                            taskType='Merge')
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter.algorithm()
        result = getJobsAction.execute(state='Created', jobType="Merge")
        self.assertEqual(len(result), 5)
        result = getJobsAction.execute(state='Executing', jobType="Merge")
        self.assertEqual(len(result), 5)
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), 5)
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 45)

        # Now let's test running thresholds
        # The scenario will be setup as follows: Move all current jobs as running
        # Create 300 Processing jobs and 300 merge jobs
        # Run 5 polling cycles, moving all pending jobs to running in between
        # Result is, merge is left at 30 running 0 pending and processing is left at 240 running 0 pending
        # Processing has 110 jobs in queue and Merge 280
        # This tests all threshold dynamics including the prioritization of merge over processing
        nSubs = 1
        nJobs = 300
        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site)
        jobGroupList.extend(self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                                 task=workload.getTask("ReReco"),
                                                 workloadSpec=self.workloadSpecPath,
                                                 site=site,
                                                 taskType='Merge'))
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        getRunJobID = self.baDaoFactory(classname="LoadByWMBSID")
        setRunJobStatus = self.baDaoFactory(classname="SetStatus")

        for i in range(5):
            result = getJobsAction.execute(state='Executing')
            binds = []
            for jobId in result:
                binds.append({'id': jobId, 'retry_count': 0})
            runJobIds = getRunJobID.execute(binds)
            setRunJobStatus.execute([x['id'] for x in runJobIds], 'Running')
            jobSubmitter.algorithm()

        result = getJobsAction.execute(state='Executing', jobType='Processing')
        self.assertEqual(len(result), 240)
        result = getJobsAction.execute(state='Created', jobType='Processing')
        self.assertEqual(len(result), 110)
        result = getJobsAction.execute(state='Executing', jobType='Merge')
        self.assertEqual(len(result), 30)
        result = getJobsAction.execute(state='Created', jobType='Merge')
        self.assertEqual(len(result), 280)

        return

    def testC_prioritization(self):
        """
        _testC_prioritization_

        Check that jobs are prioritized by job type and by oldest workflow
        """
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)

        nSubs = 1
        nJobs = 10
        site = "T1_US_FNAL"

        self.setResourceThresholds(site, pendingSlots=10, runningSlots=10000,
                                   tasks=['Processing', 'Merge'],
                                   Processing={'pendingSlots': 50, 'runningSlots': 10000},
                                   Merge={'pendingSlots': 10, 'runningSlots': 10000, 'priority': 5})

        # Always initialize the submitter after setting the sites, flaky!
        jobSubmitter = JobSubmitterPoller(config=config)

        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site, name='OldestWorkflow')
        jobGroupList.extend(self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                                 task=workload.getTask("ReReco"),
                                                 workloadSpec=self.workloadSpecPath,
                                                 site=site, taskType='Merge'))
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter.algorithm()

        # Merge goes first
        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='Created', jobType="Merge")
        self.assertEqual(len(result), 0)
        result = getJobsAction.execute(state='Executing', jobType="Merge")
        self.assertEqual(len(result), 10)
        result = getJobsAction.execute(state='Created', jobType="Processing")
        self.assertEqual(len(result), 10)
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), 0)

        # Create a newer workflow processing, and after some new jobs for an old workflow
        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site, name='OldestWorkflow')
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')
        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site, name='NewestWorkflow')
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # Move pending jobs to running
        getRunJobID = self.baDaoFactory(classname="LoadByWMBSID")
        setRunJobStatus = self.baDaoFactory(classname="SetStatus")
        for idx in range(2):
            result = getJobsAction.execute(state='Executing')
            binds = []
            for jobId in result:
                binds.append({'id': jobId, 'retry_count': 0})
            runJobIds = getRunJobID.execute(binds)
            setRunJobStatus.execute([x['id'] for x in runJobIds], 'Running')

            # Run again on created workflows
            jobSubmitter.algorithm()

            result = getJobsAction.execute(state='Created', jobType="Merge")
            self.assertEqual(len(result), 0)
            result = getJobsAction.execute(state='Executing', jobType="Merge")
            self.assertEqual(len(result), 10)
            result = getJobsAction.execute(state='Created', jobType="Processing")
            self.assertEqual(len(result), 30 - (idx + 1) * 10)
            result = getJobsAction.execute(state='Executing', jobType="Processing")
            self.assertEqual(len(result), (idx + 1) * 10)

            # Check that older workflow goes first even with newer jobs
            getWorkflowAction = self.daoFactory(classname="Jobs.GetWorkflowTask")
            workflows = getWorkflowAction.execute(result)
            for workflow in workflows:
                self.assertEqual(workflow['name'], 'OldestWorkflow')

        return

    def testD_SubmitFailed(self):
        """
        _testD_SubmitFailed_

        Check if jobs without a possible site to run at go to SubmitFailed
        """
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)

        nSubs = 2
        nJobs = 10

        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            site=[],
                                            workloadSpec=self.workloadSpecPath)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter = JobSubmitterPoller(config=config)
        jobSubmitter.algorithm()

        # Jobs should go to submit failed
        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='SubmitFailed', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        return

    def testE_SiteModesTest(self):
        """
        _testE_SiteModesTest_

        Test the behavior of the submitter in response to the different
        states of the sites
        """
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)
        nSubs = 1
        nJobs = 20

        sites = ['T2_US_Florida', 'T2_TW_Taiwan', 'T3_CO_Uniandes', 'T1_US_FNAL']
        for site in sites:
            self.setResourceThresholds(site, pendingSlots=10, runningSlots=-1,
                                       tasks=['Processing', 'Merge'],
                                       Processing={'pendingSlots': 10, 'runningSlots': -1},
                                       Merge={'pendingSlots': 10, 'runningSlots': -1, 'priority': 5})

        myResourceControl = ResourceControl(config)
        myResourceControl.changeSiteState('T2_US_Florida', 'Draining')
        # First test that we prefer Normal over drain, and T1 over T2/T3
        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            site=[x for x in sites],
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter = JobSubmitterPoller(config=config)
        # Actually run it
        jobSubmitter.algorithm()

        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        # All jobs should be at either FNAL, Taiwan or Uniandes. It's a random selection
        # Check assigned locations
        getLocationAction = self.daoFactory(classname="Jobs.GetLocation")
        locationDict = getLocationAction.execute([{'jobid': x} for x in result])
        for entry in locationDict:
            loc = entry['site_name']
            self.assertNotEqual(loc, 'T2_US_Florida')

        # Now set everything to down, check we don't submit anything
        for site in sites:
            myResourceControl.changeSiteState(site, 'Down')
        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            site=[x for x in sites],
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath)
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')
        jobSubmitter.algorithm()

        # Nothing is submitted despite the empty slots at Uniandes and Florida
        result = getJobsAction.execute(state='Executing', jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        # Now set everything to Drain and create Merge jobs. Those should be submitted
        for site in sites:
            myResourceControl.changeSiteState(site, 'Draining')

        nSubsMerge = 1
        nJobsMerge = 5
        jobGroupList = self.createJobGroups(nSubs=nSubsMerge, nJobs=nJobsMerge,
                                            site=[x for x in sites],
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            taskType='Merge')
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter.algorithm()

        result = getJobsAction.execute(state='Executing', jobType='Merge')
        self.assertEqual(len(result), nSubsMerge * nJobsMerge)

        # Now set everything to Aborted, and create Merge jobs. Those should fail
        # since the can only run at one place
        for site in sites:
            myResourceControl.changeSiteState(site, 'Aborted')

        nSubsMerge = 1
        nJobsMerge = 5
        jobGroupList = self.createJobGroups(nSubs=nSubsMerge, nJobs=nJobsMerge,
                                            site=[x for x in sites],
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            taskType='Merge')
        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        jobSubmitter.algorithm()

        result = getJobsAction.execute(state='SubmitFailed', jobType='Merge')
        self.assertEqual(len(result), nSubsMerge * nJobsMerge)
        result = getJobsAction.execute(state='Executing', jobType='Processing')
        self.assertEqual(len(result), nSubs * nJobs)

        return

    @attr('integration')
    def testF_PollerProfileTest(self):
        """
        _testF_PollerProfileTest_

        Submit a lot of jobs and test how long it takes for them to actually be
        submitted
        """
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)

        nSubs = 100
        nJobs = 100
        site = "T1_US_FNAL"

        self.setResourceThresholds(site, pendingSlots=20000, runningSlots=-1,
                                   tasks=['Processing', 'Merge'],
                                   Processing={'pendingSlots': 10000, 'runningSlots': -1},
                                   Merge={'pendingSlots': 10000, 'runningSlots': -1, 'priority': 5})

        # Always initialize the submitter after setting the sites, flaky!
        JobSubmitterPoller(config=config)

        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=self.workloadSpecPath,
                                            site=site)

        jobGroupList.extend(self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                                 task=workload.getTask("ReReco"),
                                                 workloadSpec=self.workloadSpecPath,
                                                 site=site, taskType='Merge'))

        for group in jobGroupList:
            changeState.propagate(group.jobs, 'created', 'new')

        # Actually run it, under the profiler
        startTime = time.time()
        cProfile.runctx("JobSubmitterPoller(config=config).algorithm()",
                        globals(), locals(), filename="testStats.stat")
        stopTime = time.time()

        print("Job took %f seconds to complete" % (stopTime - startTime))

        p = pstats.Stats('testStats.stat')
        p.sort_stats('cumulative')
        p.print_stats()

        return
class JobSubmitterTest(unittest.TestCase):
    """
    _JobSubmitterTest_

    Test class for the JobSubmitterPoller
    """

    def setUp(self):
        """
        _setUp_

        Standard setup: Now with 100% more couch
        """
        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        self.testInit.setSchema(customModules=["WMCore.WMBS", "WMCore.BossAir",
                                               "WMCore.ResourceControl", "WMCore.Agent.Database"])
        self.testInit.setupCouch("jobsubmitter_t/jobs", "JobDump")
        self.testInit.setupCouch("jobsubmitter_t/fwjrs", "FWJRDump")
        self.testInit.setupCouch("wmagent_summary_t", "WMStats")

        myThread = threading.currentThread()
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)
        self.baDaoFactory = DAOFactory(package="WMCore.BossAir",
                                       logger=myThread.logger,
                                       dbinterface=myThread.dbi)

        self.testDir = self.testInit.generateWorkDir()

        # Set heartbeat
        self.componentName = "JobSubmitter"
        self.heartbeatAPI = HeartbeatAPI(self.componentName)
        self.heartbeatAPI.registerComponent()
        self.configFile = EmulatorSetup.setupWMAgentConfig()
        return

    def tearDown(self):
        """
        _tearDown_

        Standard tearDown
        """
        self.testInit.clearDatabase()
        self.testInit.delWorkDir()
        self.testInit.tearDownCouch()
        EmulatorSetup.deleteConfig(self.configFile)
        return

    def setResourceThresholds(self, site, **options):
        """
        _setResourceThresholds_

        Utility to set resource thresholds
        """
        if not options:
            options = {"state": "Normal",
                       "runningSlots": 10,
                       "pendingSlots": 5,
                       "tasks": ["Processing", "Merge"],
                       "Processing": {"pendingSlots": 5, "runningSlots": 10},
                       "Merge": {"pendingSlots": 2, "runningSlots": 5}}

        resourceControl = ResourceControl()
        resourceControl.insertSite(siteName=site, pnn="se.%s" % (site), ceName=site,
                                   plugin="MockPlugin", pendingSlots=options["pendingSlots"],
                                   runningSlots=options["runningSlots"], cmsName=site)
        for task in options["tasks"]:
            resourceControl.insertThreshold(siteName=site, taskType=task,
                                            maxSlots=options[task]["runningSlots"],
                                            pendingSlots=options[task]["pendingSlots"])
        if options.get("state"):
            resourceControl.changeSiteState(site, options.get("state"))

        return

    def createJobGroups(self, nSubs, nJobs, task, workloadSpec, site, taskType="Processing", name=None):
        """
        _createJobGroups_

        Creates a series of jobGroups for submissions
        """
        jobGroupList = []

        if name is None:
            name = makeUUID()

        testWorkflow = Workflow(spec=workloadSpec, owner="tapas",
                                name=name, task="basicWorkload/Production")
        testWorkflow.create()

        # Create subscriptions
        for _ in range(nSubs):
            name = makeUUID()

            # Create Fileset, Subscription, jobGroup
            testFileset = Fileset(name=name)
            testFileset.create()
            testSubscription = Subscription(fileset=testFileset,
                                            workflow=testWorkflow,
                                            type=taskType,
                                            split_algo="FileBased")
            testSubscription.create()

            testJobGroup = JobGroup(subscription=testSubscription)
            testJobGroup.create()

            # Create jobs
            self.makeNJobs(name=name, task=task, nJobs=nJobs,
                           jobGroup=testJobGroup, fileset=testFileset,
                           sub=testSubscription.exists(), site=site)

            testFileset.commit()
            testJobGroup.commit()
            jobGroupList.append(testJobGroup)

        return jobGroupList

    def makeNJobs(self, name, task, nJobs, jobGroup, fileset, sub, site):
        """
        _makeNJobs_

        Make and return a WMBS Job and File
        This handles all those damn add-ons
        """
        # Set the CacheDir
        cacheDir = os.path.join(self.testDir, "CacheDir")

        for n in range(nJobs):
            # First make a file
            # site = self.sites[0]
            testFile = File(lfn="/singleLfn/%s/%s" % (name, n),
                            size=1024, events=10)
            fileset.addFile(testFile)

        fileset.commit()

        location = None
        if isinstance(site, list):
            if len(site) > 0:
                location = site[0]
        else:
            location = site

        index = 0
        for f in fileset.files:
            index += 1
            testJob = Job(name="%s-%i" % (name, index))
            testJob.addFile(f)
            testJob["location"] = location
            testJob["possiblePSN"] = set(site) if isinstance(site, list) else set([site])
            testJob["task"] = task.getPathName()
            testJob["sandbox"] = task.data.input.sandbox
            testJob["spec"] = os.path.join(self.testDir, "basicWorkload.pcl")
            testJob["mask"]["FirstEvent"] = 101
            testJob["priority"] = 101
            testJob["numberOfCores"] = 1
            jobCache = os.path.join(cacheDir, "Sub_%i" % (sub), "Job_%i" % (index))
            os.makedirs(jobCache)
            testJob.create(jobGroup)
            testJob["cache_dir"] = jobCache
            testJob.save()
            jobGroup.add(testJob)
            # FIX: pickle streams are binary, so the cache file must be opened
            # in "wb" mode — opening it in "w" makes pickle.dump raise a
            # TypeError on Python 3. Use a context manager so it is always closed.
            with open(os.path.join(jobCache, "job.pkl"), "wb") as output:
                pickle.dump(testJob, output)

        return testJob, testFile

    def getConfig(self):
        """
        _getConfig_

        Gets a basic config from default location
        """
        config = self.testInit.getConfiguration()
        self.testInit.generateWorkDir(config)

        config.component_("Agent")
        config.Agent.WMSpecDirectory = self.testDir
        config.Agent.agentName = "testAgent"
        config.Agent.componentName = self.componentName
        config.Agent.useHeartbeat = False

        # First the general stuff
        config.section_("General")
        config.General.workDir = os.getenv("TESTDIR", self.testDir)

        # Now the CoreDatabase information
        config.section_("CoreDatabase")
        config.CoreDatabase.connectUrl = os.getenv("DATABASE")
        config.CoreDatabase.socket = os.getenv("DBSOCK")

        # BossAir and MockPlugin configuration
        config.section_("BossAir")
        config.BossAir.pluginNames = ["MockPlugin"]
        # Here Test the CondorPlugin instead of MockPlugin
        # config.BossAir.pluginNames = ['CondorPlugin']
        config.BossAir.pluginDir = "WMCore.BossAir.Plugins"
        config.BossAir.multicoreTaskTypes = ["MultiProcessing", "MultiProduction"]
        config.BossAir.nCondorProcesses = 1
        config.BossAir.section_("MockPlugin")
        config.BossAir.MockPlugin.fakeReport = os.path.join(getTestBase(),
                                                            "WMComponent_t/JobSubmitter_t",
                                                            "submit.sh")

        # JobSubmitter configuration
        config.component_("JobSubmitter")
        config.JobSubmitter.logLevel = "DEBUG"
        config.JobSubmitter.maxThreads = 1
        config.JobSubmitter.pollInterval = 10
        config.JobSubmitter.submitScript = os.path.join(getTestBase(),
                                                        "WMComponent_t/JobSubmitter_t",
                                                        "submit.sh")
        config.JobSubmitter.componentDir = os.path.join(self.testDir, "Components")
        config.JobSubmitter.workerThreads = 2
        config.JobSubmitter.jobsPerWorker = 200

        # JobStateMachine
        config.component_("JobStateMachine")
        config.JobStateMachine.couchurl = os.getenv("COUCHURL")
        config.JobStateMachine.couchDBName = "jobsubmitter_t"
        config.JobStateMachine.jobSummaryDBName = "wmagent_summary_t"

        # Needed, because this is a test
        os.makedirs(config.JobSubmitter.componentDir)

        return config

    def createTestWorkload(self, workloadName="Tier1ReReco"):
        """
        _createTestWorkload_

        Creates a test workload for us to run on, hold the basic necessities.
        """
        workload = testWorkload(workloadName)

        taskMaker = TaskMaker(workload, os.path.join(self.testDir, "workloadTest"))
        taskMaker.skipSubscription = True
        taskMaker.processWorkload()

        return workload

    def testA_BasicTest(self):
        """
        Use the MockPlugin to create a simple test
        Check to see that all the jobs were "submitted",
        don't care about thresholds
        """
        workloadName = "basicWorkload"
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)

        nSubs = 2
        nJobs = 20
        site = "T2_US_UCSD"

        self.setResourceThresholds(site, pendingSlots=50, runningSlots=100,
                                   tasks=["Processing", "Merge"],
                                   Processing={"pendingSlots": 50, "runningSlots": 100},
                                   Merge={"pendingSlots": 50, "runningSlots": 100})

        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName),
                                            site=site)
        for group in jobGroupList:
            changeState.propagate(group.jobs, "created", "new")

        # Do pre-submit check
        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state="Created", jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        jobSubmitter = JobSubmitterPoller(config=config)
        jobSubmitter.algorithm()

        # Check that jobs are in the right state
        result = getJobsAction.execute(state="Created", jobType="Processing")
        self.assertEqual(len(result), 0)
        result = getJobsAction.execute(state="Executing", jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        # Check assigned locations
        getLocationAction = self.daoFactory(classname="Jobs.GetLocation")
        for jobId in result:
            loc = getLocationAction.execute(jobid=jobId)
            self.assertEqual(loc, [["T2_US_UCSD"]])

        # Run another cycle, it shouldn't submit anything. There isn't anything to submit
        jobSubmitter.algorithm()
        result = getJobsAction.execute(state="Created", jobType="Processing")
        self.assertEqual(len(result), 0)
        result = getJobsAction.execute(state="Executing", jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        nSubs = 1
        nJobs = 10

        # Submit another 10 jobs
        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName),
                                            site=site, taskType="Merge")
        for group in jobGroupList:
            changeState.propagate(group.jobs, "created", "new")

        # Check that the jobs are available for submission and run another cycle
        result = getJobsAction.execute(state="Created", jobType="Merge")
        self.assertEqual(len(result), nSubs * nJobs)
        jobSubmitter.algorithm()

        # Check that the last 10 jobs were submitted as well.
        result = getJobsAction.execute(state="Created", jobType="Merge")
        self.assertEqual(len(result), 0)
        result = getJobsAction.execute(state="Executing", jobType="Merge")
        self.assertEqual(len(result), nSubs * nJobs)

        return

    def testB_thresholdTest(self):
        """
        _testB_thresholdTest_

        Check that the threshold management is working,
        this requires checks on pending/running jobs globally
        at a site and per task/site
        """
        workloadName = "basicWorkload"
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)

        nSubs = 5
        nJobs = 10
        site = "T1_US_FNAL"

        self.setResourceThresholds(site, pendingSlots=50, runningSlots=200,
                                   tasks=["Processing", "Merge"],
                                   Processing={"pendingSlots": 45, "runningSlots": -1},
                                   Merge={"pendingSlots": 10, "runningSlots": 20, "priority": 5})

        # Always initialize the submitter after setting the sites, flaky!
        jobSubmitter = JobSubmitterPoller(config=config)

        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName),
                                            site=site)
        for group in jobGroupList:
            changeState.propagate(group.jobs, "created", "new")

        # Do pre-submit check
        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state="Created", jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        jobSubmitter.algorithm()

        # Check that jobs are in the right state,
        # here we are limited by the pending threshold for the Processing task (45)
        result = getJobsAction.execute(state="Created", jobType="Processing")
        self.assertEqual(len(result), 5)
        result = getJobsAction.execute(state="Executing", jobType="Processing")
        self.assertEqual(len(result), 45)

        # Check assigned locations
        getLocationAction = self.daoFactory(classname="Jobs.GetLocation")
        for jobId in result:
            loc = getLocationAction.execute(jobid=jobId)
            self.assertEqual(loc, [["T1_US_FNAL"]])

        # Run another cycle, it shouldn't submit anything. Jobs are still in pending
        jobSubmitter.algorithm()
        result = getJobsAction.execute(state="Created", jobType="Processing")
        self.assertEqual(len(result), 5)
        result = getJobsAction.execute(state="Executing", jobType="Processing")
        self.assertEqual(len(result), 45)

        # Now put 10 Merge jobs, only 5 can be submitted, there we hit the global pending threshold for the site
        nSubs = 1
        nJobs = 10
        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName),
                                            site=site, taskType="Merge")
        for group in jobGroupList:
            changeState.propagate(group.jobs, "created", "new")

        jobSubmitter.algorithm()

        result = getJobsAction.execute(state="Created", jobType="Merge")
        self.assertEqual(len(result), 5)
        result = getJobsAction.execute(state="Executing", jobType="Merge")
        self.assertEqual(len(result), 5)
        result = getJobsAction.execute(state="Created", jobType="Processing")
        self.assertEqual(len(result), 5)
        result = getJobsAction.execute(state="Executing", jobType="Processing")
        self.assertEqual(len(result), 45)

        # Now let's test running thresholds
        # The scenario will be setup as follows: Move all current jobs as running
        # Create 300 Processing jobs and 300 merge jobs
        # Run 5 polling cycles, moving all pending jobs to running in between
        # Result is, merge is left at 25 running 0 pending and processing is left at 215 running 0 pending
        # Processing has 135 jobs in queue and Merge 285
        # This tests all threshold dynamics including the prioritization of merge over processing
        nSubs = 1
        nJobs = 300
        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName),
                                            site=site)
        jobGroupList.extend(self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                                 task=workload.getTask("ReReco"),
                                                 workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName),
                                                 site=site, taskType="Merge"))
        for group in jobGroupList:
            changeState.propagate(group.jobs, "created", "new")

        getRunJobID = self.baDaoFactory(classname="LoadByWMBSID")
        setRunJobStatus = self.baDaoFactory(classname="SetStatus")

        for _ in range(5):
            result = getJobsAction.execute(state="Executing")
            binds = []
            for jobId in result:
                binds.append({"id": jobId, "retry_count": 0})
            runJobIds = getRunJobID.execute(binds)
            setRunJobStatus.execute([x["id"] for x in runJobIds], "Running")
            jobSubmitter.algorithm()

        result = getJobsAction.execute(state="Executing", jobType="Processing")
        self.assertEqual(len(result), 215)
        result = getJobsAction.execute(state="Created", jobType="Processing")
        self.assertEqual(len(result), 135)
        result = getJobsAction.execute(state="Executing", jobType="Merge")
        self.assertEqual(len(result), 25)
        result = getJobsAction.execute(state="Created", jobType="Merge")
        self.assertEqual(len(result), 285)

        return

    def testC_prioritization(self):
        """
        _testC_prioritization_

        Check that jobs are prioritized by job type and by oldest workflow
        """
        workloadName = "basicWorkload"
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)

        nSubs = 1
        nJobs = 10
        site = "T1_US_FNAL"

        self.setResourceThresholds(site, pendingSlots=10, runningSlots=-1,
                                   tasks=["Processing", "Merge"],
                                   Processing={"pendingSlots": 50, "runningSlots": -1},
                                   Merge={"pendingSlots": 10, "runningSlots": -1, "priority": 5})

        # Always initialize the submitter after setting the sites, flaky!
        jobSubmitter = JobSubmitterPoller(config=config)

        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName),
                                            site=site, name="OldestWorkflow")
        jobGroupList.extend(self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                                 task=workload.getTask("ReReco"),
                                                 workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName),
                                                 site=site, taskType="Merge"))
        for group in jobGroupList:
            changeState.propagate(group.jobs, "created", "new")

        jobSubmitter.algorithm()

        # Merge goes first
        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state="Created", jobType="Merge")
        self.assertEqual(len(result), 0)
        result = getJobsAction.execute(state="Executing", jobType="Merge")
        self.assertEqual(len(result), 10)
        result = getJobsAction.execute(state="Created", jobType="Processing")
        self.assertEqual(len(result), 10)
        result = getJobsAction.execute(state="Executing", jobType="Processing")
        self.assertEqual(len(result), 0)

        # Create a newer workflow processing, and after some new jobs for an old workflow
        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName),
                                            site=site, name="NewestWorkflow")
        jobGroupList.extend(self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                                 task=workload.getTask("ReReco"),
                                                 workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName),
                                                 site=site, name="OldestWorkflow"))
        for group in jobGroupList:
            changeState.propagate(group.jobs, "created", "new")

        # Move pending jobs to running
        getRunJobID = self.baDaoFactory(classname="LoadByWMBSID")
        setRunJobStatus = self.baDaoFactory(classname="SetStatus")

        for idx in range(2):
            result = getJobsAction.execute(state="Executing")
            binds = []
            for jobId in result:
                binds.append({"id": jobId, "retry_count": 0})
            runJobIds = getRunJobID.execute(binds)
            setRunJobStatus.execute([x["id"] for x in runJobIds], "Running")

            # Run again on created workflows
            jobSubmitter.algorithm()

            result = getJobsAction.execute(state="Created", jobType="Merge")
            self.assertEqual(len(result), 0)
            result = getJobsAction.execute(state="Executing", jobType="Merge")
            self.assertEqual(len(result), 10)
            result = getJobsAction.execute(state="Created", jobType="Processing")
            self.assertEqual(len(result), 30 - (idx + 1) * 10)
            result = getJobsAction.execute(state="Executing", jobType="Processing")
            self.assertEqual(len(result), (idx + 1) * 10)

            # Check that older workflow goes first even with newer jobs
            getWorkflowAction = self.daoFactory(classname="Jobs.GetWorkflowTask")
            workflows = getWorkflowAction.execute(result)
            for workflow in workflows:
                self.assertEqual(workflow["name"], "OldestWorkflow")

        return

    def testD_SubmitFailed(self):
        """
        _testD_SubmitFailed_

        Check if jobs without a possible site to run at go to SubmitFailed
        """
        workloadName = "basicWorkload"
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)

        nSubs = 2
        nJobs = 10

        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            site=[],
                                            workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName))
        for group in jobGroupList:
            changeState.propagate(group.jobs, "created", "new")

        jobSubmitter = JobSubmitterPoller(config=config)
        jobSubmitter.algorithm()

        # Jobs should go to submit failed
        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state="SubmitFailed", jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        return

    def testE_SiteModesTest(self):
        """
        _testE_SiteModesTest_

        Test the behavior of the submitter in response to the different
        states of the sites
        """
        workloadName = "basicWorkload"
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)
        nSubs = 1
        nJobs = 20

        sites = ["T2_US_Florida", "T2_TW_Taiwan", "T3_CO_Uniandes", "T1_US_FNAL"]
        for site in sites:
            self.setResourceThresholds(site, pendingSlots=10, runningSlots=-1,
                                       tasks=["Processing", "Merge"],
                                       Processing={"pendingSlots": 10, "runningSlots": -1},
                                       Merge={"pendingSlots": 10, "runningSlots": -1, "priority": 5})

        myResourceControl = ResourceControl(config)
        myResourceControl.changeSiteState("T2_US_Florida", "Draining")

        # First test that we prefer Normal over drain, and T1 over T2/T3
        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            site=[x for x in sites],
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName))
        for group in jobGroupList:
            changeState.propagate(group.jobs, "created", "new")

        jobSubmitter = JobSubmitterPoller(config=config)
        # Actually run it
        jobSubmitter.algorithm()

        getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs")
        result = getJobsAction.execute(state="Executing", jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        # All jobs should be at either FNAL, Taiwan or Uniandes. It's a random selection
        # Check assigned locations
        getLocationAction = self.daoFactory(classname="Jobs.GetLocation")
        locationDict = getLocationAction.execute([{"jobid": x} for x in result])
        for entry in locationDict:
            loc = entry["site_name"]
            self.assertNotEqual(loc, "T2_US_Florida")

        # Now set everything to down, check we don't submit anything
        for site in sites:
            myResourceControl.changeSiteState(site, "Down")

        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            site=[x for x in sites],
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName))
        for group in jobGroupList:
            changeState.propagate(group.jobs, "created", "new")

        jobSubmitter.algorithm()

        # Nothing is submitted despite the empty slots at Uniandes and Florida
        result = getJobsAction.execute(state="Executing", jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        # Now set everything to Drain and create Merge jobs. Those should be submitted
        for site in sites:
            myResourceControl.changeSiteState(site, "Draining")

        nSubsMerge = 1
        nJobsMerge = 5
        jobGroupList = self.createJobGroups(nSubs=nSubsMerge, nJobs=nJobsMerge,
                                            site=[x for x in sites],
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName),
                                            taskType="Merge")
        for group in jobGroupList:
            changeState.propagate(group.jobs, "created", "new")

        jobSubmitter.algorithm()

        result = getJobsAction.execute(state="Executing", jobType="Merge")
        self.assertEqual(len(result), nSubsMerge * nJobsMerge)

        # Now set everything to Aborted, and create Merge jobs. Those should fail
        # since the can only run at one place
        for site in sites:
            myResourceControl.changeSiteState(site, "Aborted")

        nSubsMerge = 1
        nJobsMerge = 5
        jobGroupList = self.createJobGroups(nSubs=nSubsMerge, nJobs=nJobsMerge,
                                            site=[x for x in sites],
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName),
                                            taskType="Merge")
        for group in jobGroupList:
            changeState.propagate(group.jobs, "created", "new")

        jobSubmitter.algorithm()

        result = getJobsAction.execute(state="SubmitFailed", jobType="Merge")
        self.assertEqual(len(result), nSubsMerge * nJobsMerge)
        result = getJobsAction.execute(state="Executing", jobType="Processing")
        self.assertEqual(len(result), nSubs * nJobs)

        return

    @attr("integration")
    def testF_PollerProfileTest(self):
        """
        _testF_PollerProfileTest_

        Submit a lot of jobs and test how long it takes for
        them to actually be submitted
        """
        workloadName = "basicWorkload"
        workload = self.createTestWorkload()
        config = self.getConfig()
        changeState = ChangeState(config)

        nSubs = 100
        nJobs = 100
        site = "T1_US_FNAL"

        self.setResourceThresholds(site, pendingSlots=20000, runningSlots=-1,
                                   tasks=["Processing", "Merge"],
                                   Processing={"pendingSlots": 10000, "runningSlots": -1},
                                   Merge={"pendingSlots": 10000, "runningSlots": -1, "priority": 5})

        # Always initialize the submitter after setting the sites, flaky!
        JobSubmitterPoller(config=config)

        jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                            task=workload.getTask("ReReco"),
                                            workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName),
                                            site=site)
        jobGroupList.extend(self.createJobGroups(nSubs=nSubs, nJobs=nJobs,
                                                 task=workload.getTask("ReReco"),
                                                 workloadSpec=os.path.join(self.testDir, "workloadTest", workloadName),
                                                 site=site, taskType="Merge"))
        for group in jobGroupList:
            changeState.propagate(group.jobs, "created", "new")

        # Actually run it
        startTime = time.time()
        cProfile.runctx("JobSubmitterPoller(config=config).algorithm()",
                        globals(), locals(), filename="testStats.stat")
        stopTime = time.time()

        # FIX: this was a Python 2 print statement (a SyntaxError on Python 3);
        # use the print() function, consistent with the rest of the file.
        print("Job took %f seconds to complete" % (stopTime - startTime))

        p = pstats.Stats("testStats.stat")
        p.sort_stats("cumulative")
        p.print_stats()

        return
def setUp(self): """ setup for test. """ myThread = threading.currentThread() self.testInit = TestInit(__file__) self.testInit.setLogging() self.testInit.setDatabaseConnection() self.tearDown() self.testInit.setSchema(customModules=[ "WMCore.WMBS", "WMCore.BossAir", "WMCore.ResourceControl", "WMCore.Agent.Database" ], useDefault=False) self.testInit.setupCouch("bossair_t/jobs", "JobDump") self.testInit.setupCouch("bossair_t/fwjrs", "FWJRDump") self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger, dbinterface=myThread.dbi) self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs") #Create sites in resourceControl resourceControl = ResourceControl() for site in self.sites: resourceControl.insertSite(siteName=site, pnn='se.%s' % (site), cmsName=site, ceName=site, plugin="CondorPlugin", pendingSlots=1000, runningSlots=2000) resourceControl.insertThreshold(siteName = site, taskType = 'Processing', \ maxSlots = 1000, pendingSlots = 1000) resourceControl.insertSite(siteName='Xanadu', pnn='se.Xanadu', cmsName=site, ceName='Xanadu', plugin="TestPlugin") resourceControl.insertThreshold(siteName = 'Xanadu', taskType = 'Processing', \ maxSlots = 10000, pendingSlots = 10000) resourceControl.insertSite(siteName='jade-cms.hip.fi', pnn='madhatter.csc.fi', cmsName=site, ceName='jade-cms.hip.fi', plugin="ARCPlugin") resourceControl.insertThreshold(siteName = 'jade-cms.hip.fi', taskType = 'Processing', \ maxSlots = 100, pendingSlots = 100) # using this for glite submissions resourceControl.insertSite(siteName='grid-ce-01.ba.infn.it', pnn='storm-se-01.ba.infn.it', cmsName=site, ceName='grid-ce-01.ba.infn.it', plugin='gLitePlugin') resourceControl.insertThreshold(siteName = 'grid-ce-01.ba.infn.it', taskType = 'Processing', \ maxSlots = 50, pendingSlots = 50) # Create user newuser = self.daoFactory(classname="Users.New") newuser.execute(dn="tapas", group_name="phgroup", role_name="cmsrole") # We actually need the user name self.user = getpass.getuser() # Change 
this to the working dir to keep track of error and log files from condor self.testDir = self.testInit.generateWorkDir() # Set heartbeat componentName = 'test' self.heartbeatAPI = HeartbeatAPI(componentName) self.heartbeatAPI.registerComponent() componentName = 'JobTracker' self.heartbeatAPI2 = HeartbeatAPI(componentName) self.heartbeatAPI2.registerComponent() return
class Harness:
    """
    Harness class that wraps standard functionality used in all daemon
    components: config validation, per-component log setup, database
    connection wiring onto the current thread, worker-thread management
    and (optionally) daemonization.
    """

    def __init__(self, config, compName=None):
        """
        init

        The constructor is empty as we have an initalization method
        that can be called inside new threads (we use thread local attributes
        at startup.

        Default intialization of the harness including setting some diagnostic
        messages

        :param config: WMCore Configuration object; must list this component
            in listComponents_() or listWebapps_().
        :param compName: component name; defaults to the child class name.
        """
        self.config = config

        # component name is always the class name of child class
        if not compName:
            compName = self.__class__.__name__

        if not compName in (self.config.listComponents_() + self.config.listWebapps_()):
            raise WMException(WMEXCEPTION["WMCORE-8"] + compName, "WMCORE-8")

        if not hasattr(self.config, "Agent"):
            self.config.section_("Agent")

        self.config.Agent.componentName = compName
        compSect = getattr(self.config, compName, None)
        if compSect == None:
            # Then we have a major problem - there's no section with this name
            logging.error("Could not find section %s in config" % compName)
            logging.error("We are returning, and hoping you know what you're doing!")
            logging.debug("Config: %s" % self.config)
            return
        # check if componentDir is set if not assign.
        if getattr(compSect, "componentDir", None) == None:
            if not hasattr(self.config, "General"):
                # Don't do anything. Assume the user knows what they are doing.
                logging.error("Missing componentDir and General section in config")
                logging.error("Going to trust you to know what you're doing.")
                return

            compSect.componentDir = os.path.join(self.config.General.workDir,
                                                 "Components",
                                                 self.config.Agent.componentName)

        # we have name and location of the log files. Now make sure there
        # is a directory.
        try:
            if not os.path.isdir(compSect.componentDir):
                os.makedirs(compSect.componentDir)
        except Exception as ex:
            # Directory creation failure is deliberately non-fatal here;
            # initInThread will fail later if the dir is truly unusable.
            logging.error("Encountered exception while making componentDirs: %s" % str(ex))
            logging.error("Ignoring")

        self.threadManagerName = ""
        self.heartbeatAPI = None
        self.messages = {}
        self.logMsg = {}

        return

    def initInThread(self):
        """
        Default intialization of the harness including setting some diagnostic
        messages. This method is called when we call 'prepareToStart'.

        Sets up the rotating component log, binds logger/config/dbi/
        transaction onto the *current* thread (thread-local style), and
        attaches a WorkerThreadManager (created paused; resumed later by
        prepareToStart). Re-raises any failure after logging it.
        """
        try:
            self.messages = {}

            compName = self.config.Agent.componentName
            compSect = getattr(self.config, compName, None)
            if not hasattr(compSect, "logFile"):
                if not getattr(compSect, "componentDir", None):
                    errorMessage = "No componentDir for log entries found!\n"
                    errorMessage += "Harness cannot run without componentDir.\n"
                    logging.error(errorMessage)
                    raise HarnessException(errorMessage)
                compSect.logFile = os.path.join(compSect.componentDir,
                                                "ComponentLog")
            print("Log file is: " + compSect.logFile)

            # ~1 GB per file, 3 rotated backups.
            logHandler = RotatingFileHandler(compSect.logFile, "a", 1000000000, 3)
            logMsgFormat = getattr(compSect, "logMsgFormat",
                                   "%(asctime)s:%(thread)d:%(levelname)s:%(module)s:%(message)s")
            logFormatter = logging.Formatter(logMsgFormat)
            logHandler.setFormatter(logFormatter)
            logLevelName = getattr(compSect, "logLevel", "INFO")
            logLevel = getattr(logging, logLevelName)
            logging.getLogger().addHandler(logHandler)
            logging.getLogger().setLevel(logLevel)
            # NOTE(review): logging.SQLDEBUG is a WMCore extension
            # (registered by WMLogging), not a stdlib level.
            self.logMsg = {'DEBUG': logging.DEBUG,
                           'ERROR': logging.ERROR,
                           'NOTSET': logging.NOTSET,
                           'CRITICAL': logging.CRITICAL,
                           'WARNING': logging.WARNING,
                           'INFO': logging.INFO,
                           'SQLDEBUG': logging.SQLDEBUG}
            if hasattr(compSect, "logLevel") and compSect.logLevel in self.logMsg.keys():
                logging.getLogger().setLevel(self.logMsg[compSect.logLevel])

            WMLogging.sqldebug("wmcore level debug:")

            # If not previously set, force wmcore cache to current path
            if not os.environ.get("WMCORE_CACHE_DIR"):
                os.environ["WMCORE_CACHE_DIR"] = os.path.join(compSect.componentDir,
                                                              ".wmcore_cache")

            logging.info(">>>Starting: " + compName + "<<<")
            # check which backend to use: MySQL, Oracle, etc... for core
            # services.
            # we recognize there can be more than one database.
            # be we offer a default database that is used for core services.
            logging.info(">>>Initializing default database")
            logging.info(">>>Check if connection is through socket")
            myThread = threading.currentThread()
            myThread.logger = logging.getLogger()
            logging.info(">>>Setting config for thread: ")
            myThread.config = self.config

            logging.info(">>>Building database connection string")
            # check if there is a premade string if not build it yourself.
            dbConfig = ConfigDBMap(self.config)
            dbStr = dbConfig.getDBUrl()
            options = dbConfig.getOption()
            # we only want one DBFactory per database so we will need to
            # to pass this on in case we are using threads.
            myThread.dbFactory = DBFactory(myThread.logger, dbStr, options)
            myThread.sql_transaction = True
            if myThread.dbFactory.engine:
                myThread.dbi = myThread.dbFactory.connect()
                myThread.transaction = Transaction(myThread.dbi)
            else:
                # No engine: keep only the raw URL and disable transactions.
                myThread.dbi = myThread.config.CoreDatabase.connectUrl
                myThread.sql_transaction = False

            # Attach a worker manager object to the main thread
            if not hasattr(myThread, "workerThreadManager"):
                myThread.workerThreadManager = WorkerThreadManager(self)
            else:
                myThread.workerThreadManager.terminateSlaves.clear()
            # Workers stay paused until prepareToStart resumes them.
            myThread.workerThreadManager.pauseWorkers()

            logging.info(">>>Initialize transaction dictionary")

            (connectDialect, junk) = dbStr.split(":", 1)

            if connectDialect.lower() == "mysql":
                myThread.dialect = "MySQL"
            elif connectDialect.lower() == "oracle":
                myThread.dialect = "Oracle"
            elif connectDialect.lower() == "sqlite":
                myThread.dialect = "SQLite"

            logging.info("Harness part constructor finished")
        except Exception as ex:
            logging.critical("Problem instantiating " + str(ex))
            logging.error("Traceback: %s" % str(traceback.format_exc()))
            raise

    def preInitialization(self):
        """
        _preInitialization_

        returns: nothing

        method that can be overloaded and will be called before the
        start component is called. (enables you to set message->handler
        mappings). You use the self.message dictionary of the base class
        to define the mappings.
        """
        pass

    def postInitialization(self):
        """
        _postInitialization_

        returns: nothing

        method that can be overloaded and will be called after the start
        component does the standard initialization, but before the wait
        (enables you to publish events when starting up)

        Define actions you want to execute before the actual message
        handling starts. E.g.: publishing some messages, or removing
        messages.
        """
        pass

    def logState(self):
        """
        _logState_

        returns: string

        method that can be overloaded to log additional state information
        (should return atring)
        """
        msg = "No additional state information for " + self.config.Agent.componentName
        return msg

    def publishItem(self, items):
        """
        _publishItem_

        returns: nothing

        A method that publishes a (dictionary) set or 1 item
        to a monitoring service.
        """
        # FIXME: do we need this method. If so we need to agree
        # FIXME: on some default monitoring publication mechanism.
        pass

    def __call__(self, event, payload):
        """
        Once upon a time this was for doing the handling of diagnostic
        messages.

        With the test-deprecating of the MsgService based diagnostics,
        we've basically scratched this.

        I'm leaving this in so at least the framework is still there

        -mnorman
        """
        return

    def initialization(self):
        """
        _initialization__

        Used the handle initializing the MsgService.  The MsgService
        is no longer used.

        Removed but not deleted, since all sorts of things call it
        """
        return

    def prepareToStart(self):
        """
        _prepareToStart_

        returns: Nothing

        Starts the initialization procedure. It is mainly an aggregation
        method so it can easily used in tests.
        """
        self.state = "initialize"
        self.initInThread()
        # note: every component gets a (unique) name:
        # self.config.Agent.componentName
        logging.info(">>>Registering Component - %s" % self.config.Agent.componentName)

        if getattr(self.config.Agent, "useHeartbeat", True):
            self.heartbeatAPI = HeartbeatAPI(self.config.Agent.componentName)
            self.heartbeatAPI.registerComponent()

        logging.info(">>>Starting initialization")

        logging.info(">>>Setting default transaction")
        myThread = threading.currentThread()

        self.preInitialization()

        if myThread.sql_transaction:
            myThread.transaction.begin()

        self.initialization()
        self.postInitialization()

        if myThread.sql_transaction:
            myThread.transaction.commit()

        logging.info(">>>Committing default transaction")

        logging.info(">>>Starting worker threads")
        # Workers were attached paused by initInThread; release them now.
        myThread.workerThreadManager.resumeWorkers()

        logging.info(">>>Initialization finished!\n")
        # wait for messages
        self.state = "active"

    def prepareToStop(self, wait=False, stopPayload=""):
        """
        _stopComponent

        Stops the component, including all worker threads. Allows call
        from test framework.

        :param wait: when True, block until at most `stopPayload` threads
            remain active (polling every 5 seconds).
        :param stopPayload: string; number of threads allowed to stay
            active (minimum 1, the caller itself).
        """
        # Stop all worker threads
        logging.info(">>>Terminating worker threads")
        myThread = threading.currentThread()
        try:
            myThread.workerThreadManager.terminateWorkers()
        except:
            # We may not have a thread manager
            pass

        if wait:
            logging.info(">>>Shut down of component " +
                         "while waiting for threads to finish")
            # check if nr of threads is specified.
            activeThreads = 1
            if stopPayload != "":
                activeThreads = int(stopPayload)
                if activeThreads < 1:
                    activeThreads = 1
            while threading.activeCount() > activeThreads:
                logging.info(">>>Currently " + str(threading.activeCount()) + " threads active")
                logging.info(">>>Waiting for less then " + str(activeThreads) + " to be active")
                time.sleep(5)

    def handleMessage(self, type="", payload=""):
        """
        __handleMessage_

        Formerly used to handle messages - now non-functional
        Left here in case someone else is using it (i.e. PilotManager)
        """
        return

    def startDaemon(self, keepParent=False, compName=None):
        """
        Same result as start component, except that the component is
        started as a daemon, after which you can close your xterm and the
        process will still run. The keepParent option enables us to keep
        the parent process which is used during testing.
        """
        msg = "Starting %s as a daemon " % (self.config.Agent.componentName)
        print(msg)
        if not compName:
            compName = self.__class__.__name__
        compSect = getattr(self.config, compName, None)
        msg = "Log will be in %s " % (compSect.componentDir)
        print(msg)
        # put the daemon config file in the work dir of this component.
        # FIXME: this file will be replaced by a database table.
        compSect = getattr(self.config, self.config.Agent.componentName, None)
        pid = createDaemon(compSect.componentDir, keepParent)
        # if this is not the parent start the component
        if pid == 0:
            self.startComponent()
        # if this is the parent return control to the testing environment.

    def startComponent(self):
        """
        _startComponent_

        returns: Nothing

        Start up the component, performs initialization and waits
        indefinitely. Calling this method results in the application
        running in the xterm (not in daemon mode).

        On any exception: logs a post-mortem, rolls back an open
        transaction, stops workers and re-raises.
        """
        myThread = threading.currentThread()
        try:
            msg = "None"
            self.prepareToStart()
            while True:
                time.sleep(360)
        except Exception as ex:
            if self.state == "initialize":
                errormsg = """PostMortem: choked when initializing with error: %s\n""" % (str(ex))
                stackTrace = traceback.format_tb(sys.exc_info()[2], None)
                for stackFrame in stackTrace:
                    errormsg += stackFrame
            else:
                errormsg = ""
                stackTrace = traceback.format_tb(sys.exc_info()[2], None)
                for stackFrame in stackTrace:
                    errormsg += stackFrame
                logging.error(errormsg)
                logging.error(">>>Fatal Error, Preparing to Rollback Transaction")
                if getattr(myThread, "transaction", None) != None:
                    myThread.transaction.rollback()
                self.prepareToStop(False)
                errormsg = """
PostMortem: choked while handling messages with error: %s
while trying to handle msg: %s
""" % (str(ex), str(msg),)
            print(errormsg)
            logging.critical(errormsg)
            raise
        logging.info("System shutdown complete!")
        # this is to ensure exiting when in daemon mode.
        sys.exit()

    def __str__(self):
        """
        return: string

        String representation of the status of this component.
        """
        msg = "Status of this component : \n"
        msg += "\n"
        msg += ">>Event Subscriptions --> Handlers<<\n"
        msg += "------------------------------------\n"
        for message in self.messages.keys():
            msg += message + "-->" + str(self.messages[message]) + "\n"
        msg += "\n"
        msg += "\n"
        msg += ">>Parameters --> Values<<\n"
        msg += "-------------------------\n"
        msg += str(self.config)
        additionalMsg = self.logState()
        if additionalMsg != "":
            msg += "\n"
            msg += "Additional state information\n"
            msg += "----------------------------\n"
            msg += "\n"
            msg += str(additionalMsg)
            msg += "\n"
        return msg
class DBSUploadTest(unittest.TestCase):
    """
    _DBSUploadTest_

    TestCase for DBSUpload module

    NOTE(review): this test module is Python 2 only (it uses long literals
    such as 3L below); most tests are tagged @attr('integration') and need
    live DB/couch/DBS services.
    """
    # Upper bound formerly used by the message service; kept for reference.
    _maxMessage = 10

    def setUp(self):
        """
        _setUp_

        setUp function for unittest
        """
        # Set constants
        self.couchDB = "config_test"
        self.configURL = "RANDOM;;URL;;NAME"
        self.configString = "This is a random string"

        self.testInit = TestInit(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        self.testInit.setSchema(customModules=["WMComponent.DBS3Buffer",
                                               'WMCore.Agent.Database'],
                                useDefault=False)
        self.testInit.setupCouch(self.couchDB, "GroupUser", "ConfigCache")

        myThread = threading.currentThread()
        self.bufferFactory = DAOFactory(package="WMComponent.DBSBuffer.Database",
                                        logger=myThread.logger,
                                        dbinterface=myThread.dbi)
        self.buffer3Factory = DAOFactory(package="WMComponent.DBS3Buffer",
                                         logger=myThread.logger,
                                         dbinterface=myThread.dbi)

        locationAction = self.bufferFactory(classname="DBSBufferFiles.AddLocation")
        locationAction.execute(siteName="se1.cern.ch")
        locationAction.execute(siteName="se1.fnal.gov")
        locationAction.execute(siteName="malpaquet")

        # Set heartbeat
        # NOTE(review): registers 'JobSubmitter', not 'DBSUpload' -- looks
        # copied from another test; confirm before changing.
        self.componentName = 'JobSubmitter'
        self.heartbeatAPI = HeartbeatAPI(self.componentName)
        self.heartbeatAPI.registerComponent()

        # Set up a config cache
        configCache = ConfigCache(os.environ["COUCHURL"], couchDBName=self.couchDB)
        configCache.createUserGroup(groupname="testGroup", username='******')
        self.testDir = self.testInit.generateWorkDir()

        psetPath = os.path.join(self.testDir, "PSet.txt")
        f = open(psetPath, 'w')
        f.write(self.configString)
        f.close()

        configCache.addConfig(newConfig=psetPath, psetHash=None)
        configCache.save()
        self.configURL = "%s;;%s;;%s" % (os.environ["COUCHURL"],
                                         self.couchDB,
                                         configCache.getCouchID())
        return

    def tearDown(self):
        """
        _tearDown_

        tearDown function for unittest
        """
        self.testInit.clearDatabase()
        self.testInit.tearDownCouch()
        self.testInit.delWorkDir()
        return

    def createConfig(self):
        """
        _createConfig_

        This creates the actual config file used by the component
        """
        config = Configuration()

        # First the general stuff
        config.section_("General")
        config.General.workDir = os.getenv("TESTDIR", os.getcwd())

        config.section_("Agent")
        config.Agent.componentName = 'DBSUpload'
        config.Agent.useHeartbeat = False

        # Now the CoreDatabase information
        # This should be the dialect, dburl, etc
        config.section_("CoreDatabase")
        config.CoreDatabase.connectUrl = os.getenv("DATABASE")
        config.CoreDatabase.socket = os.getenv("DBSOCK")

        config.component_("DBSUpload")
        config.DBSUpload.pollInterval = 10
        config.DBSUpload.logLevel = 'ERROR'
        config.DBSUpload.maxThreads = 1
        config.DBSUpload.namespace = 'WMComponent.DBSUpload.DBSUpload'
        config.DBSUpload.componentDir = os.path.join(os.getcwd(), 'Components')
        config.DBSUpload.workerThreads = 4

        config.section_("DBSInterface")
        config.DBSInterface.globalDBSUrl = 'http://vocms09.cern.ch:8880/cms_dbs_int_local_xx_writer/servlet/DBSServlet'
        config.DBSInterface.globalDBSVersion = 'DBS_2_0_9'
        config.DBSInterface.DBSUrl = 'http://vocms09.cern.ch:8880/cms_dbs_int_local_yy_writer/servlet/DBSServlet'
        config.DBSInterface.DBSVersion = 'DBS_2_0_9'
        config.DBSInterface.MaxFilesToCommit = 10

        # addition for Alerts messaging framework, work (alerts) and control
        # channel addresses to which the component will be sending alerts
        # these are destination addresses where AlertProcessor:Receiver listens
        config.section_("Alert")
        config.Alert.address = "tcp://127.0.0.1:5557"
        config.Alert.controlAddr = "tcp://127.0.0.1:5559"
        # configure threshold of DBS upload queue size alert threshold
        # reference: trac ticket #1628
        config.DBSUpload.alertUploadQueueSize = 2000

        return config

    def injectWorkflow(self, workflowName='TestWorkflow',
                       taskPath='/TestWorkflow/ReadingEvents',
                       MaxWaitTime=10000, MaxFiles=10,
                       MaxEvents=250000000, MaxSize=9999999999):
        """
        _injectWorkflow_

        Inject a dummy workflow in DBSBuffer for testing,
        returns the workflow ID
        """
        injectWorkflowDAO = self.buffer3Factory("InsertWorkflow")
        workflowID = injectWorkflowDAO.execute(workflowName, taskPath,
                                               MaxWaitTime, MaxFiles,
                                               MaxEvents, MaxSize)
        return workflowID

    def getFiles(self, name, tier, nFiles=12, site="malpaquet",
                 workflowName=None, taskPath=None, noChild=False):
        """
        Create some quick dummy test files: nFiles parent files plus
        (unless noChild) a single child file parented to all of them.
        """
        if workflowName is not None and taskPath is not None:
            workflowId = self.injectWorkflow(workflowName=workflowName,
                                             taskPath=taskPath)
        else:
            workflowId = self.injectWorkflow()

        files = []

        for f in range(0, nFiles):
            testFile = DBSBufferFile(lfn='%s-%s-%i' % (name, site, f), size=1024,
                                     events=20, checksums={'cksum': 1},
                                     workflowId=workflowId)
            testFile.setAlgorithm(appName=name, appVer="CMSSW_3_1_1",
                                  appFam="RECO", psetHash="GIBBERISH",
                                  configContent=self.configURL)
            testFile.setDatasetPath("/%s/%s/%s" % (name, name, tier))
            testFile.addRun(Run(1, *[f]))
            testFile.setGlobalTag("aGlobalTag")
            testFile.create()
            testFile.setLocation(site)
            files.append(testFile)

        if not noChild:
            testFileChild = DBSBufferFile(lfn='%s-%s-child' % (name, site), size=1024,
                                          events=10, checksums={'cksum': 1},
                                          workflowId=workflowId)
            testFileChild.setAlgorithm(appName=name, appVer="CMSSW_3_1_1",
                                       appFam="RECO", psetHash="GIBBERISH",
                                       configContent=self.configURL)
            testFileChild.setDatasetPath("/%s/%s_2/RECO" % (name, name))
            testFileChild.addRun(Run(1, *[45]))
            testFileChild.setGlobalTag("aGlobalTag")
            testFileChild.create()
            testFileChild.setLocation(site)

            testFileChild.addParents([x['lfn'] for x in files])

        return files

    @attr('integration')
    def testA_basicUploadTest(self):
        """
        _basicUploadTest_

        Do everything simply once
        Create dataset, algo, files, blocks,
        upload them,
        mark as done, finish them, migrate them
        Also check the timeout
        """
        myThread = threading.currentThread()
        config = self.createConfig()
        self.injectWorkflow(MaxWaitTime=3)
        config.DBSUpload.pollInterval = 4

        name = "ThisIsATest_%s" % (makeUUID())
        tier = "RECO"
        nFiles = 12
        files = self.getFiles(name=name, tier=tier, nFiles=nFiles)
        datasetPath = '/%s/%s/%s' % (name, name, tier)

        # Load components that are necessary to check status
        factory = WMFactory("dbsUpload", "WMComponent.DBSUpload.Database.Interface")
        dbinterface = factory.loadObject("UploadToDBS")

        dbsInterface = DBSInterface(config=config)
        localAPI = dbsInterface.getAPIRef()
        globeAPI = dbsInterface.getAPIRef(globalRef=True)

        # In the first round we should create blocks for the first dataset
        # The child dataset should not be handled until the parent is uploaded
        testDBSUpload = DBSUploadPoller(config=config)
        testDBSUpload.algorithm()

        # First, see if there are any blocks
        # One in DBS, one not in DBS
        result = myThread.dbi.processData("SELECT status FROM dbsbuffer_block")[0].fetchall()
        self.assertEqual(len(result), 2)
        self.assertEqual(result, [('InGlobalDBS',), ('Open',)])

        # Check to see if datasets and algos are in local DBS
        result = listAlgorithms(apiRef=localAPI, patternExe=name)
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0]['ExecutableName'], name)
        result = listPrimaryDatasets(apiRef=localAPI, match=name)
        self.assertEqual(result, [name])
        result = listProcessedDatasets(apiRef=localAPI, primary=name, dataTier="*")

        # Then check and see that the closed block made it into local DBS
        # (block order from DBS is not deterministic, hence the two branches)
        affectedBlocks = listBlocks(apiRef=localAPI, datasetPath=datasetPath)
        if affectedBlocks[0]['OpenForWriting'] == '0':
            self.assertEqual(affectedBlocks[1]['OpenForWriting'], '1')
            self.assertEqual(affectedBlocks[0]['NumberOfFiles'], 10)
            self.assertEqual(affectedBlocks[1]['NumberOfFiles'], 2)
        else:
            self.assertEqual(affectedBlocks[0]['OpenForWriting'], '1')
            self.assertEqual(affectedBlocks[1]['NumberOfFiles'], 10)
            self.assertEqual(affectedBlocks[0]['NumberOfFiles'], 2)

        # Check to make sure all the files are in local
        result = listDatasetFiles(apiRef=localAPI, datasetPath=datasetPath)
        fileLFNs = [x['lfn'] for x in files]
        for lfn in fileLFNs:
            self.assertTrue(lfn in result)

        # Make sure the child files aren't there
        flag = False
        try:
            listDatasetFiles(apiRef=localAPI,
                             datasetPath='/%s/%s_2/%s' % (name, name, tier))
        except Exception as ex:
            flag = True
        self.assertTrue(flag)

        # There should be one blocks in global
        # It should have ten files and be closed
        result = listBlocks(apiRef=globeAPI, datasetPath=datasetPath)
        self.assertEqual(len(result), 1)
        for block in result:
            self.assertEqual(block['OpenForWriting'], '0')
            self.assertTrue(block['NumberOfFiles'] in [2, 10])

        # Okay, deep breath. First round done
        # In the second round, the second block of the parent fileset should transfer
        # Make sure that the timeout functions work
        time.sleep(10)
        testDBSUpload.algorithm()

        result = myThread.dbi.processData("SELECT status FROM dbsbuffer_block")[0].fetchall()
        self.assertEqual(len(result), 2)
        self.assertEqual(result, [('InGlobalDBS',), ('InGlobalDBS',)])

        # Check to make sure all the files are in global
        result = listDatasetFiles(apiRef=globeAPI, datasetPath=datasetPath)
        for lfn in fileLFNs:
            self.assertTrue(lfn in result)

        # Make sure the child files aren't there
        flag = False
        try:
            listDatasetFiles(apiRef=localAPI,
                             datasetPath='/%s/%s_2/%s' % (name, name, tier))
        except Exception as ex:
            flag = True
        self.assertTrue(flag)

        # Third round
        # Both of the parent blocks should have transferred
        # So the child block should now transfer
        testDBSUpload.algorithm()

        result = myThread.dbi.processData("SELECT status FROM dbsbuffer_block")[0].fetchall()
        self.assertEqual(result, [('InGlobalDBS',), ('InGlobalDBS',), ('Open',)])

        flag = False
        try:
            result = listDatasetFiles(apiRef=localAPI,
                                      datasetPath='/%s/%s_2/%s' % (name, name, tier))
        except Exception as ex:
            flag = True
        self.assertFalse(flag)

        self.assertEqual(len(result), 1)

        return

    @attr('integration')
    def testB_AlgoMigration(self):
        """
        _AlgoMigration_

        Test our ability to migrate multiple algos to global

        Do this by creating, mid-poll, two separate batches of files
        One with the same dataset but a different algo
        One with the same algo, but a different dataset
        See that they both get to global
        """
        # raise nose.SkipTest
        myThread = threading.currentThread()
        config = self.createConfig()
        self.injectWorkflow(MaxWaitTime=20)
        name = "ThisIsATest_%s" % (makeUUID())
        tier = "RECO"
        nFiles = 12
        files = self.getFiles(name=name, tier=tier, nFiles=nFiles)
        datasetPath = '/%s/%s/%s' % (name, name, tier)

        # Load components that are necessary to check status
        factory = WMFactory("dbsUpload", "WMComponent.DBSUpload.Database.Interface")
        dbinterface = factory.loadObject("UploadToDBS")

        dbsInterface = DBSInterface(config=config)
        localAPI = dbsInterface.getAPIRef()
        globeAPI = dbsInterface.getAPIRef(globalRef=True)

        testDBSUpload = DBSUploadPoller(config=config)
        testDBSUpload.algorithm()

        # There should now be one block
        result = listBlocks(apiRef=globeAPI, datasetPath=datasetPath)
        self.assertEqual(len(result), 1)

        # Okay, by now, the first migration should have gone through.
        # Now create a second batch of files with the same dataset
        # but a different algo.
        for i in range(0, nFiles):
            testFile = DBSBufferFile(lfn='%s-batch2-%i' % (name, i), size=1024,
                                     events=20, checksums={'cksum': 1},
                                     locations="malpaquet")
            testFile.setAlgorithm(appName="cmsRun", appVer="CMSSW_3_1_1",
                                  appFam=tier, psetHash="GIBBERISH_PART2",
                                  configContent=self.configURL)
            testFile.setDatasetPath(datasetPath)
            testFile.addRun(Run(1, *[46]))
            testFile.create()

        # Have to do things twice to get parents
        testDBSUpload.algorithm()
        testDBSUpload.algorithm()

        # There should now be two blocks
        result = listBlocks(apiRef=globeAPI, datasetPath=datasetPath)
        self.assertEqual(len(result), 2)

        # Now create another batch of files with the original algo
        # But in a different dataset
        for i in range(0, nFiles):
            testFile = DBSBufferFile(lfn='%s-batch3-%i' % (name, i), size=1024,
                                     events=20, checksums={'cksum': 1},
                                     locations="malpaquet")
            testFile.setAlgorithm(appName=name, appVer="CMSSW_3_1_1",
                                  appFam=tier, psetHash="GIBBERISH",
                                  configContent=self.configURL)
            testFile.setDatasetPath('/%s/%s_3/%s' % (name, name, tier))
            testFile.addRun(Run(1, *[46]))
            testFile.create()

        # Do it twice for parentage.
        testDBSUpload.algorithm()
        testDBSUpload.algorithm()

        # There should now be one block
        result = listBlocks(apiRef=globeAPI, datasetPath='/%s/%s_3/%s' % (name, name, tier))
        self.assertEqual(len(result), 1)

        # Well, all the blocks got there, so we're done
        return

    @attr('integration')
    def testC_FailTest(self):
        """
        _FailTest_

        THIS TEST IS DANGEROUS!
        Figure out what happens when we trigger rollbacks
        """
        myThread = threading.currentThread()
        config = self.createConfig()
        config.DBSUpload.abortStepTwo = True

        originalOut = sys.stdout
        originalErr = sys.stderr

        dbsInterface = DBSInterface(config=config)
        localAPI = dbsInterface.getAPIRef()
        globeAPI = dbsInterface.getAPIRef(globalRef=True)

        name = "ThisIsATest_%s" % (makeUUID())
        tier = "RECO"
        nFiles = 12
        files = self.getFiles(name=name, tier=tier, nFiles=nFiles)
        datasetPath = '/%s/%s/%s' % (name, name, tier)

        testDBSUpload = DBSUploadPoller(config=config)

        try:
            testDBSUpload.algorithm()
        except Exception as ex:
            # expected: abortStepTwo forces a failure mid-poll
            pass

        # Aborting in step two should result in no results
        result = myThread.dbi.processData("SELECT status FROM dbsbuffer_block")[0].fetchall()
        self.assertEqual(len(result), 0)

        config.DBSUpload.abortStepTwo = False
        config.DBSUpload.abortStepThree = True
        testDBSUpload = DBSUploadPoller(config=config)

        try:
            testDBSUpload.algorithm()
        except Exception as ex:
            # expected: abortStepThree forces a failure after block creation
            pass

        # Blocks should be created and pending, but the files must not have
        # been marked as uploaded.
        result = myThread.dbi.processData("SELECT status FROM dbsbuffer_block")[0].fetchall()
        self.assertEqual(result, [('Pending',), ('Open',)])
        result = myThread.dbi.processData("SELECT status FROM dbsbuffer_file WHERE dataset_algo = 1")[0].fetchall()
        for res in result:
            self.assertEqual(res[0], 'READY')

        config.DBSUpload.abortStepThree = False
        self.injectWorkflow(MaxWaitTime=300)
        testDBSUpload = DBSUploadPoller(config=config)
        testDBSUpload.algorithm()

        # After this, one block should have been uploaded, one should still be open
        # This is the result of the pending block updating, and the open block staying open
        result = myThread.dbi.processData("SELECT status, id FROM dbsbuffer_block")[0].fetchall()
        self.assertEqual(result, [('InGlobalDBS', 3L), ('Open', 4L)])

        # Check that one block got there
        result = listBlocks(apiRef=globeAPI, datasetPath=datasetPath)
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0]['NumberOfFiles'], 10)
        self.assertEqual(result[0]['NumberOfEvents'], 200)
        self.assertEqual(result[0]['BlockSize'], 10240)

        # Check that ten files got there
        result = listDatasetFiles(apiRef=globeAPI, datasetPath=datasetPath)
        self.assertEqual(len(result), 10)

        myThread.dbi.processData("UPDATE dbsbuffer_workflow SET block_close_max_wait_time = 1")
        testDBSUpload = DBSUploadPoller(config=config)
        time.sleep(3)
        testDBSUpload.algorithm()

        result = myThread.dbi.processData("SELECT status FROM dbsbuffer_block")[0].fetchall()
        self.assertEqual(result, [('InGlobalDBS',), ('InGlobalDBS',)])

        result = listDatasetFiles(apiRef=globeAPI, datasetPath=datasetPath)
        self.assertEqual(len(result), 12)

        fileLFNs = [x['lfn'] for x in files]
        for lfn in fileLFNs:
            self.assertTrue(lfn in result)

        testDBSUpload.algorithm()
        result = myThread.dbi.processData("SELECT status FROM dbsbuffer_block")[0].fetchall()
        self.assertEqual(result, [('InGlobalDBS',), ('InGlobalDBS',), ('Open',)])

        time.sleep(5)
        testDBSUpload.algorithm()
        time.sleep(2)
        result = myThread.dbi.processData("SELECT status FROM dbsbuffer_block")[0].fetchall()
        self.assertEqual(result, [('InGlobalDBS',), ('InGlobalDBS',), ('InGlobalDBS',)])

        result = listDatasetFiles(apiRef=globeAPI,
                                  datasetPath='/%s/%s_2/%s' % (name, name, tier))
        self.assertEqual(len(result), 1)

        sys.stdout = originalOut
        sys.stderr = originalErr

        return

    @attr('integration')
    def testD_Profile(self):
        """
        _Profile_

        Profile with cProfile and time various pieces
        """
        # Early return: profiling is disabled; code below kept for manual runs.
        return
        config = self.createConfig()

        name = "ThisIsATest_%s" % (makeUUID())
        tier = "RECO"
        nFiles = 500
        files = self.getFiles(name=name, tier=tier, nFiles=nFiles)
        datasetPath = '/%s/%s/%s' % (name, name, tier)

        testDBSUpload = DBSUploadPoller(config=config)
        cProfile.runctx("testDBSUpload.algorithm()", globals(), locals(),
                        filename="testStats.stat")

        p = pstats.Stats('testStats.stat')
        p.sort_stats('cumulative')
        p.print_stats(0.2)

        return

    @attr('integration')
    def testE_NoMigration(self):
        """
        _NoMigration_

        Test the DBSUpload system with no global migration
        """
        myThread = threading.currentThread()
        config = self.createConfig()
        self.injectWorkflow(MaxWaitTime=3)
        config.DBSInterface.doGlobalMigration = False
        config.DBSUpload.pollInterval = 4

        name = "ThisIsATest_%s" % (makeUUID())
        tier = "RECO"
        nFiles = 12
        files = self.getFiles(name=name, tier=tier, nFiles=nFiles)
        datasetPath = '/%s/%s/%s' % (name, name, tier)

        # Load components that are necessary to check status
        factory = WMFactory("dbsUpload", "WMComponent.DBSUpload.Database.Interface")
        dbinterface = factory.loadObject("UploadToDBS")

        dbsInterface = DBSInterface(config=config)
        localAPI = dbsInterface.getAPIRef()
        globeAPI = dbsInterface.getAPIRef(globalRef=True)

        # In the first round we should create blocks for the first dataset
        # The child dataset should not be handled until the parent is uploaded
        testDBSUpload = DBSUploadPoller(config=config)
        testDBSUpload.algorithm()

        # First, see if there are any blocks
        # One in DBS, one not in DBS
        result = myThread.dbi.processData("SELECT status FROM dbsbuffer_block")[0].fetchall()
        self.assertEqual(len(result), 2)
        self.assertEqual(result, [('InGlobalDBS',), ('Open',)])

        # With migration disabled the files jump straight to GLOBAL status.
        result = myThread.dbi.processData("SELECT status FROM dbsbuffer_file WHERE dataset_algo = 1")[0].fetchall()
        for r in result:
            self.assertEqual(r[0], 'GLOBAL')

        return

    @attr('integration')
    def testF_DBSUploadQueueSizeCheckForAlerts(self):
        """
        Test will not trigger a real alert being sent unless doing some
        mocking of the methods used during DBSUploadPoller.algorithm() ->
        DBSUploadPoller.uploadBlocks() method.
        As done here, it probably can't be deterministic, yet the feature
        shall be checked.
        """
        sizeLevelToTest = 1
        myThread = threading.currentThread()
        config = self.createConfig()
        # threshold / value to check
        config.DBSUpload.alertUploadQueueSize = sizeLevelToTest

        # without this uploadBlocks method returns immediately
        name = "ThisIsATest_%s" % (makeUUID())
        tier = "RECO"
        nFiles = sizeLevelToTest + 1
        files = self.getFiles(name=name, tier=tier, nFiles=nFiles)
        datasetPath = '/%s/%s/%s' % (name, name, tier)

        # load components that are necessary to check status
        # (this seems necessary, else some previous tests started failing)
        factory = WMFactory("dbsUpload", "WMComponent.DBSUpload.Database.Interface")
        dbinterface = factory.loadObject("UploadToDBS")
        dbsInterface = DBSInterface(config=config)
        localAPI = dbsInterface.getAPIRef()
        globeAPI = dbsInterface.getAPIRef(globalRef=True)
        testDBSUpload = DBSUploadPoller(config)
        # this is finally where the action (alert) should be triggered from
        testDBSUpload.algorithm()

        return

    def testG_closeSettingsPerWorkflow(self):
        """
        _closeSettingsPerWorkflow_

        Test our ability to close blocks depending on settings
        configured for individual workflows.
        This unit test that doesn't require an actual DBS instance to run.
        """
        # Test is intentionally disabled: no DBS2 mock is available.
        self.assertTrue(False, 'This unit test disabled since we do not have DBS2 mock')
        myThread = threading.currentThread()
        config = self.createConfig()
        config.DBSInterface.doGlobalMigration = False

        # First test, limit by number of files and timeout without new files
        name = "ThisIsATest_%s" % (makeUUID())
        tier = "RECO"
        nFiles = 12
        self.injectWorkflow(workflowName=name, taskPath='/%s/Test' % name,
                            MaxFiles=5)
        self.getFiles(name=name, tier=tier, nFiles=nFiles,
                      workflowName=name, taskPath='/%s/Test' % name)

        # Load components that are necessary to check status
        factory = WMFactory("dbsUpload", "WMComponent.DBSUpload.Database.Interface")
        dbinterface = factory.loadObject("UploadToDBS")

        # Change the DBSUploadPoller imports on runtime
        from WMComponent.DBSUpload import DBSUploadPoller as MockDBSUploadPoller
        # MockDBSUploadPoller.DBSInterface = DBS2Interface

        # In the first round we should create blocks for the first dataset
        # The child dataset should not be handled until the parent is uploaded
        # First run creates 3 blocks, 2 are closed immediately and one is open
        testDBSUpload = MockDBSUploadPoller.DBSUploadPoller(config=config)
        testDBSUpload.algorithm()
        openBlocks = dbinterface.findOpenBlocks()
        closedBlocks = myThread.dbi.processData("SELECT id FROM dbsbuffer_block WHERE status = 'InGlobalDBS'")[0].fetchall()
        self.assertEqual(len(openBlocks), 1)
        self.assertEqual(len(closedBlocks), 2)
        globalFiles = myThread.dbi.processData("SELECT id FROM dbsbuffer_file WHERE status = 'GLOBAL'")[0].fetchall()
        notUploadedFiles = myThread.dbi.processData("SELECT * FROM dbsbuffer_file WHERE status = 'NOTUPLOADED'")[0].fetchall()
        self.assertEqual(len(globalFiles), 12)
        self.assertEqual(len(notUploadedFiles), 1)
        self.assertTrue('child' in notUploadedFiles[0][1])
        testDBSUpload.algorithm()
        openBlocks = myThread.dbi.processData("SELECT id FROM dbsbuffer_block WHERE status != 'InGlobalDBS'")[0].fetchall()
        closedBlocks = myThread.dbi.processData("SELECT id FROM dbsbuffer_block WHERE status = 'InGlobalDBS'")[0].fetchall()
        self.assertEqual(len(openBlocks), 2)
        self.assertEqual(len(closedBlocks), 2)
        globalFiles = myThread.dbi.processData("SELECT id FROM dbsbuffer_file WHERE status = 'GLOBAL'")[0].fetchall()
        notUploadedFiles = myThread.dbi.processData("SELECT * FROM dbsbuffer_file WHERE status = 'NOTUPLOADED'")[0].fetchall()
        self.assertEqual(len(globalFiles), 13)
        self.assertEqual(len(notUploadedFiles), 0)
        # Test the timeout feature to close blocks
        myThread.dbi.processData("UPDATE dbsbuffer_workflow SET block_close_max_wait_time = 0")
        testDBSUpload.algorithm()
        openBlocks = myThread.dbi.processData("SELECT id FROM dbsbuffer_block WHERE status != 'InGlobalDBS'")[0].fetchall()
        closedBlocks = myThread.dbi.processData("SELECT id FROM dbsbuffer_block WHERE status = 'InGlobalDBS'")[0].fetchall()
        self.assertEqual(len(openBlocks), 0)
        self.assertEqual(len(closedBlocks), 4)
        # Check the information that DBS received
        dbsBlocks = testDBSUpload.dbsInterface.blocks
        for dbsBlockName in dbsBlocks:
            dbsBlock = dbsBlocks[dbsBlockName]
            self.assertEqual(dbsBlock['OpenForWriting'], '0')
            self.assertTrue(dbsBlock['nFiles'] in (1, 2, 5))
        # Second test, limit by number of events and timeout with new files
        name = "ThisIsATest_%s" % (makeUUID())
        nFiles = 50
        self.injectWorkflow(workflowName=name, taskPath='/%s/Test' % name,
                            MaxFiles=45, MaxEvents=800, MaxWaitTime=10000)
        self.getFiles(name=name, tier=tier, nFiles=nFiles,
                      workflowName=name, taskPath='/%s/Test' % name)
        testDBSUpload.algorithm()
        testDBSUpload.algorithm()
        openBlocks = myThread.dbi.processData("SELECT id FROM dbsbuffer_block WHERE status != 'InGlobalDBS'")[0].fetchall()
        closedBlocks = myThread.dbi.processData("SELECT id FROM dbsbuffer_block WHERE status = 'InGlobalDBS'")[0].fetchall()
        self.assertEqual(len(openBlocks), 2)
        self.assertEqual(len(closedBlocks), 5)
        # Throw 20 new file
        # Reset the timer such that the blocks appear to have been created 10001 seconds ago
        creationTime = int(time.time() - 10001)
myThread.dbi.processData("UPDATE dbsbuffer_block SET create_time = %d WHERE status != 'InGlobalDBS'" % creationTime) self.getFiles(name = name + '2', tier = tier, nFiles = 20, workflowName = name, taskPath = '/%s/Test' % name, noChild = True) # Now a new block will have to be created as the last one timed out testDBSUpload.algorithm() openBlocks = myThread.dbi.processData("SELECT id FROM dbsbuffer_block WHERE status != 'InGlobalDBS'")[0].fetchall() closedBlocks = myThread.dbi.processData("SELECT id FROM dbsbuffer_block WHERE status = 'InGlobalDBS'")[0].fetchall() self.assertEqual(len(openBlocks), 1) self.assertEqual(len(closedBlocks), 7) dbsBlocks = testDBSUpload.dbsInterface.blocks for dbsBlockName in dbsBlocks: dbsBlock = dbsBlocks[dbsBlockName] if name in dbsBlockName: if dbsBlock['OpenForWriting'] == '1': self.assertEqual(dbsBlock['nFiles'], 20) else: self.assertTrue(dbsBlock['events'] in (10,200,800)) self.assertTrue(dbsBlock['nFiles'] in (1,10,40)) # Last test, check limitation by size name = "ThisIsATest_%s" % (makeUUID()) nFiles = 10 self.injectWorkflow(workflowName = name, taskPath = '/%s/Test' % name, MaxFiles = 45, MaxEvents = 800, MaxSize = 2048) self.getFiles(name = name, tier = tier, nFiles = nFiles, workflowName = name, taskPath = '/%s/Test' % name) testDBSUpload.algorithm() dbsBlocks = testDBSUpload.dbsInterface.blocks for dbsBlockName in dbsBlocks: dbsBlock = dbsBlocks[dbsBlockName] if name in dbsBlockName: self.assertEqual(dbsBlock['events'], 40) self.assertEqual(dbsBlock['nFiles'], 2) self.assertEqual(dbsBlock['size'], 2048) return