def checkMPI(self):
    from mpi4casa.MPIInterface import MPIInterface as mpi_clustermanager
    try:
        self.nproc = len(mpi_clustermanager.getCluster()._cluster.get_engines())
        return True
    except Exception as e:
        self.nproc = 0
        return False
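# Hedged usage sketch (assumed, not part of the original suite): a parallel test
# method would typically call checkMPI() first and skip itself when CASA is not
# running with MPI engines, e.g.:
#
#     def test_something_parallel(self):
#         if not self.checkMPI():
#             self.skipTest("Not a parallel (MPI) run of CASA")
#         ...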
def test_mpi4casa_log_level_default_to_debug(self):
    """Test changing the log level globally from default to DEBUG"""

    # Change log level globally (test via MPIInterface as it internally
    # uses MPICommandClient, so both are tested)
    mpi_interface = MPIInterface()
    mpi_interface.set_log_level("DEBUG")

    # Use a separate log file per server to facilitate analysis
    for server in self.server_list:
        logfile = 'test_mpi4casa_log_level_debug-server-%s.log' % str(server)
        self.client.push_command_request("casalog.setlogfile('%s')" % (logfile),True,server)

    # Run flagdata
    flagdata(vis=self.vis, mode='summary')

    # Iterate through the log files to see if we find command handling msgs
    for server in self.server_list:
        # Get current working directory (we might be in the 'nosedir' subdirectory)
        cwd = self.client.push_command_request("os.getcwd()",True,server)[0]['ret']
        logfile = '%s/test_mpi4casa_log_level_debug-server-%s.log' % (cwd,str(server))
        content = open(logfile, 'r').read()
        # Check only servers that processed a flagdata sub-job
        if content.find('flagdata')>0:
            self.assertEqual(content.find("MPICommandServer")<0, True,
                             "MPICommandServer msgs should be filtered out")
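# Hedged cleanup sketch (assumed, not part of the original suite): a matching
# tearDown() would typically restore the global log level so later tests are not
# affected; "INFO" is assumed here to be the default level:
#
#     def tearDown(self):
#         MPIInterface().set_log_level("INFO")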
def setupCluster(self):
    # Initialize cluster
    # * Terminal: Client logs + Server logs
    # * casapy-<timestamp>.log: Client logs
    # * casapy-<timestamp>.log-server-<rank>-host-<hostname>-pid-<pid>: Server logs
    mpi_clustermanager.set_log_mode('redirect')
    self.sc=mpi_clustermanager.getCluster()
    self.sc.set_log_level('DEBUG')

    self.CL=self.sc._cluster
    self.nodeList = self.CL.get_engines()
    numproc=len(self.CL.get_engines())
    numprocperhost=len(self.nodeList)/len(self.nodeList) if (len(self.nodeList) >0 ) else 1

    owd=os.getcwd()
    self.CL.pgc('import os')
    self.CL.pgc('from numpy import array,int32')
    self.CL.pgc('os.chdir("'+owd+'")')
    os.chdir(owd)
    print "Setting up ", numproc, " engines."
    return numproc
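# Hedged usage sketch (assumed, not part of the original suite): setupCluster()
# is meant to be called once per test fixture, typically from setUp(), keeping
# the returned engine count for later bookkeeping:
#
#     def setUp(self):
#         self.numproc = self.setupCluster()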
def getNParts(self, imprefix='', imexts=[]):

    from mpi4casa.MPIInterface import MPIInterface as mpi_clustermanager
    try:
        self.nproc = len(mpi_clustermanager.getCluster()._cluster.get_engines())
    except Exception as e:
        self.nproc = 0

    # Initialize the list up front so a non-parallel run returns an empty list
    # instead of raising NameError
    imlist = []
    if (self.nproc > 0):
        for imext in imexts:
            for part in range(1, self.nproc + 1):
                imlist.append(imprefix + '.workdirectory/' + imprefix + '.n' + str(part) + '.' + imext)
        #self.checkall(imexist = imlist)
    else:
        print 'Not a parallel run of CASA'

    return imlist
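# Hedged usage sketch (assumed, not part of the original suite): the per-engine
# image parts returned by getNParts() can be checked for existence, e.g.:
#
#     imlist = self.getNParts(imprefix='mytest', imexts=['image', 'residual'])
#     for im in imlist:
#         self.assertTrue(os.path.exists(im), im + ' does not exist')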
def test_PyParallelImagerHelper_interface(self):

    # Get cluster (getCluster should automatically initialize it)
    self.sc = MPIInterface.getCluster()
    self.CL = self.sc._cluster
    self.assertEqual(self.sc.isClusterRunning(),True,"Error instantiating cluster")

    # Get engines
    engines = self.CL.get_engines()
    self.assertEqual(engines,range(1,MPIEnvironment.mpi_world_size),"Error getting list of engines")

    # Get nodes
    if int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'])>1:
        nodes = self.CL.get_nodes()
        self.assertTrue(socket.gethostname() in nodes,"Error getting list of nodes")

    # Run imports in all engines
    self.CL.pgc('import os')
    self.CL.pgc('from numpy import array,int32')
    os_is_module = self.CL.pgc('os is not None')[0]['ret']
    self.assertEqual(os_is_module,True,"Error importing os module")

    # Change current working directory
    cwd=os.getcwd()
    self.CL.pgc('os.chdir("' + cwd + '")')
    res = self.CL.pgc('os.getcwd()')[0]['ret']
    self.assertEqual(res,cwd,"Error changing work directory")

    # Get engine working directory
    cwd=os.getcwd()
    res = self.sc.get_engine_store(1)
    self.assertEqual(res,cwd,"Error getting engine store")

    # pgc/Pull variable to/from all servers
    self.CL.pgc("initrec = casac.utils().hostinfo()['endian']")
    res = self.CL.pull('initrec')
    self.assertEqual(res[1],casac.utils().hostinfo()['endian'],"Error pulling a variable")

    # Push/Pull variable to/from a subset of servers
    var_dict={}
    var_dict['a'] = 33
    var_dict['b'] = {'test':29.2}
    self.CL.push(var_dict,[1,2])
    res = self.CL.pull('a',[1,2])
    self.assertEqual(res[1],var_dict['a'],"Error pulling a variable after a push operation to a subset of servers")
    res = self.CL.pull('b',[1,2])
    self.assertEqual(res[2],var_dict['b'],"Error pulling a variable after a push operation to a subset of servers")

    # Push/Pull variable to/from all servers
    var_dict={}
    var_dict['c'] = False
    var_dict['d'] = "bla"
    self.CL.push(var_dict)
    res = self.CL.pull('c')
    self.assertEqual(res[1],var_dict['c'],"Error pulling a variable after a push operation to all servers")
    res = self.CL.pull('d')
    self.assertEqual(res[2],var_dict['d'],"Error pulling a variable after a push operation to all servers")

    # Run various commands in parallel
    self.CL.pgc({1:'ya=3',2:'ya="b"'})
    res = self.CL.pull('ya',[1,2])
    self.assertEqual(res,{1: 3, 2: 'b'},"Error running various commands in parallel")

    # Async execution of a job in a subset of servers via odo
    jobIds = self.CL.odo("time.sleep(2.5)",1)
    status = self.CL.check_job(jobIds)
    ntries = 0
    while status == False and ntries < 10:
        ntries += 1
        time.sleep(1)
        status = self.CL.check_job(jobIds)
    self.assertEqual(status,True,"Error executing a job asynchronously via odo")

    # Async execution of a job in a subset of servers via do_and_record with defined target server
    jobIds = self.sc.do_and_record("time.sleep(2.5)",1)
    status = self.CL.check_job(jobIds)
    ntries = 0
    while status == False and ntries < 10:
        ntries += 1
        time.sleep(1)
        status = self.CL.check_job(jobIds)
    self.assertEqual(status,True,"Error executing a job asynchronously via do_and_record with defined target server")

    # Async execution of a job in a subset of servers via do_and_record with undefined target server
    jobIds = self.sc.do_and_record("time.sleep(2.5)")
    status = self.CL.check_job(jobIds)
    ntries = 0
    while status == False and ntries < 10:
        ntries += 1
        time.sleep(1)
        status = self.CL.check_job(jobIds)
    self.assertEqual(status,True,"Error executing a job asynchronously via do_and_record with undefined target server")

    # Re-throw exception
    jobIds = self.CL.odo("1/0",[1, 2])
    ntries = 0
    res = False
    rethrow = False
    while res == False and ntries < 10:
        try:
            res = self.CL.check_job(jobIds)
            time.sleep(1)
            ntries += 1
        except:
            rethrow = True
            break
    self.assertEqual(rethrow,True,"Exception not rethrown")
    self.assertEqual(str(sys.exc_info()[1]).find("ZeroDivisionError:")>=0, True, "Trace-back should contain ZeroDivisionError")

    # Check queue status
    jobIds = self.CL.odo("time.sleep(5)",1)
    time.sleep(1)
    status = self.sc.get_status()
    self.assertEqual(len(status)-1,len(self.CL.get_command_request_list()),"Error retrieving job queue status")
def setUp(self):

    MPIInterface.set_log_mode('redirect')
    self.sc = MPIInterface.getCluster()
    self.CL = self.sc._cluster