class MPISubmission:
    def __init__(self, np, execfname):
        self.debug = DEBUG
        self.np = np
        self.mpdPort = ''
        self.nfsTmp = NFS_PREFIX + gethostname(True)
        self.execfname = execfname
        self.servthread = None
        self.rand = genRandom()
        self.tmpPath = self._create_tmpdir()
        self.submitFile = TemplateFile('', SUBM_FNAME, self.tmpPath,
                                       SUBM_FNAME + '_' + self.rand)
        self.clientFile = TemplateFile('', CLNT_FNAME, self.tmpPath,
                                       CLNT_FNAME + '_' + self.rand)
        self.hostfname = self.tmpPath + 'hosts_' + self.rand
        self.keyfname = self.tmpPath + self.rand + '.key'
        self.hostlist = []   # store a list of [username, ip, port]
        self.env = os.environ
        self.env['PATH'] = self.nfsTmp + '/' + MPDBIN_PATH + ':' + self.env['PATH']

    # Remove the temp directory and all of its contents
    def _rm_tmpdir(self, dst='.'):
        tpath = dst + '/tmp' + self.rand
        try:
            if os.access(tpath, os.F_OK):
                shutil.rmtree(tpath)
        except os.error as e:
            sys.exit('Error: cannot remove temp directory - ' + str(e))

    def _create_tmpdir(self, dst='.'):
        try:
            os.makedirs(dst + '/tmp' + self.rand)
        except os.error as e:
            sys.exit('Error: cannot create temp directory - ' + str(e))
        return 'tmp' + self.rand + '/'

    def _start_mpd(self):
        # Make sure no other mpds are running
        subprocess.call('mpdallexit', env=self.env, stdout=FNULL, stderr=FNULL)
        time.sleep(1.0)   # wait for the old mpd ring to be torn down

        # Start the server's mpd
        mpdconf_path = os.getcwd() + '/' + self.tmpPath[:-1]
        createMpdConf(self.rand, mpdconf_path)
        self.env['MPD_CONF_FILE'] = mpdconf_path + '/.mpd.conf'
        subprocess.Popen(['mpd', '--daemon'], env=self.env)
        time.sleep(1.0)   # wait for mpd to fully start

        # Determine the mpd listening port from mpdtrace output
        process = subprocess.Popen(['mpdtrace', '-l'], stdout=subprocess.PIPE, env=self.env)
        process.wait()
        traceout = process.communicate()[0]
        port = extractPort(traceout)
        if not port.isdigit():
            sys.exit('Error starting mpd : ' + traceout)
        self.mpdPort = port

    def start(self):
        # Copy the executable into the local ganfs directory
        self._create_tmpdir(self.nfsTmp)
        shutil.copy2(self.execfname, self.nfsTmp + '/' + self.tmpPath)

        self._start_mpd()                   # start the local mpd
        self._prepare_submission_files()    # prepare client script and condor submit file
        self._gen_ssh_keys()                # generate ssh key pair

        # Start a listening server
        self.servthread = Thread(target=CallbackServ(self.np - 1, self.hostfname, PORT).serv)
        self.servthread.setDaemon(True)
        self.servthread.start()

        if self.debug:
            print "submit condor with file " + str(self.submitFile)
        p = subprocess.call(['condor_submit', str(self.submitFile)])
        if p != 0:
            sys.exit('Error: condor_submit returned ' + str(p))

        # If the submission is successful, wait for the server
        print 'Waiting for ' + str(self.np - 1) + ' workers to respond ....',
        sys.stdout.flush()
        self.servthread.join()
        print 'finished'

        self._read_hosts_info()   # read info from the collected hosts

        # Wait for the mpd ring to be ready
        print 'Waiting for mpd ring to be ready .......',
        limit = 120
        retry = 0
        while retry < limit:
            time.sleep(1.0)
            # Test the mpd connection
            process = subprocess.Popen(['mpdtrace', '-l'], env=self.env, stdout=subprocess.PIPE)
            process.wait()
            trace = process.communicate()[0]
            retry += 1
            port = extractPort(trace)
            num = len(trace.split('\n'))
            if port.isdigit() and (num == self.np + 1):
                print 'done'
                if self.debug:
                    print '\nMPD trace:\n' + trace
                break

        # Check whether mpdtrace returned enough mpd nodes
        if len(trace.split('\n')) < self.np + 1:
            print 'failed'
            subprocess.call('mpdallexit', env=self.env, stdout=FNULL, stderr=FNULL)
            subprocess.call(['condor_rm', '-all'])
            sys.exit('Error: not enough mpd nodes in the ring')

        # Run the MPI job
        execdir = self.nfsTmp + '/' + self.tmpPath
        subprocess.call(['mpiexec', '-n', str(self.np),
                         execdir + self.execfname.split('/')[-1]], env=self.env)

        # The MPI job is finished; notify all workers
        for host in self.hostlist:
            hostserv = xmlrpclib.Server("http://" + host[0] + ":" + host[4])
            hostserv.terminate()

        subprocess.call('mpdallexit', env=self.env, stdout=FNULL, stderr=FNULL)   # tear down mpd ring
        self._rm_tmpdir(self.nfsTmp)   # remove exec file from ganfs dir
        self._rm_tmpdir()              # remove temp directory

    def _prepare_submission_files(self):
        # Prepare the condor submission file
        self.submitFile.prepare_file([['<q.np>', str(self.np - 1)],
                                      ['<ssh.pub.key>', self.tmpPath + AUTH_FNAME],
                                      ['<fullpath.client.script>', str(self.clientFile)],
                                      ['<output.dir>', OUTPUT_DIR + '/'],
                                      ['<client.script>', self.clientFile.out_fname()]])
        # Prepare the client-side python script
        if self.debug:
            print 'serv ipop ip = ' + get_ipop_ip()
        self.clientFile.prepare_file([['<serv.ip>', get_ipop_ip()],
                                      ['<serv.port>', str(PORT)],
                                      ['<mpd.port>', self.mpdPort],
                                      ['<mpd.path>', self.nfsTmp + '/' + MPDBIN_PATH],
                                      ['<rand>', self.rand]], True)

    def _read_hosts_info(self):
        with open(self.hostfname, 'r') as hostf:
            for line in hostf:
                self.hostlist.append(line.rstrip().split(":"))
        if self.debug:
            print '\nhost list:'
            for host in self.hostlist:
                print host

    def _create_mpd_hosts(self):
        with open(self.tmpPath + 'mpd.hosts', 'w') as mpdhostf:
            for host in self.hostlist:
                # write hostname and number of cpus only
                mpdhostf.write(host[0] + ':' + host[1] + '\n')

    # Generate an ssh key pair for the MPI ring
    def _gen_ssh_keys(self):
        argstr = "ssh-keygen -q -t rsa".split()
        argstr.extend(["-N", ''])
        argstr.extend(["-f", self.keyfname])
        argstr.extend(["-C", self.rand])
        p = subprocess.call(argstr)
        if p != 0:
            sys.exit('Error: ssh-keygen returned ' + str(p))
        # Copy the public key into the 'authorized_keys' file for the client's sshd
        shutil.copyfile(self.keyfname + '.pub', self.tmpPath + AUTH_FNAME)
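# Hypothetical convenience wrapper (a sketch, not part of the original module):
# shows how a caller might drive MPISubmission end to end. The function name
# and the example argument values are assumptions for illustration only.
def run_mpi_job(np, execfname):
    """Submit execfname to np MPI processes and block until the job finishes."""
    job = MPISubmission(np, execfname)
    job.start()   # start() submits workers through Condor, runs mpiexec,
                  # then tears down the mpd ring and cleans up temp files


# Example (hypothetical values):
#   run_mpi_job(4, './hello_mpi')
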
class HadoopCluster:
    def __init__(self, np):
        self.debug = DEBUG
        self.np = np
        self.nfsTmp = NFS_PREFIX + gethostname(True)
        self.servthread = None
        self.rand = 'hadoopAAA'
        self.tmpPath = self._create_tmpdir()
        self.submitFile = TemplateFile('', SUBM_FNAME, self.tmpPath,
                                       SUBM_FNAME + '_' + self.rand)
        self.clientFile = TemplateFile('', CLNT_FNAME, self.tmpPath,
                                       CLNT_FNAME + '_' + self.rand)
        self.hostfname = self.tmpPath + 'hosts_' + self.rand
        self.hostlist = []   # store a list of [username, ip, port]
        self.env = os.environ
        self.env['HADOOP_HEAPSIZE'] = str(128)
        self.fullTmpPath = (os.getcwd() + '/' + self.tmpPath)[:-1]
        self.env['HADOOP_CONF_DIR'] = self.fullTmpPath
        self.env['HADOOP_PID_DIR'] = self.fullTmpPath
        self.env['HADOOP_LOG_DIR'] = self.fullTmpPath
        self.env['HADOOP_HOME'] = self.nfsTmp + '/' + HADP_PATH
        self.env['JAVA_HOME'] = self.nfsTmp + '/' + JAVA_PATH
        # Add hadoop/bin to PATH if it is not already there
        binPath = self.nfsTmp + '/' + HADP_PATH + '/bin'
        if string.find(self.env['PATH'], binPath) == -1:
            self.env['PATH'] = binPath + ':' + self.env['PATH']

    # Remove the temp directory and all of its contents
    def _rm_tmpdir(self, dst='.'):
        tpath = dst + '/tmp' + self.rand
        if os.path.isdir(tpath):
            try:
                if os.access(tpath, os.F_OK):
                    shutil.rmtree(tpath)
            except os.error as e:
                sys.exit('Error: cannot remove temp directory - ' + str(e))

    def _create_tmpdir(self, dst='.'):
        tmpdir = dst + '/tmp' + self.rand
        if not os.path.isdir(tmpdir):
            try:
                os.makedirs(tmpdir)
            except os.error as e:
                sys.exit('Error: cannot create temp directory - ' + str(e))
        return 'tmp' + self.rand + '/'

    def _stop_hadoop(self):
        subprocess.call(['hadoop-daemon.sh', 'stop', 'datanode'],
                        env=self.env, stdout=FNULL, stderr=FNULL)
        subprocess.call(['hadoop-daemon.sh', 'stop', 'namenode'],
                        env=self.env, stdout=FNULL, stderr=FNULL)
        subprocess.call(['hadoop-daemon.sh', 'stop', 'tasktracker'],
                        env=self.env, stdout=FNULL, stderr=FNULL)
        subprocess.call(['hadoop-daemon.sh', 'stop', 'jobtracker'],
                        env=self.env, stdout=FNULL, stderr=FNULL)

    def _start_hadoop_dfs(self):
        # Format HDFS for the server's namenode, piping 'Y' to answer the confirmation prompt
        p1 = subprocess.Popen(['echo', 'Y'], stdout=subprocess.PIPE, stderr=FNULL)
        subprocess.call(['hdfs', 'namenode', '-format'], env=self.env,
                        stdin=p1.stdout, stdout=FNULL, stderr=FNULL)
        p1.stdout.close()

        print 'Starting a namenode (this may take a while) .... ',
        sys.stdout.flush()
        subprocess.Popen(['hadoop-daemon.sh', 'start', 'namenode'],
                         env=self.env, stdout=FNULL, stderr=FNULL)
        # Use -report to wait for the namenode to finish starting up
        subprocess.call(['hdfs', 'dfsadmin', '-report'],
                        env=self.env, stdout=FNULL, stderr=FNULL)
        print 'done'

        # Start a local datanode
        print 'Starting a local datanode'
        subprocess.Popen(['hadoop-daemon.sh', 'start', 'datanode'],
                         env=self.env, stdout=FNULL, stderr=FNULL)

    def _start_hadoop_mapred(self):
        # Start a local jobtracker
        print 'Starting a local jobtracker'
        subprocess.Popen(['hadoop-daemon.sh', 'start', 'jobtracker'],
                         env=self.env, stdout=FNULL, stderr=FNULL)
        # Start a local tasktracker
        print 'Starting a local tasktracker\n'
        subprocess.Popen(['hadoop-daemon.sh', 'start', 'tasktracker'],
                         env=self.env, stdout=FNULL, stderr=FNULL)

    def start(self):
        # Create the conf files in the local temp directory
        confFile = TemplateFile('', CONF_TMP_FNAME, self.tmpPath, CONF_FNAME)
        confFile.prepare_file([['<namenode.hostname>', gethostname()]])

        hdfsFile = TemplateFile('', HDFS_TMP_FNAME, self.tmpPath, HDFS_FNAME)
        hdfsFile.prepare_file([['<name.dir>', self.fullTmpPath + '/name'],
                               ['<data.dir>', self.fullTmpPath + '/data']])

        mprdFile = TemplateFile('', MPRD_TMP_FNAME, self.tmpPath, MPRD_FNAME)
        mprdFile.prepare_file([['<jobtracker.hostname>', gethostname()],
                               ['<jobtracker.port>', str(MPRD_PORT)]])

        # Prepare and copy hadoop-env.sh into the local temp directory
        envFile = TemplateFile('', HENV_TMP_FNAME, self.tmpPath, HENV_FNAME)
        envFile.prepare_file([['<java.home>', self.env['JAVA_HOME']],
                              ['<hadoop.home>', self.env['HADOOP_HOME']],
                              ['<conf.dir>', self.env['HADOOP_CONF_DIR']],
                              ['<pid.dir>', self.env['HADOOP_PID_DIR']],
                              ['<log.dir>', self.env['HADOOP_LOG_DIR']],
                              ['<path>', self.env['PATH']]], True)

        self._start_hadoop_dfs()            # start the local hadoop namenode & datanode
        self._prepare_submission_files()    # prepare client script and condor submit file

        # Start a listening server
        self.servthread = Thread(target=CallbackServ(self.np - 1, self.hostfname, PORT).serv)
        self.servthread.setDaemon(True)
        self.servthread.start()

        if self.debug:
            print "submit condor with file " + str(self.submitFile)
        p = subprocess.call(['condor_submit', str(self.submitFile)])
        if p != 0:
            sys.exit('Error: condor_submit returned ' + str(p))

        # If the submission is successful, wait for the server
        print 'Waiting for ' + str(self.np - 1) + ' workers to respond ....',
        sys.stdout.flush()
        self.servthread.join()
        print 'finished'

        self._read_hosts_info()   # read info from the collected hosts

        # Wait for the hadoop cluster to be ready
        print '\nWaiting for ' + str(self.np) + ' datanodes to be ready ',
        print '(This may take a while) .... ',
        sys.stdout.flush()
        limit = 180
        retry = 0
        while retry < limit:
            time.sleep(1.0)
            # Test the hadoop cluster
            process = subprocess.Popen(['hdfs', 'dfsadmin', '-report'], env=self.env,
                                       stdout=subprocess.PIPE, stderr=FNULL)
            process.wait()
            trace = process.communicate()[0]
            retry += 1
            # Extract the datanode count from the report
            start = trace.find("Datanodes available:")
            end = trace.find("(", start)
            count = '0'
            if start > 0 and end > 0:
                count = trace[start:end].split()[-1]
            if count.isdigit() and (int(count) == self.np):
                print 'success'
                break

        # Check whether the report returned enough datanodes
        if retry >= limit:
            print 'fail'
            self.stop()
            sys.exit('Timeout: not enough datanodes in the cluster')

        self._start_hadoop_mapred()
        print 'Attention: Please source ' + self.tmpPath + HENV_FNAME + ' before using hadoop.\n'

    def stop(self):
        self._read_hosts_info()   # read info from the tmp dir
        if len(self.hostlist) == 0:
            sys.exit('Error: no existing hadoop cluster info')
        for host in self.hostlist:   # notify all workers
            hostserv = xmlrpclib.Server("http://" + host[0] + ":" + host[3])
            try:
                hostserv.terminate()
            except socket.error:
                print 'host ' + host[0] + ':' + host[3] + ' is not responding.'
        self._stop_hadoop()    # stop hadoop
        self._rm_tmpdir()      # remove temp directory

    def _prepare_submission_files(self):
        # Prepare the condor submission file
        self.submitFile.prepare_file([['<q.np>', str(self.np - 1)],
                                      ['<fullpath.client.script>', str(self.clientFile)],
                                      ['<output.dir>', OUTPUT_DIR + '/'],
                                      ['<core.config.file>', self.tmpPath + CONF_FNAME],
                                      ['<mprd.config.file>', self.tmpPath + MPRD_FNAME],
                                      ['<hdfs.config.tmp.file>', HDFS_TMP_FNAME],
                                      ['<env.config.file>', self.tmpPath + HENV_FNAME],
                                      ['<client.script>', self.clientFile.out_fname()]])
        # Prepare the client-side python script
        if self.debug:
            print 'serv ipop ip = ' + get_ipop_ip()
        self.clientFile.prepare_file([['<serv.ip>', get_ipop_ip()],
                                      ['<serv.port>', str(PORT)],
                                      ['<hadp.path>', self.nfsTmp + '/' + HADP_PATH],
                                      ['<java.path>', self.nfsTmp + '/' + JAVA_PATH],
                                      ['<hdfs.config.file>', HDFS_FNAME],
                                      ['<hdfs.config.tmp.file>', HDFS_TMP_FNAME],
                                      ['<rand>', self.rand]], True)

    def _read_hosts_info(self):
        self.hostlist = []
        with open(self.hostfname, 'r') as hostf:
            for line in hostf:
                self.hostlist.append(line.rstrip().split(":"))
        if self.debug:
            print '\nhost list:'
            for host in self.hostlist:
                print host
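# Hypothetical usage sketch (not part of the original module): bringing a
# Hadoop cluster up and down with the class above. The function name and the
# example worker count are assumptions for illustration only.
def run_hadoop_cluster(np):
    """Start a Hadoop cluster with np datanodes and return the cluster object."""
    cluster = HadoopCluster(np)
    cluster.start()   # starts HDFS and MapReduce daemons, waits for np datanodes
    return cluster    # the caller is expected to call cluster.stop() when finished


# Example (hypothetical values):
#   cluster = run_hadoop_cluster(4)
#   ... run hadoop jobs (source the generated hadoop-env.sh first) ...
#   cluster.stop()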