def _poll_pilot_state(bj, starttime, queueing_time):
    """Fetch the pilot state and record the queueing time on first "Running".

    Args:
        bj: pilot job object; only ``get_state_detail()`` is called on it.
        starttime: ``time.time()`` value taken when the scenario started.
        queueing_time: queueing time measured so far, or ``None`` if the
            pilot has not yet been observed in state "Running".

    Returns:
        A ``(pilot_state, queueing_time)`` tuple. ``queueing_time`` is only
        updated (and reported on stdout) the first time the state is
        "Running"; otherwise the value passed in is returned unchanged.
    """
    pilot_state = str(bj.get_state_detail())
    if pilot_state == "Running" and queueing_time is None:
        queueing_time = time.time() - starttime
        print("*** Pilot State: " + pilot_state + " queue time: " + str(queueing_time))
    return pilot_state, queueing_time


def load_test(coordination_url, number_jobs, number_nodes, number_cores_per_node,
              working_directory="/N/u/luckow/src/bigjob-performance/agent",
              poll_interval=2):
    """Run one load-test scenario: start a pilot, submit sub-jobs, time them.

    A pilot job is started at the module-level ``LRMS_URL``, ``number_jobs``
    sub-jobs running ``/bin/date`` are submitted through it, and their states
    are polled until the termination condition below is met. The pilot is
    cancelled afterwards, including when submission or polling raises.

    Args:
        coordination_url: URL handed to ``bigjob(...)``.
        number_jobs: number of sub-jobs to submit.
        number_nodes: ``number_nodes`` argument of ``start_pilot_job``.
        number_cores_per_node: ``processes_per_node`` argument of
            ``start_pilot_job``.
        working_directory: working directory passed to ``start_pilot_job``;
            defaults to the previously hard-coded path.
        poll_interval: seconds to sleep between two polling rounds.

    Returns:
        A comma-separated string with number_nodes, number_cores_per_node,
        number_jobs, total runtime, queueing time (``None`` if the pilot was
        never seen "Running"), coordination_url, LRMS_URL and the time spent
        until all sub-jobs were submitted.
    """
    banner = "\n**************************************************************************************************************************************************\n"
    print(banner)
    print("Start test scenario - #nodes:%d, #cores/node:%d, #jobs: %d, coordination-url:%s, lrms-url:%s"
          % (number_nodes, number_cores_per_node, number_jobs, coordination_url, LRMS_URL))
    print(banner)
    starttime = time.time()

    ##########################################################################################
    # Start BigJob
    lrms_url = LRMS_URL
    print("Start Pilot Job/BigJob at: " + lrms_url)
    bj = bigjob(coordination_url)
    bj.start_pilot_job(lrms_url=lrms_url,
                       number_nodes=number_nodes,
                       processes_per_node=number_cores_per_node,
                       working_directory=working_directory)

    # From here on a pilot exists, so make sure it is cancelled even when
    # submission or polling fails; otherwise it would keep its allocation.
    try:
        pilot_state, queueing_time = _poll_pilot_state(bj, starttime, None)
        print("Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + pilot_state)

        ######################################################################################
        # Submit SubJob through BigJob
        jobs = []
        job_start_times = {}
        job_states = {}
        for i in range(number_jobs):
            jd = saga.job.description()
            jd.executable = "/bin/date"
            jd.number_of_processes = "1"
            jd.spmd_variation = "single"
            jd.arguments = [""]
            jd.working_directory = os.getcwd()
            jd.output = "sj-stdout-" + str(i) + ".txt"
            jd.error = "sj-stderr-" + str(i) + ".txt"

            sj = subjob()
            sj.submit_job(bj.pilot_url, jd)
            jobs.append(sj)
            job_start_times[sj] = time.time()
            job_states[sj] = sj.get_state()

            # Keep watching for the pilot to become active while submitting,
            # so the queueing time is not inflated by the submission loop.
            if pilot_state != "Running":
                pilot_state, queueing_time = _poll_pilot_state(bj, starttime, queueing_time)
        subjob_submission_time = time.time() - starttime

        # busy wait for completion
        while True:
            pilot_state, queueing_time = _poll_pilot_state(bj, starttime, queueing_time)
            finish_counter = 0
            for sj in jobs:
                old_state = job_states[sj]
                state = sj.get_state()
                finished = has_finished(state)
                if old_state != state:
                    print("Job " + str(sj) + " changed from: " + old_state + " to " + state)
                    if finished:
                        print("Job: " + str(sj) + " Runtime: "
                              + str(time.time() - job_start_times[sj]) + " s.")
                if finished:
                    finish_counter += 1
                job_states[sj] = state

            print("Pilot State: %s; %d/%d jobs finished" % (pilot_state, finish_counter, number_jobs))
            # NOTE(review): the loop ends once all but one sub-job have
            # finished (number_jobs - 1). This looks like an off-by-one, but
            # it is kept unchanged because it may be a deliberate workaround
            # and changing it would alter measured runtimes - confirm.
            if finish_counter >= number_jobs - 1 or pilot_state == "Failed":
                break
            time.sleep(poll_interval)

        runtime = time.time() - starttime

        ######################################################################################
        # Report
        result = ("%d,%d,%d,%s,%s,%s,%s,%s"
                  % (number_nodes, number_cores_per_node, number_jobs, str(runtime),
                     str(queueing_time), coordination_url, LRMS_URL,
                     str(subjob_submission_time)))
        result_tab = ("%d\t%d\t%d\t%s\t%s\t%s\t%s"
                      % (number_nodes, number_cores_per_node, number_jobs, str(runtime),
                         str(queueing_time), coordination_url, LRMS_URL))
        print("#Nodes\t#cores/node\t#jobs\tRuntime\tQueuing Time\tCoordination URL\tLRMS URL")
        print(result_tab)
    finally:
        ######################################################################################
        # Cleanup - stop BigJob
        bj.cancel()

    # hack: delete manually pbs jobs of user
    os.system("qstat -u `whoami` | grep -o ^[0-9]* |xargs qdel")
    return result
def map_job_submit(self):
    """Start the pilot job and submit one map sub-job per input chunk.

    The pilot is created from the configuration held on ``self``
    (``advert_host``, ``resource_url``, ``number_of_processes``, ``queue``,
    ``allocation``, ``workingdirectory``, ``userproxy``, ``walltime``,
    ``ppn``) and stored in ``self.__bj``. For every entry of
    ``self.__chunk_list`` a sub-job running ``self.__mapper`` is submitted;
    the submitted jobs are then handed to ``wait_for_all_jobs``.

    If describing or submitting any sub-job raises, the traceback is printed,
    the pilot is cancelled on a best-effort basis and the process exits with
    status 1.
    """
    import traceback  # only needed on the failure path

    ##########################################################################################
    print(" >>> Starting BigJob ..................... \n")
    jobs = []
    job_start_times = {}
    job_states = {}

    # The explicit " " reproduces the separator the two-item print statement
    # used to insert between the message and the host.
    print(" >>> Create bigjob with advert service at ... " + " " + self.advert_host + "\n")
    print(" >> BigJob parameters " + self.advert_host + "\n")
    print(" >> resource url " + self.resource_url + "\n")
    print(" >> Number of processes " + str(self.number_of_processes) + "\n")
    print(" >> Queue " + str(self.queue) + "\n")
    print(" >> Allocation " + str(self.allocation) + "\n")
    print(" >> Working directory" + self.workingdirectory + "\n")
    print(" >> userproxy " + str(self.userproxy) + "\n")
    print(" >> walltime " + str(self.walltime) + "\n")
    print(" >> ppn " + str(self.ppn) + "\n")

    self.__bj = bigjob(self.advert_host)
    self.__bj.start_pilot_job(self.resource_url,
                              None,
                              self.number_of_processes,
                              self.queue,
                              self.allocation,
                              self.workingdirectory,
                              self.userproxy,
                              self.walltime,
                              self.ppn)

    # Chunk numbers start at 1 and appear in the stdout/stderr file names.
    for i, u in enumerate(self.__chunk_list, 1):
        uname = "-".join(u)
        print(" >>> chunk path/name to be submitted to map subjob " + uname)
        try:
            # create job description
            jd = saga.job.description()
            jd.executable = self.__mapper
            jd.number_of_processes = self.npworkers
            jd.spmd_variation = "single"
            jd.arguments = u + [str(self.__nbr_reduces)] + self.maparg
            jd.working_directory = saga.url(self.__tmp_dir).path
            jd.output = "stdout-map" + str(i) + ".txt"
            jd.error = "stderr-map" + str(i) + ".txt"

            sj = subjob()
            sj.submit_job(self.__bj.pilot_url, jd)
            print("Submited sub-job " + uname + ".")
            jobs.append(sj)
            job_start_times[sj] = time.time()
            job_states[sj] = sj.get_state()
        except BaseException:
            # Deliberately as broad as the former bare except: the pilot must
            # not be left running, whatever interrupted the submission.
            traceback.print_exc(file=sys.stdout)
            print(" Map Job failed. Cancelling bigjob......")
            try:
                self.__bj.cancel()
            except Exception:
                # Best effort only; the submission failure is what matters.
                traceback.print_exc(file=sys.stdout)
            # Non-zero status so that callers can detect the failure.
            sys.exit(1)

    print("************************ All Jobs submitted ************************")
    print(" No of map subjobs created - " + str(len(jobs)))

    ############################################################################################
    # Wait for task completion of map tasks - synchronization
    wait_for_all_jobs(jobs, job_start_times, job_states, 5)
def map_job_submit(self):
    """Start the pilot job and submit one map sub-job per input chunk.

    The pilot is created from the configuration held on ``self``
    (``advert_host``, ``resource_url``, ``number_of_processes``, ``queue``,
    ``allocation``, ``workingdirectory``, ``userproxy``, ``walltime``,
    ``ppn``) and stored in ``self.__bj``. For every entry of
    ``self.__chunk_list`` a sub-job running ``self.__mapper`` is submitted;
    the submitted jobs are then handed to ``wait_for_all_jobs``.

    If describing or submitting any sub-job raises, the traceback is printed,
    the pilot is cancelled on a best-effort basis and the process exits with
    status 1.
    """
    import traceback  # only needed on the failure path

    ##########################################################################################
    print(" >>> Starting BigJob ..................... \n")
    jobs = []
    job_start_times = {}
    job_states = {}

    # The explicit " " reproduces the separator the two-item print statement
    # used to insert between the message and the host.
    print(" >>> Create bigjob with advert service at ... " + " " + self.advert_host + "\n")
    print(" >> BigJob parameters " + self.advert_host + "\n")
    print(" >> resource url " + self.resource_url + "\n")
    print(" >> Number of processes " + str(self.number_of_processes) + "\n")
    print(" >> Queue " + str(self.queue) + "\n")
    print(" >> Allocation " + str(self.allocation) + "\n")
    print(" >> Working directory" + self.workingdirectory + "\n")
    print(" >> userproxy " + str(self.userproxy) + "\n")
    print(" >> walltime " + str(self.walltime) + "\n")
    print(" >> ppn " + str(self.ppn) + "\n")

    self.__bj = bigjob(self.advert_host)
    self.__bj.start_pilot_job(self.resource_url,
                              None,
                              self.number_of_processes,
                              self.queue,
                              self.allocation,
                              self.workingdirectory,
                              self.userproxy,
                              self.walltime,
                              self.ppn)

    # Chunk numbers start at 1 and appear in the stdout/stderr file names.
    for i, u in enumerate(self.__chunk_list, 1):
        uname = "-".join(u)
        print(" >>> chunk path/name to be submitted to map subjob " + uname)
        try:
            # create job description
            jd = saga.job.description()
            jd.executable = self.__mapper
            jd.number_of_processes = self.npworkers
            jd.spmd_variation = "single"
            jd.arguments = u + [str(self.__nbr_reduces)] + self.maparg
            jd.working_directory = saga.url(self.__tmp_dir).path
            jd.output = "stdout-map" + str(i) + ".txt"
            jd.error = "stderr-map" + str(i) + ".txt"

            sj = subjob()
            sj.submit_job(self.__bj.pilot_url, jd)
            print("Submited sub-job " + uname + ".")
            jobs.append(sj)
            job_start_times[sj] = time.time()
            job_states[sj] = sj.get_state()
        except BaseException:
            # Deliberately as broad as the former bare except: the pilot must
            # not be left running, whatever interrupted the submission.
            traceback.print_exc(file=sys.stdout)
            print(" Map Job failed. Cancelling bigjob......")
            try:
                self.__bj.cancel()
            except Exception:
                # Best effort only; the submission failure is what matters.
                traceback.print_exc(file=sys.stdout)
            # Non-zero status so that callers can detect the failure.
            sys.exit(1)

    print("************************ All Jobs submitted ************************")
    print(" No of map subjobs created - " + str(len(jobs)))

    ############################################################################################
    # Wait for task completion of map tasks - synchronization
    wait_for_all_jobs(jobs, job_start_times, job_states, 5)