def reduce_job_submit(self):
    """Move the sorted partition files to the output location and run the reduce phase.

    Steps, in order:

    1. Every file matching ``<tmp_dir>/*sorted-part-*`` is moved to
       ``self.__output_dir``; the elapsed time and total size are printed.
    2. For each reduce index ``0 .. self.__nbr_reduces - 1`` an argument string
       is built from the moved files whose path ends in ``-<index>``.  Each
       entry is prefixed with ``:``, so the string starts with a colon and is
       empty when no file matches.
    3. One sub-job running ``self.__reducer`` is submitted per argument string
       to the pilot job held in ``self.__bj``.
    4. The method blocks in ``wait_for_all_jobs`` and finally cancels the
       pilot job.

    If submitting a sub-job fails, the traceback is printed, the pilot job is
    cancelled on a best-effort basis and the process exits with status 1.
    """
    ##########################################################################################
    part_list_argument = []
    jobs = []
    job_start_times = {}
    job_states = {}

    output_path = saga.url(self.__output_dir).path
    print(" moving files .... to output location.... " + output_path)

    file_transfer_time = time.time()
    self.__sorted_partition_file_names = glob.glob(
        saga.url(self.__tmp_dir).path + "/*sorted-part-*")
    file_transfer_size = 0
    for partition_path in self.__sorted_partition_file_names:
        part_file = saga.filesystem.file(partition_path)
        # Read the size before moving; the handle refers to the old location.
        file_transfer_size = file_transfer_size + part_file.get_size()
        part_file.move(self.__output_dir)
    file_transfer_comp = time.time() - file_transfer_time

    print(" Time taken to transfer partition files to output location "
          + str(round(file_transfer_comp, 3)) + "\n\n")
    print(" The total file size transferred is(MB): "
          + str(round((file_transfer_size / (1024.0 * 1024.0)), 3)) + "\n\n")

    # One argument string per reducer: the moved files whose path ends in
    # "-<reduce index>", each prefixed with ":".
    for reduce_index in range(0, self.__nbr_reduces):
        suffix = str(reduce_index)
        part_list_string = ""
        for partition_path in self.__sorted_partition_file_names:
            if partition_path.split("-")[-1] == suffix:
                file_name = os.path.split(partition_path)[1]
                part_list_string = (part_list_string + ":" + output_path
                                    + "/" + file_name)
        part_list_argument.append(part_list_string)

    for part_list in part_list_argument:
        print(" The argument list is " + part_list)
        # create job description
        try:
            jd2 = saga.job.description()
            jd2.executable = self.__reducer
            jd2.number_of_processes = self.npworkers
            jd2.spmd_variation = "single"
            jd2.arguments = [part_list] + self.redarg
            jd2.working_directory = output_path
            jd2.output = "stdout_reduce" + str(self.__nbr_reduce_jobs)
            jd2.error = "stderr-reduce" + str(self.__nbr_reduce_jobs)

            sj = subjob()
            sj.submit_job(self.__bj.pilot_url, jd2)
            print("Submited Reduce sub-job " + str(self.__nbr_reduce_jobs))
            jobs.append(sj)
            job_start_times[sj] = time.time()
            job_states[sj] = sj.get_state()
            self.__nbr_reduce_jobs = self.__nbr_reduce_jobs + 1
        except:
            # Deliberately broad: the process terminates here, and the pilot
            # job must be torn down even on Ctrl-C so it does not keep
            # holding the allocation.
            import traceback
            traceback.print_exc(file=sys.stdout)
            print(" Reduce Job failed. Cancelling bigjob......")
            try:
                self.__bj.cancel()
            except Exception:
                # Best effort: a failing cancel must not prevent the exit.
                traceback.print_exc(file=sys.stdout)
            # Non-zero status so callers can tell the run failed.
            sys.exit(1)

    print("************************ All Reduce Jobs submitted ************************")
    print(" Reduce subjobs created ")

    ############################################################################################
    # Wait for task completion of reduce tasks - synchronization
    # (the last argument is presumably the polling interval in seconds -
    # TODO confirm against wait_for_all_jobs)
    wait_for_all_jobs(jobs, job_start_times, job_states, 5)
    ############################################################################################
    self.__bj.cancel()
def map_job_submit(self):
    """Start the pilot job and run the map phase.

    Prints the pilot-job parameters, creates the ``bigjob`` instance stored in
    ``self.__bj`` and starts the pilot job.  One sub-job running
    ``self.__mapper`` is then submitted per entry of ``self.__chunk_list``.
    Each sub-job receives the chunk entries, the number of reduces (as a
    string) and ``self.maparg`` as arguments, and runs in the temporary
    directory.  Finally the method blocks in ``wait_for_all_jobs``.

    If submitting a sub-job fails, the traceback is printed, the pilot job is
    cancelled on a best-effort basis and the process exits with status 1.
    """
    ##########################################################################################
    print(" >>> Starting BigJob ..................... \n")
    jobs = []
    job_start_times = {}
    job_states = {}

    print(" >>> Create bigjob with advert service at ... " + " " + self.advert_host + "\n")
    print(" >> BigJob parameters " + self.advert_host + "\n")
    print(" >> resource url " + self.resource_url + "\n")
    print(" >> Number of processes " + str(self.number_of_processes) + "\n")
    print(" >> Queue " + str(self.queue) + "\n")
    print(" >> Allocation " + str(self.allocation) + "\n")
    print(" >> Working directory" + self.workingdirectory + "\n")
    print(" >> userproxy " + str(self.userproxy) + "\n")
    print(" >> walltime " + str(self.walltime) + "\n")
    print(" >> ppn " + str(self.ppn) + "\n")

    self.__bj = bigjob(self.advert_host)
    self.__bj.start_pilot_job(self.resource_url,
                              None,
                              self.number_of_processes,
                              self.queue,
                              self.allocation,
                              self.workingdirectory,
                              self.userproxy,
                              self.walltime,
                              self.ppn)

    # 1-based counter used only to name the per-job stdout/stderr files.
    chunk_number = 0
    for chunk in self.__chunk_list:
        uname = "-".join(chunk)
        chunk_number = chunk_number + 1
        print(" >>> chunk path/name to be submitted to map subjob " + uname)
        # create job description
        try:
            jd = saga.job.description()
            jd.executable = self.__mapper
            jd.number_of_processes = self.npworkers
            jd.spmd_variation = "single"
            jd.arguments = chunk + [str(self.__nbr_reduces)] + self.maparg
            jd.working_directory = saga.url(self.__tmp_dir).path
            jd.output = "stdout-map" + str(chunk_number) + ".txt"
            jd.error = "stderr-map" + str(chunk_number) + ".txt"

            sj = subjob()
            sj.submit_job(self.__bj.pilot_url, jd)
            print("Submited sub-job " + uname + ".")
            jobs.append(sj)
            job_start_times[sj] = time.time()
            job_states[sj] = sj.get_state()
        except:
            # Deliberately broad: the process terminates here, and the pilot
            # job must be torn down even on Ctrl-C so it does not keep
            # holding the allocation.
            import traceback
            traceback.print_exc(file=sys.stdout)
            print(" Map Job failed. Cancelling bigjob......")
            try:
                self.__bj.cancel()
            except Exception:
                # Best effort: a failing cancel must not prevent the exit.
                traceback.print_exc(file=sys.stdout)
            # Non-zero status so callers can tell the run failed.
            sys.exit(1)

    print("************************ All Jobs submitted ************************")
    print(" No of map subjobs created - " + str(len(jobs)))

    ############################################################################################
    # Wait for task completion of map tasks - synchronization
    # (the last argument is presumably the polling interval in seconds -
    # TODO confirm against wait_for_all_jobs)
    wait_for_all_jobs(jobs, job_start_times, job_states, 5)
def reduce_job_submit(self):
    """Move the sorted partition files to the output location and run the reduce phase.

    Steps, in order:

    1. Every file matching ``<tmp_dir>/*sorted-part-*`` is moved to
       ``self.__output_dir``; the elapsed time and total size are printed.
    2. For each reduce index ``0 .. self.__nbr_reduces - 1`` an argument string
       is built from the moved files whose path ends in ``-<index>``.  Each
       entry is prefixed with ``:``, so the string starts with a colon and is
       empty when no file matches.
    3. One sub-job running ``self.__reducer`` is submitted per argument string
       to the pilot job held in ``self.__bj``.
    4. The method blocks in ``wait_for_all_jobs`` and finally cancels the
       pilot job.

    If submitting a sub-job fails, the traceback is printed, the pilot job is
    cancelled on a best-effort basis and the process exits with status 1.
    """
    ##########################################################################################
    part_list_argument = []
    jobs = []
    job_start_times = {}
    job_states = {}

    output_path = saga.url(self.__output_dir).path
    print(" moving files .... to output location.... " + output_path)

    file_transfer_time = time.time()
    self.__sorted_partition_file_names = glob.glob(
        saga.url(self.__tmp_dir).path + "/*sorted-part-*")
    file_transfer_size = 0
    for partition_path in self.__sorted_partition_file_names:
        part_file = saga.filesystem.file(partition_path)
        # Read the size before moving; the handle refers to the old location.
        file_transfer_size = file_transfer_size + part_file.get_size()
        part_file.move(self.__output_dir)
    file_transfer_comp = time.time() - file_transfer_time

    print(" Time taken to transfer partition files to output location "
          + str(round(file_transfer_comp, 3)) + "\n\n")
    print(" The total file size transferred is(MB): "
          + str(round((file_transfer_size / (1024.0 * 1024.0)), 3)) + "\n\n")

    # One argument string per reducer: the moved files whose path ends in
    # "-<reduce index>", each prefixed with ":".
    for reduce_index in range(0, self.__nbr_reduces):
        suffix = str(reduce_index)
        part_list_string = ""
        for partition_path in self.__sorted_partition_file_names:
            if partition_path.split("-")[-1] == suffix:
                file_name = os.path.split(partition_path)[1]
                part_list_string = (part_list_string + ":" + output_path
                                    + "/" + file_name)
        part_list_argument.append(part_list_string)

    for part_list in part_list_argument:
        print(" The argument list is " + part_list)
        # create job description
        try:
            jd2 = saga.job.description()
            jd2.executable = self.__reducer
            jd2.number_of_processes = self.npworkers
            jd2.spmd_variation = "single"
            jd2.arguments = [part_list] + self.redarg
            jd2.working_directory = output_path
            jd2.output = "stdout_reduce" + str(self.__nbr_reduce_jobs)
            jd2.error = "stderr-reduce" + str(self.__nbr_reduce_jobs)

            sj = subjob()
            sj.submit_job(self.__bj.pilot_url, jd2)
            print("Submited Reduce sub-job " + str(self.__nbr_reduce_jobs))
            jobs.append(sj)
            job_start_times[sj] = time.time()
            job_states[sj] = sj.get_state()
            self.__nbr_reduce_jobs = self.__nbr_reduce_jobs + 1
        except:
            # Deliberately broad: the process terminates here, and the pilot
            # job must be torn down even on Ctrl-C so it does not keep
            # holding the allocation.
            import traceback
            traceback.print_exc(file=sys.stdout)
            print(" Reduce Job failed. Cancelling bigjob......")
            try:
                self.__bj.cancel()
            except Exception:
                # Best effort: a failing cancel must not prevent the exit.
                traceback.print_exc(file=sys.stdout)
            # Non-zero status so callers can tell the run failed.
            sys.exit(1)

    print("************************ All Reduce Jobs submitted ************************")
    print(" Reduce subjobs created ")

    ############################################################################################
    # Wait for task completion of reduce tasks - synchronization
    # (the last argument is presumably the polling interval in seconds -
    # TODO confirm against wait_for_all_jobs)
    wait_for_all_jobs(jobs, job_start_times, job_states, 5)
    ############################################################################################
    self.__bj.cancel()
def load_test(coordination_url, number_jobs, number_nodes, number_cores_per_node,
              workingdirectory="/N/u/luckow/src/bigjob-performance/agent"):
    """Run one load-test scenario against a pilot job and report timings.

    Starts a pilot job on ``LRMS_URL`` through ``coordination_url``, submits
    ``number_jobs`` sub-jobs that each run ``/bin/date``, polls every two
    seconds until the jobs have finished (or the pilot reports ``Failed``),
    prints a tab-separated summary and cancels the pilot job.

    Parameters:
        coordination_url: URL handed to ``bigjob(...)``.
        number_jobs: number of sub-jobs to submit.
        number_nodes: nodes requested for the pilot job.
        number_cores_per_node: processes per node requested for the pilot job.
        workingdirectory: working directory of the pilot agent; defaults to
            the path that was previously hard-coded.

    Returns:
        A comma-separated string: nodes, cores per node, jobs, total runtime,
        queueing time (``None`` if the pilot was never seen ``Running``),
        coordination URL, LRMS URL and sub-job submission time.
    """
    banner = "\n**************************************************************************************************************************************************\n"
    print(banner)
    print("Start test scenario - #nodes:%d, #cores/node:%d, #jobs: %d, coordination-url:%s, lrms-url:%s" %
          (number_nodes, number_cores_per_node, number_jobs, coordination_url, LRMS_URL))
    print(banner)

    starttime = time.time()

    def poll_pilot_state(queueing_time):
        # Return (pilot state, queueing time); the queueing time is recorded
        # and printed the first time the pilot is observed as "Running".
        state = str(bj.get_state_detail())
        if state == "Running" and queueing_time is None:
            queueing_time = time.time() - starttime
            print("*** Pilot State: " + state + " queue time: " + str(queueing_time))
        return state, queueing_time

    ##########################################################################################
    # Start BigJob
    lrms_url = LRMS_URL

    # start pilot job (bigjob_agent)
    print("Start Pilot Job/BigJob at: " + lrms_url)
    bj = bigjob(coordination_url)
    bj.start_pilot_job(lrms_url=lrms_url,
                       number_nodes=number_nodes,
                       processes_per_node=number_cores_per_node,
                       working_directory=workingdirectory)

    queueing_time = None
    subjob_submission_time = None
    pilot_state, queueing_time = poll_pilot_state(queueing_time)
    print("Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + pilot_state)

    ##########################################################################################
    # Submit SubJob through BigJob
    jobs = []
    job_start_times = {}
    job_states = {}
    for job_index in range(0, number_jobs):
        jd = saga.job.description()
        jd.executable = "/bin/date"
        jd.number_of_processes = "1"
        jd.spmd_variation = "single"
        jd.arguments = [""]
        jd.working_directory = os.getcwd()
        jd.output = "sj-stdout-" + str(job_index) + ".txt"
        jd.error = "sj-stderr-" + str(job_index) + ".txt"

        sj = subjob()
        sj.submit_job(bj.pilot_url, jd)
        jobs.append(sj)
        job_start_times[sj] = time.time()
        job_states[sj] = sj.get_state()

        # Keep watching for the pilot to leave the queue while submitting,
        # so the queueing time is captured as early as possible.
        if pilot_state != "Running":
            pilot_state, queueing_time = poll_pilot_state(queueing_time)

    subjob_submission_time = time.time() - starttime

    # busy wait for completion
    while 1:
        pilot_state, queueing_time = poll_pilot_state(queueing_time)
        finish_counter = 0
        for job in jobs:
            old_state = job_states[job]
            state = job.get_state()
            finished = has_finished(state)
            if old_state != state:
                print("Job " + str(job) + " changed from: " + old_state + " to " + state)
                if finished:
                    print("Job: " + str(job) + " Runtime: "
                          + str(time.time() - job_start_times[job]) + " s.")
            if finished:
                finish_counter = finish_counter + 1
            job_states[job] = state

        print("Pilot State: %s; %d/%d jobs finished" % (pilot_state, finish_counter, number_jobs))
        # NOTE(review): the loop stops once all but one job have finished, so
        # one straggler is tolerated (and number_jobs == 1 never waits).
        # Kept as-is; confirm whether this threshold is intentional.
        if finish_counter >= number_jobs - 1 or pilot_state == "Failed":
            break
        time.sleep(2)

    runtime = time.time() - starttime

    ##########################################################################################
    # Cleanup - stop BigJob
    result = ("%d,%d,%d,%s,%s,%s,%s,%s" %
              (number_nodes, number_cores_per_node, number_jobs, str(runtime),
               str(queueing_time), coordination_url, LRMS_URL,
               str(subjob_submission_time)))

    result_tab = ("%d\t%d\t%d\t%s\t%s\t%s\t%s" %
                  (number_nodes, number_cores_per_node, number_jobs, str(runtime),
                   str(queueing_time), coordination_url, LRMS_URL))
    print("#Nodes\t#cores/node\t#jobs\tRuntime\tQueuing Time\tCoordination URL\tLRMS URL")
    print(result_tab)

    bj.cancel()
    # HACK: manually delete the PBS jobs of the current user.  The pipeline
    # feeds every job id listed by qstat for this user to qdel, not only the
    # jobs started by this test.
    os.system("qstat -u `whoami` | grep -o ^[0-9]* |xargs qdel")
    return result
def map_job_submit(self):
    """Start the pilot job and run the map phase.

    Prints the pilot-job parameters, creates the ``bigjob`` instance stored in
    ``self.__bj`` and starts the pilot job.  One sub-job running
    ``self.__mapper`` is then submitted per entry of ``self.__chunk_list``.
    Each sub-job receives the chunk entries, the number of reduces (as a
    string) and ``self.maparg`` as arguments, and runs in the temporary
    directory.  Finally the method blocks in ``wait_for_all_jobs``.

    If submitting a sub-job fails, the traceback is printed, the pilot job is
    cancelled on a best-effort basis and the process exits with status 1.
    """
    ##########################################################################################
    print(" >>> Starting BigJob ..................... \n")
    jobs = []
    job_start_times = {}
    job_states = {}

    print(" >>> Create bigjob with advert service at ... " + " " + self.advert_host + "\n")
    print(" >> BigJob parameters " + self.advert_host + "\n")
    print(" >> resource url " + self.resource_url + "\n")
    print(" >> Number of processes " + str(self.number_of_processes) + "\n")
    print(" >> Queue " + str(self.queue) + "\n")
    print(" >> Allocation " + str(self.allocation) + "\n")
    print(" >> Working directory" + self.workingdirectory + "\n")
    print(" >> userproxy " + str(self.userproxy) + "\n")
    print(" >> walltime " + str(self.walltime) + "\n")
    print(" >> ppn " + str(self.ppn) + "\n")

    self.__bj = bigjob(self.advert_host)
    self.__bj.start_pilot_job(self.resource_url,
                              None,
                              self.number_of_processes,
                              self.queue,
                              self.allocation,
                              self.workingdirectory,
                              self.userproxy,
                              self.walltime,
                              self.ppn)

    # 1-based counter used only to name the per-job stdout/stderr files.
    chunk_number = 0
    for chunk in self.__chunk_list:
        uname = "-".join(chunk)
        chunk_number = chunk_number + 1
        print(" >>> chunk path/name to be submitted to map subjob " + uname)
        # create job description
        try:
            jd = saga.job.description()
            jd.executable = self.__mapper
            jd.number_of_processes = self.npworkers
            jd.spmd_variation = "single"
            jd.arguments = chunk + [str(self.__nbr_reduces)] + self.maparg
            jd.working_directory = saga.url(self.__tmp_dir).path
            jd.output = "stdout-map" + str(chunk_number) + ".txt"
            jd.error = "stderr-map" + str(chunk_number) + ".txt"

            sj = subjob()
            sj.submit_job(self.__bj.pilot_url, jd)
            print("Submited sub-job " + uname + ".")
            jobs.append(sj)
            job_start_times[sj] = time.time()
            job_states[sj] = sj.get_state()
        except:
            # Deliberately broad: the process terminates here, and the pilot
            # job must be torn down even on Ctrl-C so it does not keep
            # holding the allocation.
            import traceback
            traceback.print_exc(file=sys.stdout)
            print(" Map Job failed. Cancelling bigjob......")
            try:
                self.__bj.cancel()
            except Exception:
                # Best effort: a failing cancel must not prevent the exit.
                traceback.print_exc(file=sys.stdout)
            # Non-zero status so callers can tell the run failed.
            sys.exit(1)

    print("************************ All Jobs submitted ************************")
    print(" No of map subjobs created - " + str(len(jobs)))

    ############################################################################################
    # Wait for task completion of map tasks - synchronization
    # (the last argument is presumably the polling interval in seconds -
    # TODO confirm against wait_for_all_jobs)
    wait_for_all_jobs(jobs, job_start_times, job_states, 5)