Example no. 1
A BigJob load test: start a pilot job on the target resource manager, submit number_jobs short /bin/date sub-jobs, and report queueing time, submission time, and total runtime.
# Imports assumed by this snippet (it ships without a header); the bigjob
# import path matches the classic BigJob distribution but is an assumption.
import os
import time
import saga
from bigjob import bigjob, subjob

def load_test(coordination_url, number_jobs, number_nodes, number_cores_per_node):

    print "\n**************************************************************************************************************************************************\n"
    print ("Start test scenario - #nodes:%d, #cores/node:%d, #jobs: %d, coordination-url:%s, lrms-url:%s"%
          (number_nodes, number_cores_per_node, number_jobs, coordination_url, LRMS_URL))
    print "\n**************************************************************************************************************************************************\n"      
    
    starttime = time.time()
    ##########################################################################################
    # Start BigJob
    # Parameters for BigJob
    lrms_url = LRMS_URL
    workingdirectory = "/N/u/luckow/src/bigjob-performance/agent"  # working directory for agent
   
    # start pilot job (bigjob_agent)
    print "Start Pilot Job/BigJob at: " + lrms_url
    bj = bigjob(coordination_url)
    bj.start_pilot_job(lrms_url=lrms_url,
                       number_nodes=number_nodes,
                       processes_per_node=number_cores_per_node,
                       working_directory=workingdirectory
                      )
        
    queueing_time = None
    subjob_submission_time = None
    pilot_state = str(bj.get_state_detail())
    if pilot_state == "Running" and queueing_time is None:
        queueing_time = time.time() - starttime
        print "*** Pilot State: " + pilot_state + " queue time: " + str(queueing_time)
    print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + pilot_state

    ##########################################################################################
    # Submit SubJob through BigJob
    jobs = []
    job_start_times = {}
    job_states = {}
    for i in range(0, number_jobs):
        jd = saga.job.description()
        jd.executable = "/bin/date"
        jd.number_of_processes = "1"
        jd.spmd_variation = "single"
        jd.arguments = [""]
        jd.working_directory = os.getcwd() 
        jd.output = "sj-stdout-"+str(i)+".txt"
        jd.error = "sj-stderr-"+str(i)+".txt"

        sj = subjob()
        sj.submit_job(bj.pilot_url, jd)
        jobs.append(sj)
        job_start_times[sj] = time.time()
        job_states[sj] = sj.get_state()

        if pilot_state != "Running":
            pilot_state = str(bj.get_state_detail())
            if pilot_state == "Running" and queueing_time is None:
                queueing_time = time.time() - starttime
                print "*** Pilot State: " + pilot_state + " queue time: " + str(queueing_time)

    subjob_submission_time = time.time() - starttime
    # busy wait for completion
    while True:
        pilot_state = str(bj.get_state_detail())
        if pilot_state == "Running" and queueing_time is None:
            queueing_time = time.time() - starttime
            print "*** Pilot State: " + pilot_state + " queue time: " + str(queueing_time)
        finish_counter = 0
        result_map = {}
        for i in range(0, number_jobs):
            old_state = job_states[jobs[i]]
            state = jobs[i].get_state()
            # tally how many sub-jobs are in each state
            result_map[state] = result_map.get(state, 0) + 1
            if old_state != state:
                print "Job " + str(jobs[i]) + " changed from: " + old_state + " to " + state
                if has_finished(state):
                    print "Job: " + str(jobs[i]) + " Runtime: " + str(time.time() - job_start_times[jobs[i]]) + " s."
            if has_finished(state):
                finish_counter = finish_counter + 1
            job_states[jobs[i]] = state

        print "Pilot State: %s; %d/%d jobs finished" % (pilot_state, finish_counter, number_jobs)
        # note: as written, the loop exits once all but one job have finished, or on pilot failure
        if finish_counter >= number_jobs - 1 or pilot_state == "Failed":
            break
        time.sleep(2)

    runtime = time.time() - starttime
    #print "Runtime: " + str(runtime) + " s; Runtime per Job: " + str(runtime/number_jobs)
    ##########################################################################################
    # Cleanup - stop BigJob

    result = ("%d,%d,%d,%s,%s,%s,%s,%s" %
              (number_nodes, number_cores_per_node, number_jobs, str(runtime), str(queueing_time), coordination_url, LRMS_URL, str(subjob_submission_time)))

    result_tab = ("%d\t%d\t%d\t%s\t%s\t%s\t%s" %
                  (number_nodes, number_cores_per_node, number_jobs, str(runtime), str(queueing_time), coordination_url, LRMS_URL))
    print ("#Nodes\t#cores/node\t#jobs\tRuntime\tQueuing Time\tCoordination URL\tLRMS URL")
    print result_tab

    bj.cancel()
    # hack: manually delete the user's leftover PBS jobs
    os.system("qstat -u `whoami` | grep -o ^[0-9]* | xargs qdel")
    #os.system("saga-advert remove_directory advert://advert.cct.lsu.edu:8080/bigjob")
    return result
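
The function references two module-level names that the snippet does not define: LRMS_URL, the resource URL of the batch system, and has_finished, a predicate on sub-job states. A minimal sketch of that surrounding context, with placeholder values that are assumptions rather than anything from the original:

LRMS_URL = "pbspro://localhost"  # assumption: point this at your resource manager

def has_finished(state):
    # assumption: treat the usual BigJob terminal states as finished
    return str(state).lower() in ("done", "failed", "canceled")

if __name__ == "__main__":
    # assumption: a Redis coordination backend on the default port
    result = load_test("redis://localhost:6379", number_jobs=8,
                       number_nodes=1, number_cores_per_node=4)
    print result
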
Example no. 2
The map phase of a MapReduce-style class built on top of BigJob: start a pilot job, submit one map sub-job per input chunk, and wait for all map tasks to complete. It assumes the same imports as Example no. 1, plus sys.
    def map_job_submit(self):
        ##########################################################################################
        print " >>> Starting BigJob ..................... \n"
        jobs = []
        job_start_times = {}
        job_states = {}
        print " >>> Create bigjob with advert service at ... ", self.advert_host + "\n"

        print " >> BigJob parameters " + self.advert_host + "\n"
        print " >> resource url " + self.resource_url + "\n"
        print " >> Number of processes " + str(self.number_of_processes) + "\n"
        print " >> Queue " + str(self.queue) + "\n"
        print " >> Allocation " + str(self.allocation) + "\n"
        print " >> Working directory" + self.workingdirectory + "\n"
        print " >> userproxy " + str(self.userproxy) + "\n"
        print " >> walltime " + str(self.walltime) + "\n"
        print " >> ppn " + str(self.ppn) + "\n"

        self.__bj = bigjob(self.advert_host)
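        # The positional arguments below are, judging from the keyword call in
        # Example no. 1 (an assumption, not confirmed by this snippet): resource
        # URL, agent executable (None = default), number of processes, queue,
        # allocation, working directory, user proxy, walltime, and processes
        # per node (ppn).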
        self.__bj.start_pilot_job(self.resource_url, None,
                                  self.number_of_processes, self.queue,
                                  self.allocation, self.workingdirectory,
                                  self.userproxy, self.walltime, self.ppn)
        i = 0
        for u in self.__chunk_list:
            uname = "-".join(u)
            i = i + 1
            print " >>> chunk path/name to be submitted to map subjob  " + uname

            # create job description
            try:

                jd = saga.job.description()
                jd.executable = self.__mapper
                jd.number_of_processes = self.npworkers
                jd.spmd_variation = "single"
                jd.arguments = u + [str(self.__nbr_reduces)] + self.maparg
                jd.working_directory = saga.url(self.__tmp_dir).path
                jd.output = "stdout-map" + str(i) + ".txt"
                jd.error = "stderr-map" + str(i) + ".txt"
                sj = subjob()
                sj.submit_job(self.__bj.pilot_url, jd)
                print "Submited sub-job " + uname + "."
                jobs.append(sj)
                job_start_times[sj] = time.time()
                job_states[sj] = sj.get_state()
            except:
                #traceback.print_exc(file=sys.stdout)
                print " Map Job failed. Cancelling bigjob......"
                try:
                    self.__bj.cancel()
                except:
                    pass
                sys.exit(0)

        print "************************ All Jobs submitted ************************"
        print " No of map subjobs created - " + str(len(jobs))

        ############################################################################################
        # Wait for task completion of map tasks - synchronization
        wait_for_all_jobs(jobs, job_start_times, job_states, 5)
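
wait_for_all_jobs is called above but never defined in the snippet. A minimal sketch of such a helper, assuming the has_finished predicate sketched under Example no. 1 and taking the final argument as a poll interval in seconds (both assumptions):

def wait_for_all_jobs(jobs, job_start_times, job_states, poll_interval=5):
    # assumption: poll every subjob until all reach a terminal state,
    # reporting per-job runtimes as they finish
    while True:
        finished = 0
        for sj in jobs:
            state = str(sj.get_state())
            if state != job_states[sj]:
                print "Job " + str(sj) + " changed from " + job_states[sj] + " to " + state
                if has_finished(state):
                    print "Job runtime: " + str(time.time() - job_start_times[sj]) + " s."
                job_states[sj] = state
            if has_finished(state):
                finished = finished + 1
        print "%d/%d map subjobs finished" % (finished, len(jobs))
        if finished == len(jobs):
            break
        time.sleep(poll_interval)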