def __start_bigjob(self, bj_dict):
    """Launch a pilot job for one resource entry and record its bookkeeping.

    A BigJob is created via ``bigjob.bigjob(self.advert_host)`` and started
    with the settings found in ``bj_dict``.

    Keys read from ``bj_dict``: ``resource_url``, ``bigjob_agent``,
    ``number_nodes``, ``queue``, ``allocation`` (all required) and
    ``processes_per_node``, ``walltime``, ``working_directory`` (optional).

    Keys written to ``bj_dict``: ``processes_per_node`` (only when it was
    missing), ``bigjob``, ``free_cores``, ``to_be_terminated`` and ``lock``.
    """
    resource_url = bj_dict["resource_url"]
    logging.debug("start bigjob at: " + resource_url)
    pilot = bigjob.bigjob(self.advert_host)

    # A missing processes-per-node entry defaults to "1" and the default is
    # written back so later readers of the dict see it as well.
    if "processes_per_node" not in bj_dict:
        bj_dict["processes_per_node"] = "1"
    cores_per_node = bj_dict["processes_per_node"]

    max_runtime = bj_dict.get("walltime", 3600)
    agent_dir = bj_dict.get("working_directory", os.getcwd() + "/agent")

    pilot.start_pilot_job(
        resource_url,
        bj_dict["bigjob_agent"],
        bj_dict["number_nodes"],
        bj_dict["queue"],
        bj_dict["allocation"],
        agent_dir,
        None,  # presumably the user-proxy slot (cf. other callers) - confirm
        max_runtime,
        cores_per_node,
    )

    # Keep the handle around so the pilot can be referenced later.
    bj_dict["bigjob"] = pilot
    bj_dict["free_cores"] = int(bj_dict["number_nodes"]) * int(cores_per_node)
    bj_dict["to_be_terminated"] = False
    # Lock guarding modifications of the free-core counter.
    bj_dict["lock"] = threading.Lock()
def __init__(self, pilot_compute_service=None, bigjob_object=None,
             pilot_compute_description=None, pilot_url=None):
    """Create a new Pilot Compute or reconnect to an existing one.

    Keyword arguments:
    pilot_compute_service -- service owning this pilot; when given (creation
                             path), its ``coordination_queue`` is copied onto
                             this object
    bigjob_object -- BigJob instance to wrap (creation path)
    pilot_compute_description -- description stored on the instance
                                 (creation path)
    pilot_url -- URL of an existing pilot; when given, the remaining
                 arguments are ignored and the BigJob handle is rebuilt
                 from this URL

    On the reconnect path neither ``pilot_compute_description`` nor
    ``coordination_queue`` is set on the instance.
    """
    self.__subjobs = []
    self.__pilot_compute_service = None

    # Identity checks are used for None so that objects with a custom
    # __eq__/__ne__ cannot influence which path is taken.
    if pilot_url is None:
        logger.debug("Create PilotCompute for BigJob: " + str(bigjob_object))
        self.pilot_compute_description = pilot_compute_description
        self.__pilot_compute_service = pilot_compute_service
        self.__bigjob = bigjob_object
    else:
        logger.debug("Reconnect to an existing Pilot Compute")
        self.__bigjob = bigjob(pilot_url=pilot_url)

    # Remember the coordination queue of the owning service; it serves as
    # the central queue for a set of BigJobs in ComputeDataServiceDecentral.
    if self.__pilot_compute_service is not None:
        self.coordination_queue = pilot_compute_service.coordination_queue
def main():
    """Run a single echo sub-job through a pilot job, then cancel the pilot.

    Starts a BigJob at ``lrms_url`` using ``COORDINATION_URL``, submits one
    ``/bin/echo`` sub-job, polls its state every two seconds until it reports
    "Failed" or "Done", and finally cancels the pilot.
    """
    # ---- Pilot parameters (edit as needed) --------------------------------
    queue = None              # None: use the default queue
    project = None            # None: use the default allocation
    walltime = 10
    processes_per_node = 1
    number_of_processes = 1
    workingdirectory = "."    # working directory for the agent
    userproxy = None          # not supported yet (context issue with SAGA)

    # Alternative resource URL: "ec2+ssh://localhost"
    # NOTE(review): "locahost" looks like a typo for "localhost" - confirm
    # whether the host part of a gce+ssh URL is actually evaluated.
    lrms_url = "gce+ssh://locahost"

    # ---- Start the pilot ---------------------------------------------------
    print("Start Pilot Job/BigJob at: " + lrms_url)
    pilot = bigjob(COORDINATION_URL)
    pilot.start_pilot_job(
        lrms_url,
        number_of_processes,
        queue,
        project,
        workingdirectory,
        userproxy,
        walltime,
        processes_per_node,
    )
    print("Pilot Job/BigJob URL: " + pilot.pilot_url
          + " State: " + str(pilot.get_state()))

    # ---- Describe and submit the sub-job -----------------------------------
    job_desc = description()
    job_desc.executable = "/bin/echo"    # alternative: "$HOME/hello.sh"
    job_desc.number_of_processes = "1"
    job_desc.arguments = ["$HELLOWORLD"]
    job_desc.environment = ['HELLOWORLD=hello_world']
    job_desc.input_data = ["hi", "ho"]
    # Set job_desc.working_directory (e.g. "/tmp") to run the sub-job
    # outside of the bigjob sandbox.
    job_desc.output = "stdout.txt"
    job_desc.error = "stderr.txt"

    task = subjob()
    task.submit_job(pilot.pilot_url, job_desc)

    # ---- Poll until the sub-job reaches a terminal state -------------------
    state = str(task.get_state())
    print("state: " + state)
    while state not in ("Failed", "Done"):
        time.sleep(2)
        state = str(task.get_state())
        print("state: " + state)

    # ---- Cleanup: stop the pilot -------------------------------------------
    pilot.cancel()
def __start_bigjob(self, bj_dict):
    """Start a pilot job on the resource described by ``bj_dict``.

    A BigJob is created for ``self.coordination_url`` and started with
    keyword arguments taken from ``bj_dict``; ``self.coordination_queue`` is
    passed as the external queue.

    Keys read from ``bj_dict``: ``resource_url``, ``number_of_processes``,
    ``queue``, ``allocation``, ``pilot_compute_description`` (required) and
    ``processes_per_node``, ``walltime``, ``working_directory``,
    ``file_transfer`` (optional). When ``processes_per_node`` is missing it
    is written back into ``bj_dict`` as ``"1"``.

    Returns the started BigJob object.
    """
    resource_url = bj_dict["resource_url"]
    logger.debug("start bigjob at: " + resource_url)
    pilot = bigjob(self.coordination_url)

    # Persist the default so the dictionary always carries this key.
    if "processes_per_node" not in bj_dict:
        bj_dict["processes_per_node"] = "1"

    pilot.start_pilot_job(
        lrms_url=resource_url,
        number_nodes=bj_dict["number_of_processes"],
        queue=bj_dict["queue"],
        project=bj_dict["allocation"],
        working_directory=bj_dict.get("working_directory"),
        walltime=bj_dict.get("walltime", 3600),
        processes_per_node=bj_dict["processes_per_node"],
        filetransfers=bj_dict.get("file_transfer"),
        external_queue=self.coordination_queue,
        pilot_compute_description=bj_dict["pilot_compute_description"],
    )
    return pilot
def __init__(self,
             pilot_compute_service=None,
             bigjob_object=None,
             pilot_compute_description=None,
             pilot_url=None):
    """Create a Pilot Compute, or reconnect to one identified by a URL.

    Keyword arguments:
    pilot_compute_service -- owning service; on the creation path its
                             ``coordination_queue`` is copied to this object
    bigjob_object -- BigJob instance wrapped by this pilot (creation path)
    pilot_compute_description -- description kept on the instance
                                 (creation path)
    pilot_url -- when not None, the BigJob handle is restored from this URL
                 and the other arguments are not used

    After a reconnect the instance has no ``pilot_compute_description`` and
    no ``coordination_queue`` attribute.
    """
    self.__subjobs = []
    self.__pilot_compute_service = None

    # None is tested by identity (PEP 8) rather than with ==/!=, which can
    # be overridden by the compared object.
    reconnecting = pilot_url is not None
    if reconnecting:
        logger.debug("Reconnect to an existing Pilot Compute")
        self.__bigjob = bigjob(pilot_url=pilot_url)
    else:
        logger.debug("Create PilotCompute for BigJob: " + str(bigjob_object))
        self.pilot_compute_description = pilot_compute_description
        self.__pilot_compute_service = pilot_compute_service
        self.__bigjob = bigjob_object

    # The service's coordination queue is the central queue shared by a set
    # of BigJobs in ComputeDataServiceDecentral; keep a reference to it.
    if self.__pilot_compute_service is not None:
        self.coordination_queue = pilot_compute_service.coordination_queue
def main():
    """Start a pilot job, push twelve hostname sub-jobs through it and stop.

    The pilot is started at ``lrms_url`` via ``COORDINATION_URL``. Twelve
    sub-jobs sharing one description are submitted, then ``wait()`` and
    ``cancel()`` are called on the pilot.
    """
    # ---- Pilot parameters (edit as needed) --------------------------------
    queue = None                     # None: use the default queue
    project = None                   # None: use the default allocation
    walltime = 10
    processes_per_node = 8
    number_nodes = 24
    workingdirectory = os.getcwd()   # working directory for the agent
    userproxy = None                 # not supported yet (SAGA context issue)

    # URL of the SAGA Job Service used to dispatch the pilot job.
    # Accepted forms include:
    #   "gram://oliver1.loni.org/jobmanager-pbs"  globus (LONI)
    #   "pbspro://louie1.loni.org"                pbspro scheduler
    #                                             (Futuregrid or LSU machines)
    #   "ssh://louie1.loni.org"                   launch on the target machine,
    #                                             bypassing the scheduler
    #   "pbs-ssh://louie1.loni.org"               scheduler of a remote machine
    #   "xt5torque://localhost"                   torque
    # The matching SAGA adaptor must be installed and working.
    lrms_url = "pbs://localhost"     # run the jobs on localhost

    # ---- Start the pilot ---------------------------------------------------
    print("Start Pilot Job/BigJob at: " + lrms_url)
    pilot = bigjob(COORDINATION_URL)
    pilot.start_pilot_job(lrms_url, number_nodes, queue, project,
                          workingdirectory, userproxy, walltime,
                          processes_per_node)
    print("Pilot Job/BigJob URL: " + pilot.pilot_url
          + " State: " + str(pilot.get_state()))

    # ---- Describe the sub-job ----------------------------------------------
    job_desc = description()
    job_desc.executable = "/bin/hostname"
    job_desc.number_of_processes = "2"
    job_desc.spmd_variation = "single"
    job_desc.arguments = [""]
    # job_desc.working_directory may be set (e.g. "/tmp") if required.
    job_desc.output = "stdout.txt"
    job_desc.error = "stderr.txt"

    # ---- Submit twelve identical sub-jobs ----------------------------------
    for _ in range(12):
        task = subjob()
        task.submit_job(pilot.pilot_url, job_desc)

    # ---- Cleanup: wait on the pilot, then stop it --------------------------
    pilot.wait()
    pilot.cancel()
def main():
    """Submit one echo sub-job to a freshly started pilot and wait for it.

    A BigJob is started at ``lrms_url`` through ``COORDINATION_URL``; one
    ``/bin/echo`` sub-job is submitted and polled every two seconds until its
    state is "Failed" or "Done"; the pilot is then cancelled.
    """
    # ---- Settings for the pilot (edit as needed) --------------------------
    queue = None                 # default queue when None
    project = None               # default allocation when None
    walltime = 10
    processes_per_node = 1
    number_of_processes = 1
    workingdirectory = "."       # agent working directory
    userproxy = None             # unsupported so far (SAGA context issue)

    # Another possible resource URL is "ec2+ssh://localhost".
    # NOTE(review): "locahost" is probably meant to be "localhost" - confirm
    # before changing, the host part may be irrelevant for gce+ssh.
    lrms_url = "gce+ssh://locahost"

    # ---- Bring up the pilot ------------------------------------------------
    print("Start Pilot Job/BigJob at: " + lrms_url)
    pilot_job = bigjob(COORDINATION_URL)
    start_args = (lrms_url, number_of_processes, queue, project,
                  workingdirectory, userproxy, walltime, processes_per_node)
    pilot_job.start_pilot_job(*start_args)
    pilot_state = str(pilot_job.get_state())
    print("Pilot Job/BigJob URL: " + pilot_job.pilot_url
          + " State: " + pilot_state)

    # ---- Build the sub-job description -------------------------------------
    spec = description()
    spec.executable = "/bin/echo"        # or e.g. "$HOME/hello.sh"
    spec.number_of_processes = "1"
    spec.arguments = ["$HELLOWORLD"]
    spec.environment = ['HELLOWORLD=hello_world']
    spec.input_data = ["hi", "ho"]
    # An optional spec.working_directory (e.g. "/tmp") makes the sub-job run
    # outside of the bigjob sandbox.
    spec.output = "stdout.txt"
    spec.error = "stderr.txt"

    sub = subjob()
    sub.submit_job(pilot_job.pilot_url, spec)

    # ---- Busy-wait for completion ------------------------------------------
    finished = False
    while not finished:
        state = str(sub.get_state())
        print("state: " + state)
        finished = state in ("Failed", "Done")
        if not finished:
            time.sleep(2)

    # ---- Cleanup -----------------------------------------------------------
    pilot_job.cancel()
def __start_bigjob(self, bj_dict):
    """Start a pilot job for the resource entry ``bj_dict`` and return it.

    The BigJob is created for ``self.coordination_url`` and started with
    keyword arguments derived from ``bj_dict``; ``self.coordination_queue``
    is handed over as the external queue.

    Keys read from ``bj_dict``: ``resource_url``, ``number_of_processes``
    (converted with ``int``), ``queue``, ``project``, ``spmd_variation``,
    ``pilot_compute_description`` (required) and ``processes_per_node``,
    ``walltime``, ``working_directory``, ``file_transfer`` (optional).
    ``bj_dict`` itself is not modified.
    """
    resource_url = bj_dict["resource_url"]
    logger.debug("start bigjob at: " + resource_url)
    pilot = bigjob(self.coordination_url)

    # A missing entry and the literal string 'None' both fall back to "1".
    # NOTE(review): a real None value is passed through unchanged -
    # presumably the dict values are stringified upstream; confirm.
    cores_per_node = bj_dict.get("processes_per_node", 'None')
    if cores_per_node == 'None':
        cores_per_node = "1"

    max_runtime = bj_dict.get("walltime", 3600)
    agent_dir = bj_dict.get("working_directory")
    transfers = bj_dict.get("file_transfer")

    pilot.start_pilot_job(
        lrms_url=resource_url,
        number_nodes=int(bj_dict["number_of_processes"]),
        queue=bj_dict["queue"],
        project=bj_dict["project"],
        working_directory=agent_dir,
        walltime=max_runtime,
        processes_per_node=cores_per_node,
        filetransfers=transfers,
        spmd_variation=bj_dict["spmd_variation"],
        external_queue=self.coordination_queue,
        pilot_compute_description=bj_dict["pilot_compute_description"],
    )
    return pilot
def main():
    """Stage a file with the pilot, cat it in a sub-job, then stop the pilot.

    ``../test.txt`` (relative to this file) is listed as a pilot-level file
    transfer. One ``/bin/cat test.txt`` sub-job is submitted and both the
    pilot state and the sub-job state are printed every two seconds until the
    sub-job reports "Failed" or "Done"; afterwards the pilot is cancelled.
    """
    # ---- Pilot parameters (edit as needed) --------------------------------
    queue = None                 # None: use the default queue
    project = None               # None: use the default allocation
    walltime = 10
    processes_per_node = 4
    number_of_processes = 8
    workingdirectory = os.path.join(os.getcwd(), "agent")
    userproxy = None             # not supported yet (SAGA context issue)

    # URL of the SAGA Job Service used to dispatch the pilot job.
    # Accepted forms include:
    #   "gram://oliver1.loni.org/jobmanager-pbs"  globus (LONI)
    #   "pbspro://louie1.loni.org"                pbspro scheduler
    #                                             (Futuregrid or LSU machines)
    #   "ssh://louie1.loni.org"                   launch on the target machine,
    #                                             bypassing the scheduler
    #   "pbs-ssh://louie1.loni.org"               scheduler of a remote machine
    #   "xt5torque://localhost"                   torque
    # The matching SAGA adaptor must be installed and working.
    lrms_url = "condor://localhost"

    # Transfer specification: "<source> > <target>".
    here = os.path.dirname(os.path.abspath(__file__))
    input_file = os.path.join(here, "..", "test.txt")
    bj_filetransfers = [input_file + " > test.txt"]

    # ---- Start the pilot ---------------------------------------------------
    print("Start Pilot Job/BigJob at: " + lrms_url)
    pilot = bigjob(COORDINATION_URL)
    pilot.start_pilot_job(
        lrms_url,
        None,
        number_of_processes,
        queue,
        project,
        workingdirectory,
        userproxy,
        walltime,
        processes_per_node,
        bj_filetransfers,
    )
    print("Pilot Job/BigJob URL: " + pilot.pilot_url
          + " State: " + str(pilot.get_state()))

    # ---- Describe and submit the sub-job -----------------------------------
    job_desc = description()
    job_desc.executable = "/bin/cat"
    job_desc.number_of_processes = "1"
    job_desc.spmd_variation = "single"
    job_desc.arguments = ["test.txt"]
    job_desc.output = "sj-stdout.txt"
    job_desc.error = "sj-stderr.txt"

    task = subjob()
    task.submit_job(pilot.pilot_url, job_desc)

    # ---- Poll until the sub-job reaches a terminal state -------------------
    while True:
        state = str(task.get_state())
        pilot_state = pilot.get_state()
        print("bj state: " + str(pilot_state) + " state: " + state)
        if state in ("Failed", "Done"):
            break
        time.sleep(2)

    # ---- Cleanup: stop the pilot -------------------------------------------
    pilot.cancel()
return False """ Test Job Submission via Advert """ if __name__ == "__main__": starttime = time.time() if len(sys.argv) == 2: reconnect_url = sys.argv[1] else: print "Usage: " + sys.executable + " " + __file__ + " <BigJob URL to Reconnect to>" sys.exit(-1) print "Reconnect to Pilot Job/BigJob at: " + reconnect_url bj = bigjob(pilot_url=reconnect_url) print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str( bj.get_state() + " Number of SJs: " + str(len(bj.list_subjobs()))) ########################################################################################## # Submit some more subjobs if bj.get_state() != "Done": jobs = [] job_start_times = {} job_states = {} for i in range(0, NUMBER_JOBS): jd = description() jd.executable = "/bin/date" jd.number_of_processes = "1" jd.spmd_variation = "single"
temps.append(temp) ################################################################################## # Start BigJob # Parameter for BigJob # bigjob_agent = os.getcwd() + "/bigjob_agent_launcher.sh" # path to agent # bigjob_agent = "/bin/echo" nodes = BIGJOB_SIZE # number nodes for agent # workingdirectory=os.getcwd() +"/agent" # working directory for agent userproxy = None # userproxy (not supported yet due to context issue w/ SAGA) bjs = [] i = 0 NUMBER_BIGJOBS = int(NUMBER_BIGJOBS) for i in range(0, NUMBER_BIGJOBS): bj = bigjob.bigjob(advert_host) bjs.append(bj) if i == 0: queue = "workq" project = "loni_jhabig10" bigjob_agent = WORK_DIR + "/bigjob_agent_launcher.sh" # path to agent workingdirectory = WORK_DIR + "/agent" # working directory for agent lrms_url = "gram://" + HOST + "/jobmanager-pbs" elif i == 1: queue = "development" project = "TG-MCB090174" bigjob_agent = WORK_DIR1 + "/bigjob_agent_launcher.sh" # path to agent workingdirectory = WORK_DIR1 + "/agent" # working directory for agent lrms_url = "gram://" + REMOTE1 + "/jobmanager-sge" elif i == 2: bigjob_agent = WORK_DIR2 + "/bigjob_agent_launcher.sh" # path to agent
########################################################################################## # make sure you are familiar with the queue structure on futuregrid,ppn, your project id # and the walltime limits on each queue. change accordingly # queue="batch" # Queue to which BigJob has to be submitted, if None, default queue is considered. project=None # Allocation Information. if None, default information is considered walltime=60 # Time in minutes. There are limits on the time you can request processes_per_node=8 # ppn number_of_processes=24 # The total number of processes ( BigJob size), used to run Jobs workingdirectory= os.path.join(os.getcwd(), "agent") # working directory for agent. ########################################################################################## print "Start Pilot Job/BigJob at: " + RESOURCEMGR_URL bj = bigjob(COORDINATION_URL) bj.start_pilot_job( RESOURCEMGR_URL, None, number_of_processes, queue, project, workingdirectory, None, walltime, processes_per_node) print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()) ########################################################################################## jobs = [] job_start_times = {}
def main():
    """Run one ``/bin/date`` sub-job through a pilot job and stop the pilot.

    The pilot is started at ``lrms_url`` via ``COORDINATION_URL``. The
    sub-job state is polled every two seconds until it is "Failed" or "Done",
    after which the pilot is cancelled.
    """
    # ---- Pilot parameters (edit as needed) --------------------------------
    queue = None                 # None: use the default queue
    project = None               # None: use the default allocation
    walltime = 10
    processes_per_node = 4
    number_of_processes = 8
    workingdirectory = os.path.join(os.getcwd(), "agent")  # agent work dir
    userproxy = None             # not supported yet (SAGA context issue)

    # URL of the SAGA Job Service used to dispatch the pilot job.
    # Accepted forms include:
    #   "gram://oliver1.loni.org/jobmanager-pbs"  globus (LONI)
    #   "pbspro://louie1.loni.org"                pbspro scheduler
    #                                             (Futuregrid or LSU machines)
    #   "ssh://louie1.loni.org"                   launch on the target machine,
    #                                             bypassing the scheduler
    #   "pbs-ssh://louie1.loni.org"               scheduler of a remote machine
    #   "xt5torque://localhost"                   torque
    #   "fork://localhost"                        run the jobs on localhost
    # The matching SAGA adaptor must be installed and working.
    lrms_url = "condorg://brgw1.renci.org:2119/jobmanager-pbs"

    # ---- Start the pilot ---------------------------------------------------
    print("Start Pilot Job/BigJob at: " + lrms_url)
    pilot = bigjob(COORDINATION_URL)
    pilot.start_pilot_job(
        lrms_url,
        None,
        number_of_processes,
        queue,
        project,
        workingdirectory,
        userproxy,
        walltime,
        processes_per_node,
    )
    print("Pilot Job/BigJob URL: " + pilot.pilot_url
          + " State: " + str(pilot.get_state()))

    # ---- Describe and submit the sub-job -----------------------------------
    job_desc = description()
    job_desc.executable = "/bin/date"
    job_desc.number_of_processes = "1"
    job_desc.spmd_variation = "single"
    job_desc.arguments = [""]
    job_desc.output = "bfast-stdout.txt"
    job_desc.error = "bfast-stderr.txt"

    task = subjob()
    task.submit_job(pilot.pilot_url, job_desc)

    # ---- Poll until the sub-job reaches a terminal state -------------------
    state = str(task.get_state())
    print("state: " + state)
    while state not in ("Failed", "Done"):
        time.sleep(2)
        state = str(task.get_state())
        print("state: " + state)

    # ---- Cleanup: stop the pilot -------------------------------------------
    pilot.cancel()
def main():
    """Start a pilot, execute a single ``/bin/date`` sub-job and shut down.

    Uses ``COORDINATION_URL`` to create the BigJob, starts it at
    ``lrms_url``, submits one sub-job and prints its state every two seconds
    until the state is "Failed" or "Done". The pilot is cancelled at the end.
    """
    # ---- Settings for the pilot (edit as needed) --------------------------
    queue = None                 # default queue when None
    project = None               # default allocation when None
    walltime = 10
    processes_per_node = 4
    number_of_processes = 8
    workingdirectory = os.path.join(os.getcwd(), "agent")  # agent work dir
    userproxy = None             # unsupported so far (SAGA context issue)

    # The SAGA Job Service URL decides where the pilot job is dispatched.
    # Examples of accepted URLs:
    #   "gram://oliver1.loni.org/jobmanager-pbs"  globus (LONI)
    #   "pbspro://louie1.loni.org"                pbspro scheduler
    #                                             (Futuregrid or LSU machines)
    #   "ssh://louie1.loni.org"                   launch on the target machine,
    #                                             bypassing the scheduler
    #   "pbs-ssh://louie1.loni.org"               scheduler of a remote machine
    #   "xt5torque://localhost"                   torque
    #   "fork://localhost"                        run the jobs on localhost
    # Ensure the respective SAGA adaptor is installed and working.
    lrms_url = "condorg://brgw1.renci.org:2119/jobmanager-pbs"

    # ---- Bring up the pilot ------------------------------------------------
    print("Start Pilot Job/BigJob at: " + lrms_url)
    pilot_job = bigjob(COORDINATION_URL)
    start_args = (lrms_url, None, number_of_processes, queue, project,
                  workingdirectory, userproxy, walltime, processes_per_node)
    pilot_job.start_pilot_job(*start_args)
    pilot_state = str(pilot_job.get_state())
    print("Pilot Job/BigJob URL: " + pilot_job.pilot_url
          + " State: " + pilot_state)

    # ---- Build the sub-job description -------------------------------------
    spec = description()
    spec.executable = "/bin/date"
    spec.number_of_processes = "1"
    spec.spmd_variation = "single"
    spec.arguments = [""]
    spec.output = "bfast-stdout.txt"
    spec.error = "bfast-stderr.txt"

    sub = subjob()
    sub.submit_job(pilot_job.pilot_url, spec)

    # ---- Busy-wait for completion ------------------------------------------
    finished = False
    while not finished:
        state = str(sub.get_state())
        print("state: " + state)
        finished = state in ("Failed", "Done")
        if not finished:
            time.sleep(2)

    # ---- Cleanup -----------------------------------------------------------
    pilot_job.cancel()
def main():
    """Run a ``/bin/cat`` sub-job whose input is staged via Globus Online.

    The agent working directory and both file-transfer sources are ``go://``
    URLs built from ``GLOBUS_ONLINE_USER`` and ``GLOBUS_ONLINE_PASSWORD``.
    ``test.txt`` (next to this file) is transferred once at pilot level
    (target ``BIGJOB_WORK_DIR``) and once at sub-job level (target
    ``SUBJOB_WORK_DIR``). The sub-job state is polled every two seconds until
    it is "Failed" or "Done"; the pilot is then cancelled.
    """
    # ---- Pilot parameters (edit as needed) --------------------------------
    queue = None                 # None: use the default queue
    project = None               # None: use the default allocation
    walltime = 10
    processes_per_node = 4
    number_of_processes = 8
    userproxy = None             # not supported yet (SAGA context issue)

    # URL of the SAGA Job Service used to dispatch the pilot job.
    # Accepted forms include:
    #   "gram://oliver1.loni.org/jobmanager-pbs"  globus (LONI)
    #   "pbspro://louie1.loni.org"                pbspro scheduler
    #                                             (Futuregrid or LSU machines)
    #   "ssh://louie1.loni.org"                   launch on the target machine,
    #                                             bypassing the scheduler
    #   "pbs-ssh://louie1.loni.org"               scheduler of a remote machine
    #   "xt5torque://localhost"                   torque
    # The matching SAGA adaptor must be installed and working.
    lrms_url = "sge-ssh://lonestar.tacc.teragrid.org"

    # NOTE(review): user name and password are embedded verbatim (not
    # URL-escaped) in every go:// URL below.
    credentials = GLOBUS_ONLINE_USER + ":" + GLOBUS_ONLINE_PASSWORD

    def staged_from_here(suffix):
        # go:// source on the "drelu#MacBook" endpoint, rooted at the
        # directory that contains this file.
        return ("go://" + credentials
                + "@globusonline.org?ep=drelu#MacBook&path="
                + os.path.dirname(os.path.abspath(__file__)) + suffix)

    # With Globus Online the working directory must follow this convention.
    workingdirectory = ("go://" + credentials
                        + "@globusonline.org?ep=xsede#lonestar4&path=~/bigjob/")

    # ---- Start the pilot ---------------------------------------------------
    print("Start Pilot Job/BigJob at: " + lrms_url)
    pilot = bigjob(COORDINATION_URL)
    bj_filetransfers = [staged_from_here("/test.txt > BIGJOB_WORK_DIR")]
    pilot.start_pilot_job(
        lrms_url,
        None,
        number_of_processes,
        queue,
        project,
        workingdirectory,
        userproxy,
        walltime,
        processes_per_node,
        bj_filetransfers,
    )
    print("Pilot Job/BigJob URL: " + pilot.pilot_url
          + " State: " + str(pilot.get_state()))

    # ---- Describe and submit the sub-job -----------------------------------
    job_desc = description()
    job_desc.executable = "/bin/cat"
    job_desc.number_of_processes = "1"
    job_desc.spmd_variation = "single"
    job_desc.arguments = ["test.txt"]
    job_desc.output = "stdout.txt"
    job_desc.error = "stderr.txt"
    job_desc.file_transfer = [staged_from_here("/test.txt > SUBJOB_WORK_DIR")]

    task = subjob()
    task.submit_job(pilot.pilot_url, job_desc)

    # ---- Poll until the sub-job reaches a terminal state -------------------
    state = str(task.get_state())
    print("state: " + state)
    while state not in ("Failed", "Done"):
        time.sleep(2)
        state = str(task.get_state())
        print("state: " + state)

    # ---- Cleanup: stop the pilot -------------------------------------------
    pilot.cancel()
def main():
    """Run 24 hostname sub-jobs through one pilot job and wait for all of them.

    The pilot is started at ``lrms_url`` via ``COORDINATION_URL``. All
    sub-jobs share one description. Every two seconds the state of each
    sub-job is printed; the loop ends once every sub-job reports "Failed" or
    "Done", and the pilot is then cancelled.
    """
    # ---- Pilot parameters (edit as needed) --------------------------------
    queue = None                 # None: use the default queue
    project = None               # None: use the default allocation
    walltime = 600
    processes_per_node = 12
    number_of_processes = 24
    workingdirectory = "/lustre/scratch/aluckow/agent"  # agent work dir
    userproxy = None             # not supported yet (SAGA context issue)

    # URL of the SAGA Job Service used to dispatch the pilot job.
    # Accepted forms include:
    #   "gram://oliver1.loni.org/jobmanager-pbs"  globus (LONI)
    #   "pbspro://louie1.loni.org"                pbspro scheduler
    #                                             (Futuregrid or LSU machines)
    #   "ssh://louie1.loni.org"                   launch on the target machine,
    #                                             bypassing the scheduler
    #   "pbs-ssh://louie1.loni.org"               scheduler of a remote machine
    #   "xt5torque://localhost"                   torque
    # The matching SAGA adaptor must be installed and working.
    lrms_url = "xt5torque://localhost"   # run the jobs on localhost

    # ---- Start the pilot ---------------------------------------------------
    print("Start Pilot Job/BigJob at: " + lrms_url)
    bj = bigjob(COORDINATION_URL)
    bj.start_pilot_job(lrms_url,
                       number_of_processes,
                       queue,
                       project,
                       workingdirectory,
                       userproxy,
                       walltime,
                       processes_per_node)
    print("Pilot Job/BigJob URL: " + bj.pilot_url
          + " State: " + str(bj.get_state()))

    # ---- Describe the sub-job ----------------------------------------------
    jd = description()
    jd.executable = "/bin/hostname"
    jd.number_of_processes = "1"
    jd.spmd_variation = "single"
    jd.arguments = [""]
    jd.output = "stdout.txt"
    jd.error = "stderr.txt"

    # ---- Submit the sub-jobs -----------------------------------------------
    sjs = []
    for _ in range(0, 24):
        sj = subjob()
        sj.submit_job(bj.pilot_url, jd)
        sjs.append(sj)

    # ---- Busy-wait for completion of ALL sub-jobs --------------------------
    # The previous loop only looked at the state variable left over from the
    # inner for-loop, so termination depended on a single sub-job (or never
    # happened). Collect one state per sub-job and stop only when every one
    # of them is terminal; otherwise cancel() below would kill running work.
    terminal_states = ("Failed", "Done")
    while True:
        states = [str(sj.get_state()) for sj in sjs]
        for idx, state in enumerate(states):
            print("sj: %d state: %s" % (idx, state))
        if all(state in terminal_states for state in states):
            break
        time.sleep(2)

    # ---- Cleanup: stop the pilot -------------------------------------------
    bj.cancel()
##########################################################################################
# Settings for the first pilot (BigJob).
# Make sure you are familiar with the queue structure on FutureGrid, the
# processes per node (ppn), your project id and the walltime limits of each
# queue; change the values below accordingly.
#
RESOURCEMGR_URL = "pbs-ssh://username@Target_machines_host_name"
# Fixed: the assignment operator was missing here (was: queue"normal").
queue = "normal"  # Queue the BigJob is submitted to; None selects the default queue.
project = None  # Allocation information
walltime = 30  # Time in minutes. There are limits on the time you can request.
processes_per_node = 8  # ppn
number_of_processes = 12  # Total number of processes (BigJob size) used to run SubJobs
workingdirectory = os.path.join(os.getcwd(), "agent")  # Working directory for the agent.
##########################################################################################
# Start the first pilot at RESOURCEMGR_URL, coordinated through COORDINATION_URL.
# NOTE(review): the two positional None arguments are presumably the agent
# executable and the user proxy (cf. other start_pilot_job callers) - confirm.
print("Start Pilot Job/BigJob at: " + RESOURCEMGR_URL)
bj1 = bigjob(COORDINATION_URL)
bj1.start_pilot_job(RESOURCEMGR_URL,
                    None,
                    number_of_processes,
                    queue,
                    project,
                    workingdirectory,
                    None,
                    walltime,
                    processes_per_node)
print("Pilot Job/BigJob URL: " + bj1.pilot_url + " State: " + str(bj1.get_state()))
##########################################################################################
# Start 2nd BigJob
##########################################################################################
return False """ Test Job Submission via Advert """ if __name__ == "__main__": starttime=time.time() if len(sys.argv)==2: reconnect_url=sys.argv[1] else: print "Usage: " + sys.executable + " " + __file__ + " <BigJob URL to Reconnect to>" sys.exit(-1) print "Reconnect to Pilot Job/BigJob at: " + reconnect_url bj = bigjob(COORDINATION_URL, reconnect_url) print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state() + " Number of SJs: " + str(len(bj.list_subjobs())) ) ########################################################################################## # Submit some more subjobs if bj.get_state()!="Done": jobs = [] job_start_times = {} job_states = {} for i in range(0, NUMBER_JOBS): jd = description() jd.executable = "/bin/date" jd.number_of_processes = "1" jd.spmd_variation = "single"
The following URLs are accepted: lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI) lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines) lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system. lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine. lrms_url = "xt5torque://localhost" # torque resource url. Please ensure that the respective SAGA adaptor is installed and working """ lrms_url = "fork://localhost" ########################################################################################## print "Start Pilot Job/BigJob at: " + lrms_url bj = bigjob(COORDINATION_URL) bj.start_pilot_job( lrms_url, number_of_processes, queue, project, workingdirectory, userproxy, walltime, processes_per_node) print "Pilot Job/BigJob URL: " + bj.get_url() + " State: " + str(bj.get_state()) ########################################################################################## # Submit SubJob through BigJob jobs = [] job_start_times = {}
t = t + 10 temps.append(temp) ################################################################################## # Start BigJob # Parameter for BigJob bigjob_agent = os.getcwd() + "/bigjob_agent_launcher.sh" # path to agent #bigjob_agent = "/bin/echo" nodes = BIGJOB_SIZE # number nodes for agent workingdirectory = os.getcwd() + "/agent" # working directory for agent userproxy = None # userproxy (not supported yet due to context issue w/ SAGA) bjs = [] i = 0 for i in range(0, NUMBER_BIGJOBS): bj = bigjob.bigjob(advert_host) bjs.append(bj) if (i == 0): lrms_url = "gram://" + HOST + "/jobmanager-pbs" elif (i == 1): lrms_url = "gram://" + REMOTE1 + "/jobmanager-pbs" elif (i == 2): lrms_url = "gram://" + REMOTE2 + "/jobmanager-pbs" else: lrms_url = "gram://" + REMOTE3 + "/jobmanager-pbs" bjs[i].start_pilot_job(lrms_url, bigjob_agent, nodes, None, None, workingdirectory, userproxy, 150) print "Start Pilot Job/BigJob: " + bigjob_agent + " at: " + lrms_url print "Pilot Job/BigJob URL: " + bjs[i].pilot_url + " State: " + str( bjs[i].get_state()) print "####################" + time.asctime(time.localtime(time.time(
def main():
    """Start one pilot job, run a single echo sub-job in it, then cancel it.

    All parameters are hard-coded below and meant to be edited in place.
    The pilot is started through ``bigjob(COORDINATION_URL)``; the sub-job
    runs ``/bin/echo $HELLOWORLD`` and is polled every two seconds until it
    reports ``"Done"`` or ``"Failed"``.

    The pilot is cancelled in a ``finally`` clause so that an error (or a
    Ctrl-C) during submission or polling does not leave it running.
    """
    # Start BigJob

    ##########################################################################
    # Edit parameters for BigJob
    queue = "normal"  # if None default queue is used
    project = None  # if None default allocation is used
    walltime = 10
    processes_per_node = 4
    number_of_processes = 8
    #workingdirectory=os.path.join(os.getcwd(), "agent")  # working directory for agent
    workingdirectory = "agent"
    userproxy = None  # userproxy (not supported yet due to context issue w/ SAGA)

    # URL of the SAGA Job Service that is used to dispatch the pilot job.
    # The following URLs are accepted:
    #
    # lrms_url = "gram://oliver1.loni.org/jobmanager-pbs"  # globus resource url used when globus is used. (LONI)
    # lrms_url = "pbspro://louie1.loni.org"   # pbspro resource url used when pbspro scheduling system is used. (Futuregrid or LSU Machines)
    # lrms_url = "ssh://louie1.loni.org"      # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system.
    # lrms_url = "pbs-ssh://louie1.loni.org"  # Submit jobs to scheduling system of remote machine.
    # lrms_url = "xt5torque://localhost"      # torque resource url.
    #
    # Please ensure that the respective SAGA adaptor is installed and working
    lrms_url = "fork://localhost"  # resource url to run the jobs on localhost
    #lrms_url = "sge://localhost"  # resource url to run the jobs on localhost
    #lrms_url = "ssh://localhost"  # resource url to run the jobs on localhost

    ##########################################################################

    print("Start Pilot Job/BigJob at: " + lrms_url)
    bj = bigjob(COORDINATION_URL)
    bj.start_pilot_job(lrms_url,
                       number_of_processes,
                       queue,
                       project,
                       workingdirectory,
                       userproxy,
                       walltime,
                       processes_per_node)

    # From here on a pilot exists; make sure it is cancelled whatever happens.
    try:
        print("Pilot Job/BigJob URL: " + bj.pilot_url
              + " State: " + str(bj.get_state()))

        ######################################################################
        # Submit SubJob through BigJob
        jd = description()
        jd.executable = "/bin/echo"
        #jd.executable = "$HOME/hello.sh"
        jd.number_of_processes = "1"
        jd.arguments = ["$HELLOWORLD"]
        jd.environment = ['HELLOWORLD=hello_world']
        #jd.spmd_variation = "mpi"

        # specify an optional working directory if sub-job should be executed
        # outside of bigjob sandbox
        #jd.working_directory = "/tmp"
        jd.output = "stdout.txt"
        jd.error = "stderr.txt"

        sj = subjob()
        sj.submit_job(bj.pilot_url, jd)

        #########################################
        # busy wait for completion
        while 1:
            state = str(sj.get_state())
            print("state: " + state)
            if state == "Failed" or state == "Done":
                break
            time.sleep(2)
    finally:
        ######################################################################
        # Cleanup - stop BigJob
        bj.cancel()
def main():
    """Start a pilot job on ``ssh://boskop`` and run one echo sub-job in it.

    All parameters are hard-coded below. The sub-job runs
    ``/bin/echo $HELLOWORLD`` and is polled every two seconds until it
    reports ``"Done"`` or ``"Failed"``; the pilot is then cancelled.

    NOTE(review): the body below is laid out with the whole
    start/submit/wait/cancel sequence inside the ``for i in range(99999)``
    loop, i.e. as a stress test that repeats the example. That reading is
    inferred from the commented-out loop just above it -- confirm the
    intended loop extent before relying on it.
    """
    # Start BigJob

    ##########################################################################
    # Edit parameters for BigJob
    queue = "normal"  # if None default queue is used
    project = None  # if None default allocation is used
    walltime = 10
    processes_per_node = 4
    number_of_processes = 8
    #workingdirectory=os.path.join(os.getcwd(), "agent")  # working directory for agent
    workingdirectory = "agent"
    userproxy = None  # userproxy (not supported yet due to context issue w/ SAGA)

    # The bare string below is a no-op statement used as an inline note.
    """
    URL of the SAGA Job Service that is used to dispatch the pilot job.
    The following URLs are accepted:

    lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI)
    lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines)
    lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system.
    lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine.
    lrms_url = "xt5torque://localhost" # torque resource url.

    Please ensure that the respective SAGA adaptor is installed and working
    """
    lrms_url = "ssh://boskop"
    #lrms_url = "sge://localhost"
    #lrms_url = "fork://localhost"

    ##########################################################################

    # Earlier variant of the stress loop that talked to SAGA directly:
    # for i in range(99999):
    #     js = saga.job.Service (lrms_url)
    #     j = js.run_job ("/bin/sleep 1000")
    #     print "%4d: %s" % (i, j.state)

    for i in range(99999):
        # iteration counter, printed so progress is visible in the output
        print i

        print "Start Pilot Job/BigJob at: " + lrms_url
        bj = bigjob(COORDINATION_URL)
        bj.start_pilot_job(lrms_url, number_of_processes, queue, project,
                           workingdirectory, userproxy, walltime,
                           processes_per_node)

        print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(
            bj.get_state())

        ######################################################################
        # Submit SubJob through BigJob
        jd = description()
        jd.executable = "/bin/echo"
        #jd.executable = "$HOME/hello.sh"
        jd.number_of_processes = "1"
        jd.arguments = ["$HELLOWORLD"]
        jd.environment = ['HELLOWORLD=hello_world']
        #jd.spmd_variation = "mpi"

        # specify an optional working directory if sub-job should be executed
        # outside of bigjob sandbox
        #jd.working_directory = "/tmp"
        jd.output = "stdout.txt"
        jd.error = "stderr.txt"

        sj = subjob()
        sj.submit_job(bj.pilot_url, jd)

        #########################################
        # busy wait for completion: poll the sub-job state every 2 seconds
        while 1:
            state = str(sj.get_state())
            print "state: " + state
            if (state == "Failed" or state == "Done"):
                break
            time.sleep(2)

        ######################################################################
        # Cleanup - stop BigJob
        bj.cancel()
def main():
    """Run ``/bin/cat test.txt`` as a sub-job of a Condor pilot job.

    The pilot is started on ``condor://localhost`` with one file transfer
    that stages ``../test.txt`` (relative to this script) as ``test.txt``.
    The sub-job and the pilot state are printed every two seconds until the
    sub-job reports ``"Done"`` or ``"Failed"``; the pilot is then cancelled.
    """
    # --- Pilot (BigJob) parameters; edit as needed -------------------------
    queue = None    # None selects the default queue
    project = None  # None selects the default allocation
    walltime = 10
    processes_per_node = 4
    number_of_processes = 8
    workingdirectory = os.path.join(os.getcwd(), "agent")
    userproxy = None  # not supported yet due to a context issue with SAGA

    # URL of the SAGA Job Service that is used to dispatch the pilot job.
    # Accepted forms include:
    #   "gram://oliver1.loni.org/jobmanager-pbs"  globus (LONI)
    #   "pbspro://louie1.loni.org"                pbspro scheduler (Futuregrid or LSU machines)
    #   "ssh://louie1.loni.org"                   launch on the target machine, no scheduler
    #   "pbs-ssh://louie1.loni.org"               submit to the remote machine's scheduler
    #   "xt5torque://localhost"                   torque
    # Please ensure that the respective SAGA adaptor is installed and working.
    lrms_url = "condor://localhost"

    # --- Stage the input file next to the agent -----------------------------
    script_dir = os.path.dirname(os.path.abspath(__file__))
    input_file = os.path.join(script_dir, "..", "test.txt")
    bj_filetransfers = [input_file + " > test.txt"]

    # --- Start the pilot ----------------------------------------------------
    print("Start Pilot Job/BigJob at: " + lrms_url)
    pilot = bigjob(COORDINATION_URL)
    pilot.start_pilot_job(
        lrms_url,
        None,
        number_of_processes,
        queue,
        project,
        workingdirectory,
        userproxy,
        walltime,
        processes_per_node,
        bj_filetransfers,
    )
    pilot_state = str(pilot.get_state())
    print("Pilot Job/BigJob URL: " + pilot.pilot_url + " State: " + pilot_state)

    # --- Describe and submit the sub-job ------------------------------------
    job_desc = description()
    job_desc.executable = "/bin/cat"
    job_desc.number_of_processes = "1"
    job_desc.spmd_variation = "single"
    job_desc.arguments = ["test.txt"]
    #job_desc.working_directory = ""
    job_desc.output = "sj-stdout.txt"
    job_desc.error = "sj-stderr.txt"

    sub_job = subjob()
    sub_job.submit_job(pilot.pilot_url, job_desc)

    # --- Busy wait until the sub-job reaches a final state ------------------
    while True:
        state = str(sub_job.get_state())
        bj_state = pilot.get_state()
        print("bj state: " + str(bj_state) + " state: " + state)
        if state in ("Failed", "Done"):
            break
        time.sleep(2)

    # --- Cleanup: stop the pilot --------------------------------------------
    pilot.cancel()
return False """ Test Job Submission via Advert """ if __name__ == "__main__": starttime=time.time() if len(sys.argv)==2: reconnect_url=sys.argv[1] else: print "Usage: " + sys.executable + " " + __file__ + " <BigJob URL to Reconnect to>" sys.exit(-1) print "Reconnect to Pilot Job/BigJob at: " + reconnect_url bj = bigjob(pilot_url=reconnect_url) print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state() + " Number of SJs: " + str(len(bj.list_subjobs())) ) ########################################################################################## # Submit some more subjobs if bj.get_state()!="Done": jobs = [] job_start_times = {} job_states = {} for i in range(0, NUMBER_JOBS): jd = description() jd.executable = "/bin/date" jd.number_of_processes = "1" jd.spmd_variation = "single"
# ################################################################################### # TG/LONI Pilot Job # Parameter for BigJob re_agent = "/home/luckow/src/bigjob/bigjob_agent_launcher.sh" # path to agent nodes = 8 # number nodes for agent lrms_url = "gram://poseidon1.loni.org/jobmanager-pbs" # resource url #lrms_url = "gram://qb1.loni.org/jobmanager-fork" # resource url project = "" #allocation queue = "checkpt" # queue (PBS) workingdirectory="/home/luckow/" # working directory userproxy = None # userproxy (not supported yet due to context issue w/ SAGA) bj_tg = bigjob.bigjob(advert_host) if NUMBER_JOBS_GRID>0: print "Start Pilot Job/BigJob in TG at: " + lrms_url bj_tg.start_pilot_job(lrms_url, re_agent, nodes, queue, project, workingdirectory, userproxy, "20") ########################################################################################## # Submit SubJob through BigJob # NAMD command: $NAMD_HOME/charmrun ++verbose ++remote-shell ssh ++nodelist nodefile +p4 /usr/local/namd2/namd2 NPT.conf # working directory: $HOME/run
def _bfast_job_description(resource, number_of_processes, environment=None):
    """Build the job description for one ``bfast match`` sub-job of *resource*."""
    jd = saga.job.description()
    #jd.executable = "/bin/date"
    # `time` is the executable; the bfast command line is passed as its arguments
    jd.executable = "time"
    jd.number_of_processes = number_of_processes
    jd.spmd_variation = "single"
    jd.arguments = [resource["bfast_exe"], "match",
                    "-f", resource["bfast_ref_genome"],
                    "-A 1",
                    "-r", resource["bfast_reads"],
                    "-n", "1",
                    "-T", resource["bfast_tmp_dir"]]
    if environment is not None:
        jd.environment = environment
    jd.output = "bfast-stdout.txt"
    jd.error = "bfast-stderr.txt"
    return jd


def _submit_bfast_subjob(bj, jd, job_start_times, job_states):
    """Submit *jd* to pilot *bj* and record its start time and initial state."""
    sj = subjob()
    sj.submit_job(bj.pilot_url, jd)
    job_start_times[sj] = time.time()
    job_states[sj] = sj.get_state()
    return sj


def load_test(bfast_jobs, run_id=0):
    """Run one BFAST load test across several pilot jobs and report timings.

    For every entry of *bfast_jobs* a pilot job is started; once
    ``get_bj_states`` reports all of them running, the configured sub-jobs
    are submitted and polled every two seconds until all have finished.
    Finally every pilot is cancelled.

    Arguments:
    bfast_jobs -- dict mapping a resource name to a specification dict.
                  Keys read: "number_nodes", "number_cores_per_node",
                  "number_subjobs", "lrms_url", "working_directory",
                  "bfast_exe", "bfast_ref_genome", "bfast_reads",
                  "bfast_tmp_dir", optionally "bfast_library_path", and
                  "number_aprun_subjobs" for the resource named "kraken".
                  Keys written: "bigjob", "queueing_time", "subjobs" and
                  "completion_time".
    run_id     -- number identifying this run in the printed/returned rows.

    Returns a string with one comma-separated line per resource:
    run id, pilot, #nodes, #cores/node, #jobs, queueing time, pilot
    completion time, total runtime, coordination URL, LRMS URL.
    """
    starttime = time.time()

    print("\n**************************************************************************************************************************************************\n")
    print("START TEST %d Specification: %s" % (run_id, str(bfast_jobs)))
    print("\n**************************************************************************************************************************************************\n")

    for resource in bfast_jobs.values():
        number_nodes = resource["number_nodes"]
        number_cores_per_node = resource["number_cores_per_node"]
        number_jobs = resource["number_subjobs"]
        lrms_url = resource["lrms_url"]
        workingdirectory = resource["working_directory"]
        print("BJ: %s- #nodes:%d, #cores/node:%d, #jobs: %d, coordination-url:%s, lrms-url:%s" %
              (lrms_url, number_nodes, number_cores_per_node, number_jobs,
               COORDINATION_URL, lrms_url))

        # start pilot job (bigjob_agent)
        print("Start Pilot Job/BigJob at: " + lrms_url)
        bj = bigjob(COORDINATION_URL)
        bj.start_pilot_job(lrms_url=lrms_url,
                           number_nodes=number_nodes,
                           processes_per_node=number_cores_per_node,
                           working_directory=workingdirectory)

        # reset all per-run bookkeeping; a "completion_time" left over from
        # an earlier run on the same dict would otherwise never be replaced
        resource["bigjob"] = bj
        resource["queueing_time"] = None
        resource["subjobs"] = []
        resource.pop("completion_time", None)

    # Wait until every pilot is reported as running. The explicit comparison
    # is kept because the helper's return type is not visible here.
    while get_bj_states(bfast_jobs, starttime) == False:
        time.sleep(1)

    ##########################################################################
    # Submit SubJob through BigJob
    job_start_times = {}
    job_states = {}
    for name, resource in bfast_jobs.items():
        bj = resource["bigjob"]
        subjobs = resource["subjobs"]
        if name == "kraken":
            # kraken gets a single two-process sub-job; the sub-job count is
            # handed over through the environment instead
            environment = ["NUMBER_SUBJOBS=" + str(resource["number_aprun_subjobs"])]
            jd = _bfast_job_description(resource, "2", environment)
            subjobs.append(_submit_bfast_subjob(bj, jd, job_start_times, job_states))
        else:
            environment = None
            if "bfast_library_path" in resource:
                environment = ["LD_LIBRARY_PATH=" + resource["bfast_library_path"]]
            for _ in range(0, resource["number_subjobs"]):
                jd = _bfast_job_description(resource, "1", environment)
                subjobs.append(_submit_bfast_subjob(bj, jd, job_start_times, job_states))
        # NOTE(review): refresh point reconstructed; the wait loop below
        # refreshes again on every iteration anyway
        get_bj_states(bfast_jobs, starttime)

    # busy wait for completion
    total_number_subjobs = sum(r["number_subjobs"] for r in bfast_jobs.values())

    while 1:
        get_bj_states(bfast_jobs, starttime)
        # finished sub-jobs summed over ALL pilots; the per-pilot counter
        # alone can never reach the overall total with more than one pilot
        total_finished = 0
        for resource in bfast_jobs.values():
            bj = resource["bigjob"]
            number_subjobs_in_bigjob = resource["number_subjobs"]
            finish_counter = 0
            result_map = {"Failed": 0, "Done": 0}
            for sj in resource["subjobs"]:  # iterate through sub-jobs
                old_state = job_states[sj]
                state = sj.get_state()
                result_map[state] = result_map.get(state, 0) + 1
                finished = has_finished(state) == True
                if old_state != state:
                    print("Job " + str(sj) + " changed from: " + old_state + " to " + state)
                    if finished:
                        job_runtime = time.time() - job_start_times[sj]
                        print("Job: " + str(sj) + " Runtime: " + str(job_runtime) + " s.")
                if finished:
                    finish_counter = finish_counter + 1
                job_states[sj] = state

            print("BJ: " + str(bj) + " Result: " + str(result_map))
            all_done = number_subjobs_in_bigjob == result_map["Done"] + result_map["Failed"]
            if all_done and "completion_time" not in resource:
                # record only the first moment this pilot's jobs were all done
                bj_completion_time = time.time() - starttime
                resource["completion_time"] = bj_completion_time
                print("BJ: " + str(bj) + " Result: " + str(result_map) + " Time: " + str(bj_completion_time))
            print("BJ: %s State: %s; %d/%d jobs finished" %
                  (bj, bj.get_state_detail(), finish_counter, number_subjobs_in_bigjob))
            total_finished = total_finished + finish_counter

        print("%d/%d finished" % (total_finished, total_number_subjobs))
        if total_finished == total_number_subjobs:
            break
        time.sleep(2)

    runtime = time.time() - starttime
    result_lines = []
    for resource in bfast_jobs.values():
        bj = resource["bigjob"]
        number_nodes = resource["number_nodes"]
        number_cores_per_node = resource["number_cores_per_node"]
        number_jobs = resource["number_subjobs"]
        bj_runtime = resource["completion_time"]
        queueing_time = resource["queueing_time"]
        lrms_url = resource["lrms_url"]
        print("Run\tBJ\t#Nodes\t#cores/node\t#jobs\tQueuing Time\tBJ Runtime\tTotal Runtime\tCoordination URL\tLRMS URL")
        result_tuple = (run_id, str(bj), number_nodes, number_cores_per_node,
                        number_jobs, str(queueing_time), str(bj_runtime),
                        str(runtime), COORDINATION_URL, lrms_url)
        result_lines.append("%d,%s,%d,%d,%d,%s,%s,%s,%s,%s\n" % result_tuple)
        print("%d\t%s\t%d\t%d\t%d\t%s\t%s\t%s,\t%s\t%s" % result_tuple)

    # Cleanup - stop BigJob
    for resource in bfast_jobs.values():
        resource["bigjob"].cancel()

    # hack: delete manually pbs jobs of user
    #os.system("qstat -u `whoami` | grep -o ^[0-9]* |xargs qdel")
    return "".join(result_lines)