def start_pilot_job(self, lrms_url, bigjob_agent_executable=None, number_nodes=1, queue=None, project=None, working_directory=None, userproxy=None, walltime=None, processes_per_node=1, filetransfers=None):
    """Start a batch (pilot) job at a resource manager via the SAGA Job API.

    Currently supported resource managers (selected by URL scheme):
        fork://localhost/                   (default job adaptor)
        gram://qb1.loni.org/jobmanager-pbs  (Globus adaptor)
        pbspro://localhost                  (PBS Pro adaptor)
        ssh:// / pbs-ssh://                 (SSH-based adaptors)
        condorg://                          (Condor-G bootstrap)

    Parameters:
        lrms_url                -- resource manager URL; the scheme picks the adaptor
        bigjob_agent_executable -- unused here; kept for backward compatibility
        number_nodes            -- number of nodes to request
        queue                   -- batch queue name (optional)
        project                 -- allocation/project to charge (optional)
        working_directory       -- directory for agent stdout/stderr (optional)
        userproxy               -- X.509 proxy file for Globus submission (optional)
        walltime                -- walltime in minutes (must be set for pbs-ssh)
        processes_per_node      -- processes to start per node
        filetransfers           -- input files to stage into the BigJob sandbox

    Raises:
        BigJobError if a pilot job is already active on this instance.
    """
    if self.job is not None:
        # fixed: removed unreachable "return" that followed this raise
        raise BigJobError("One BigJob already active. Please stop BigJob first.")

    ##############################################################################
    # Initialization of the coordination and communication subsystem:
    # register this pilot with the coordination backend in state Unknown.
    lrms_saga_url = saga.url(lrms_url)
    self.pilot_url = self.app_url + ":" + lrms_saga_url.host
    pilot_url_dict[self.pilot_url] = self
    logger.debug("create pilot job entry on backend server: " + self.pilot_url)
    self.coordination.set_pilot_state(self.pilot_url, str(Unknown), False)
    logger.debug("set pilot state to: " + str(Unknown))
    ##############################################################################
    self.number_nodes = int(number_nodes)

    # create job description
    jd = saga.job.description()
    logger.debug("Adaptor specific modifications: " + str(lrms_saga_url.scheme))
    if lrms_saga_url.scheme == "condorg":
        # Condor-G: run the condor bootstrap script directly; it receives the
        # coordination address (-a) and the pilot URL (-b) as arguments.
        jd.arguments = ["-a", self.coordination.get_address(), "-b", self.pilot_url]
        # fixed: use lazy %-style logging args (the original passed several
        # positional strings, which the logging module cannot format)
        logger.debug("condor-g agent arguments: -a %s -b %s",
                     self.coordination.get_address(), self.pilot_url)
        agent_exe = os.path.abspath(os.path.join(os.getcwd(), "..", "bootstrap", "bigjob-condor-bootstrap.py"))
        logger.debug(agent_exe)
        jd.executable = agent_exe
    else:
        # All other adaptors run a generated bootstrap script via "python -c";
        # the script has to be escaped for the chosen submission backend.
        bootstrap_script = self.generate_bootstrap_script(self.coordination.get_address(), self.pilot_url)
        if lrms_saga_url.scheme == "gram":
            bootstrap_script = self.escape_rsl(bootstrap_script)
        elif lrms_saga_url.scheme == "pbspro":
            bootstrap_script = self.escape_pbs(bootstrap_script)
        elif lrms_saga_url.scheme == "ssh":
            bootstrap_script = self.escape_ssh(bootstrap_script)
        ############ submit pbs script which launches bigjob agent using ssh adaptors ##########
        elif lrms_saga_url.scheme == "pbs-ssh":
            # change the url scheme ssh to use ssh adaptors to launch job
            bootstrap_script = self.escape_ssh(bootstrap_script)
            ### convert walltime in minutes to the PBS HH:MM:SS representation ###
            # fixed: floor division keeps integer semantics on Python 2 and 3
            # (walltime must be an int number of minutes here)
            hrs = walltime // 60
            minu = walltime % 60
            walltimepbs = "" + str(hrs) + ":" + str(minu) + ":00"
            # request whole nodes: round the node count up
            if number_nodes % processes_per_node == 0:
                number_nodes = number_nodes // processes_per_node
            else:
                number_nodes = (number_nodes // processes_per_node) + 1
            pbssshj = pbsssh(bootstrap_script, lrms_saga_url, walltimepbs, number_nodes,
                             processes_per_node, userproxy, working_directory)
            self.job = pbssshj
            self.job.run()
            # pbs-ssh submission is handled entirely by the pbsssh plugin
            return
        elif is_bliss:
            bootstrap_script = self.escape_bliss(bootstrap_script)
        #logger.debug(bootstrap_script)
        if not is_bliss:
            jd.number_of_processes = str(number_nodes)
            jd.processes_per_host = str(processes_per_node)
        else:
            # BLISS uses a different attribute model (total CPU count)
            jd.TotalCPUCount = str(int(number_nodes) * int(processes_per_node))
        jd.spmd_variation = "single"
        #jd.arguments = [bigjob_agent_executable, self.coordination.get_address(), self.pilot_url]
        jd.arguments = ["-c", bootstrap_script]
        jd.executable = "python"

    if queue is not None:
        jd.queue = queue
    if project is not None:
        jd.job_project = [project]
    if walltime is not None:
        jd.wall_time_limit = str(walltime)

    # XXX Isn't the working directory about the remote site?
    if working_directory is not None:
        # for local (fork) submission, create the directory if it is missing
        if not os.path.isdir(working_directory) and lrms_saga_url.scheme == "fork":
            os.mkdir(working_directory)
        self.working_directory = working_directory
    else:
        # fall back to the (local) home directory
        self.working_directory = os.path.expanduser("~")
    jd.working_directory = self.working_directory
    logger.debug("Working directory: " + jd.working_directory)
    jd.output = os.path.join(self.__get_bigjob_working_dir(), "stdout-bigjob_agent.txt")
    jd.error = os.path.join(self.__get_bigjob_working_dir(), "stderr-bigjob_agent.txt")

    # Stage BJ input files into the remote sandbox (reached via ssh)
    bigjob_working_directory_url = "ssh://" + lrms_saga_url.host + self.__get_bigjob_working_dir()
    self.__stage_files(filetransfers, bigjob_working_directory_url)

    # Submit job, optionally with an explicit X.509 proxy context
    js = None
    if userproxy is not None and userproxy != '':
        s = saga.session()
        os.environ["X509_USER_PROXY"] = userproxy
        ctx = saga.context("x509")
        ctx.set_attribute("UserProxy", userproxy)
        s.add_context(ctx)
        logger.debug("use proxy: " + userproxy)
        js = saga.job.service(s, lrms_saga_url)
    else:
        logger.debug("use standard proxy")
        js = saga.job.service(lrms_saga_url)
    self.job = js.create_job(jd)
    logger.debug("Submit pilot job to: " + str(lrms_saga_url))
    self.job.run()
def start_pilot_job(self, lrms_url, bigjob_agent_executable=None, number_nodes=1, queue=None, project=None, working_directory=None, userproxy=None, walltime=None, processes_per_node=1, filetransfers=None):
    """Start a batch (pilot) job at a resource manager via the SAGA Job API.

    Currently supported resource managers (selected by URL scheme):
        fork://localhost/                   (default job adaptor)
        gram://qb1.loni.org/jobmanager-pbs  (Globus adaptor)
        pbspro://localhost                  (PBS Pro adaptor)
        xt5torque:// / torque://            (Torque adaptors)
        ssh:// / pbs-ssh:// / sge-ssh://    (SSH-based adaptors)
        condorg://                          (Condor-G bootstrap)

    Parameters:
        lrms_url                -- resource manager URL; the scheme picks the adaptor
        bigjob_agent_executable -- unused here; kept for backward compatibility
        number_nodes            -- number of nodes to request
        queue                   -- batch queue name (optional)
        project                 -- allocation/project to charge (optional)
        working_directory       -- directory for agent stdout/stderr (optional)
        userproxy               -- X.509 proxy file for Globus submission (optional)
        walltime                -- walltime in minutes (optional)
        processes_per_node      -- processes to start per node
        filetransfers           -- input files to stage into the BigJob sandbox

    Returns:
        The pilot URL of the started BigJob.

    Raises:
        BigJobError if a pilot job is already active on this instance.
    """
    if self.job is not None:
        # fixed: removed unreachable "return" that followed this raise
        raise BigJobError("One BigJob already active. Please stop BigJob first.")

    ##############################################################################
    # Initialization of the coordination and communication subsystem:
    # register this pilot with the coordination backend in state Unknown.
    lrms_saga_url = saga.url(lrms_url)
    self.pilot_url = self.app_url + ":" + lrms_saga_url.host
    pilot_url_dict[self.pilot_url] = self
    logger.debug("create pilot job entry on backend server: " + self.pilot_url)
    self.coordination.set_pilot_state(self.pilot_url, str(Unknown), False)
    logger.debug("set pilot state to: " + str(Unknown))
    ##############################################################################
    self.number_nodes = int(number_nodes)

    # create job description
    jd = saga.job.description()

    # XXX Isn't the working directory about the remote site?
    # Yes, it is: this also makes sure the directory exists for fork (local) jobs.
    if working_directory is not None:
        if not os.path.isdir(working_directory) and lrms_saga_url.scheme == "fork":
            os.mkdir(working_directory)
        self.working_directory = working_directory
    else:
        # if no working dir is set assume use home directory
        # will fail if home directory is not the same on remote machine
        # but this is just a guess to avoid failing
        self.working_directory = os.path.expanduser("~")

    # Stage BJ input files: build the target url
    # (this will also create the remote directory for the BJ)
    if lrms_saga_url.username is not None and lrms_saga_url.username != "":
        bigjob_working_directory_url = "ssh://" + lrms_saga_url.username + "@" + lrms_saga_url.host + self.__get_bigjob_working_dir()
    else:
        bigjob_working_directory_url = "ssh://" + lrms_saga_url.host + self.__get_bigjob_working_dir()

    # determine working directory of bigjob:
    # if a remote sandbox can be created via ssh => create an own dir for each bj job id,
    # otherwise use the specified working directory
    if self.__create_remote_directory(bigjob_working_directory_url):
        self.working_directory = self.__get_bigjob_working_dir()
        self.__stage_files(filetransfers, bigjob_working_directory_url)
    else:
        # fixed: complete, balanced warning message; warning() instead of the
        # deprecated warn() alias
        logger.warning("File staging disabled: SSH (incl. password-less authentication) is required.")
    logger.debug("BJ Working Directory: %s", self.working_directory)

    logger.debug("Adaptor specific modifications: " + str(lrms_saga_url.scheme))
    if lrms_saga_url.scheme == "condorg":
        # Condor-G: run the condor bootstrap script directly with the
        # coordination address and pilot URL as arguments.
        jd.arguments = [self.coordination.get_address(), self.pilot_url]
        agent_exe = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "bootstrap", "bigjob-condor-bootstrap.py"))
        # fixed: use a lazy %-style logging arg (the original passed two
        # positional strings, which the logging module cannot format)
        logger.debug("agent_exe: %s", agent_exe)
        jd.executable = agent_exe
    else:
        # All other adaptors run a generated bootstrap script via "python -c";
        # the script has to be escaped for the chosen submission backend.
        bootstrap_script = self.generate_bootstrap_script(self.coordination.get_address(), self.pilot_url)
        if lrms_saga_url.scheme == "gram":
            bootstrap_script = self.escape_rsl(bootstrap_script)
        elif lrms_saga_url.scheme == "pbspro" or lrms_saga_url.scheme == "xt5torque" or lrms_saga_url.scheme == "torque":
            bootstrap_script = self.escape_pbs(bootstrap_script)
        elif lrms_saga_url.scheme == "ssh":
            bootstrap_script = self.escape_ssh(bootstrap_script)
        ############ submit pbs script which launches bigjob agent using ssh adaptors ##########
        elif lrms_saga_url.scheme == "pbs-ssh":
            bootstrap_script = self.escape_ssh(bootstrap_script)
            # PBS specific BJ plugin
            # NOTE(review): self.working_directory is passed twice — looks
            # suspicious; confirm against the pbsssh constructor signature
            pbssshj = pbsssh(bootstrap_script, lrms_saga_url, walltime, number_nodes,
                             processes_per_node, userproxy, self.working_directory,
                             self.working_directory)
            self.job = pbssshj
            self.job.run()
            # fixed: return the pilot URL for consistency with the other paths
            return self.pilot_url
        ############ submit sge script which launches bigjob agent using ssh adaptors ##########
        elif lrms_saga_url.scheme == "sge-ssh":
            bootstrap_script = self.escape_ssh(bootstrap_script)
            # SGE specific BJ plugin
            # NOTE(review): self.working_directory is passed twice — looks
            # suspicious; confirm against the sgessh constructor signature
            sgesshj = sgessh(bootstrap_script, lrms_saga_url, walltime, number_nodes,
                             processes_per_node, userproxy, project, queue,
                             self.working_directory, self.working_directory)
            self.job = sgesshj
            self.job.run()
            # fixed: return the pilot URL for consistency with the other paths
            return self.pilot_url
        elif is_bliss:
            bootstrap_script = self.escape_bliss(bootstrap_script)
        #logger.debug(bootstrap_script)
        if not is_bliss:
            jd.number_of_processes = str(number_nodes)
            jd.processes_per_host = str(processes_per_node)
        else:
            # BLISS uses a different attribute model (total CPU count)
            jd.TotalCPUCount = str(int(number_nodes) * int(processes_per_node))
        jd.spmd_variation = "single"
        #jd.arguments = [bigjob_agent_executable, self.coordination.get_address(), self.pilot_url]
        jd.arguments = ["python", "-c", bootstrap_script]
        jd.executable = "/usr/bin/env"

    if queue is not None:
        jd.queue = queue
    if project is not None:
        jd.job_project = [project]
    if walltime is not None:
        jd.wall_time_limit = str(walltime)

    jd.working_directory = self.working_directory
    logger.debug("Working directory: " + jd.working_directory)
    jd.output = os.path.join(self.working_directory, "stdout-bigjob_agent.txt")
    jd.error = os.path.join(self.working_directory, "stderr-bigjob_agent.txt")

    # Submit job, optionally with an explicit X.509 proxy context
    js = None
    if userproxy is not None and userproxy != '':
        s = saga.session()
        os.environ["X509_USER_PROXY"] = userproxy
        ctx = saga.context("x509")
        ctx.set_attribute("UserProxy", userproxy)
        s.add_context(ctx)
        logger.debug("use proxy: " + userproxy)
        js = saga.job.service(s, lrms_saga_url)
    else:
        logger.debug("use standard proxy")
        js = saga.job.service(lrms_saga_url)
    logger.debug("Creating pilot job with description: %s" % str(jd))
    self.job = js.create_job(jd)
    logger.debug("Submit pilot job to: " + str(lrms_saga_url))
    self.job.run()
    return self.pilot_url