def __get_pilot_url(self, job_url):
    """Derive the pilot URL from a (sub-)job URL.

    Everything from ":jobs" onward is cut off; if the job URL carried a
    query string (the DB type), it is re-appended so the pilot URL keeps it.
    """
    cutoff = job_url.index(":jobs")
    query = SAGAUrl(job_url).query
    pilot_url = job_url[:cutoff]
    # re-append DB type query, if any (None or "" means there was none)
    if query:
        pilot_url = pilot_url + "?" + query
    return pilot_url
def __exists_file(cls, url):
    """ return True if file at url exists. Otherwise False """
    file_url = SAGAUrl(url)
    host = file_url.host
    if host == "":
        # no host component: treat the whole URL string as a local path
        return os.path.exists(str(file_url))
    if host == "localhost":
        # explicit localhost: check the path component locally
        return os.path.exists(file_url.path)
    # remote host: optimistically assume the file exists (not checked here)
    return True
def __parse_subjob_url(self, subjob_url):
    """Split a sub-job URL into its coordination URL and sub-job id.

    The coordination part is everything before "bigjob"; the DB-type query
    (if present) is re-attached to it. The sub-job id is the URL path with
    its leading "/" stripped.
    """
    saga_url = SAGAUrl(subjob_url)
    dbtype = saga_url.query
    coordination = subjob_url[:subjob_url.index("bigjob")]
    if dbtype is not None:
        coordination = os.path.join(coordination, "?" + dbtype)
    sj_url = saga_url.path[1:]
    logger.debug("Parsed URL - Coordination: %s Pilot: %s" % (coordination, sj_url))
    return coordination, sj_url
def __parse_url(self, url):
    """Parse *url* into (scheme, username, password, host, port, query).

    Primary path uses SAGAUrl; if that raises anything, a fallback parser
    based on Python's urlparse library is used, with an additional manual
    host/port extraction for Python 2.6 where urlparse may return no
    hostname for unknown schemes.
    """
    try:
        surl = SAGAUrl(url)
        host = surl.host
        port = surl.port
        username = surl.username
        password = surl.password
        query = surl.query
        # normalize: strip a single trailing "/" from the query
        if query != None and query.endswith("/"):
            query = query[:-1]
        scheme = "%s://" % surl.scheme
    # NOTE(review): bare except intentionally kept — any SAGAUrl failure
    # (including unexpected ones) falls back to urlparse below.
    except:
        """ Fallback URL parser based on Python urlparse library """
        logger.error("URL %s could not be parsed" % (url))
        traceback.print_exc(file=sys.stderr)
        result = urlparse.urlparse(url)
        logger.debug("Result: " + str(result))
        host = result.hostname
        #host = None
        port = result.port
        username = result.username
        password = result.password
        scheme = "%s://" % result.scheme
        if host == None:
            # Python 2.6 urlparse returns no hostname for some schemes;
            # slice host (and optional :port) out of the raw string instead.
            logger.debug("Python 2.6 fallback")
            if url.find("/", len(scheme)) > 0:
                host = url[len(scheme):url.find("/", len(scheme))]
            else:
                host = url[len(scheme):]
            if host.find(":") > 1:
                logger.debug(host)
                comp = host.split(":")
                host = comp[0]
                port = int(comp[1])
        # query = everything after the first "?", or None if absent
        if url.find("?") > 0:
            query = url[url.find("?") + 1:]
        else:
            query = None
    logger.debug("%s %s %s" % (scheme, host, port))
    return scheme, username, password, host, port, query
def wait(self):
    """Block until every sub-job of this pilot has finished.

    Polls the coordination backend every 2 seconds, tallies the states of
    all sub-jobs, and returns once the number of finished sub-jobs equals
    the total number of sub-jobs (an empty job list returns immediately).
    """
    while True:
        jobs = self.coordination.get_jobs_of_pilot(self.pilot_url)
        finish_counter = 0
        result_map = {}
        for i in jobs:
            # parse job id out of the sub-job url
            surl = SAGAUrl(i)
            state = self.coordination.get_job_state(surl.path)
            # tally per-state counts (dict.has_key is deprecated; use .get)
            result_map[state] = result_map.get(state, 0) + 1
            if self.__has_finished(state):
                finish_counter = finish_counter + 1
        logger.debug("Total Jobs: %s States: %s" % (len(jobs), str(result_map)))
        if finish_counter == len(jobs):
            break
        time.sleep(2)
def __parse_pilot_url(self, pilot_url):
    """Split a pilot URL into its coordination URL and pilot id.

    Mirrors __parse_subjob_url: coordination is everything before "bigjob"
    (plus the DB-type query, if any); the pilot id is the URL path without
    its leading "/".
    """
    saga_url = SAGAUrl(pilot_url)
    dbtype = saga_url.query
    coordination = pilot_url[:pilot_url.index("bigjob")]
    if dbtype is not None:
        coordination = os.path.join(coordination, "?" + dbtype)
    pilot_url = saga_url.path[1:]
    logger.debug("Parsed URL - Coordination: %s Pilot: %s" % (coordination, pilot_url))
    return coordination, pilot_url
def start_pilot_job(self, lrms_url, number_nodes=1, queue=None, project=None, working_directory=None, userproxy=None, walltime=None, processes_per_node=1, filetransfers=None, external_queue="", pilot_compute_description=None):
    """Start a batch job (using the SAGA Job API) at a resource manager.

    Currently supported resource managers include:
        fork://localhost/                   (default job adaptor)
        gram://qb1.loni.org/jobmanager-pbs  (Globus adaptor)
        pbspro://localhost                  (PBS Pro adaptor)
    plus cloud schemes (gce+ssh, ec2+ssh, euca+ssh, nova+ssh) and Condor.

    Parameters:
        lrms_url: resource manager URL (scheme selects the job service).
        number_nodes: number of nodes to request.
        queue, project, walltime: batch system parameters (set on the job
            description only when not None).
        working_directory: working dir for the agent; created locally for
            fork/condor schemes when missing (unless a "go:" URL).
        userproxy: accepted but not used in this method body.
        processes_per_node: multiplied with number_nodes for self.number_nodes.
        filetransfers: list of file-transfer specs staged to the agent dir.
        external_queue: second queue id registered in _pilot_url_dict.
        pilot_compute_description: passed to cloud job services.

    Returns the pilot URL. Raises BigJobError if a BigJob is already active.
    """
    if self.job != None:
        raise BigJobError("One BigJob already active. Please stop BigJob first.")
        return  # NOTE(review): unreachable after raise — kept from original

    ##############################################################################
    # initialization of coordination and communication subsystem
    # Communication & Coordination initialization
    lrms_saga_url = SAGAUrl(lrms_url)
    self.url = lrms_saga_url
    self.pilot_url = self.app_url + ":" + lrms_saga_url.host
    self.number_nodes = int(number_nodes) * int(processes_per_node)

    # Store references to BJ in global dict
    _pilot_url_dict[self.pilot_url] = self
    _pilot_url_dict[external_queue] = self

    logger.debug("create pilot job entry on backend server: " + self.pilot_url)
    self.coordination.set_pilot_state(self.pilot_url, str(Unknown), False)
    self.coordination.set_pilot_description(self.pilot_url, filetransfers)
    logger.debug("set pilot state to: " + str(Unknown))

    ##############################################################################
    # Create Job Service (Default: SAGA Job Service, alternative Job Services supported)
    self.js = None
    if lrms_saga_url.scheme == "gce+ssh":
        self.js = GCEService(lrms_saga_url, pilot_compute_description)
    elif lrms_saga_url.scheme == "ec2+ssh" or lrms_saga_url.scheme == "euca+ssh" \
            or lrms_saga_url.scheme == "nova+ssh":
        self.js = EC2Service(lrms_saga_url, pilot_compute_description)
    else:
        self.js = SAGAJobService(lrms_saga_url)

    ##############################################################################
    # create job description
    jd = SAGAJobDescription()

    # Attempt to create working directory (e.g. in local scenario)
    if working_directory != None:
        if not os.path.isdir(working_directory) \
                and (lrms_saga_url.scheme.startswith("fork") or lrms_saga_url.scheme.startswith("condor")) \
                and working_directory.startswith("go:") == False:
            os.mkdir(working_directory)
        self.working_directory = working_directory
    else:
        # if no working dir is set assume use home directory
        # will fail if home directory is not the same on remote machine
        # but this is just a guess to avoid failing
        #self.working_directory = os.path.expanduser("~")
        self.working_directory = ""

    if queue != None:
        jd.queue = queue
    if project != None:
        jd.project = project
    if walltime != None:
        # bliss expects an int walltime; legacy SAGA expects a string
        if is_bliss:
            jd.wall_time_limit = int(walltime)
        else:
            jd.wall_time_limit = str(walltime)

    ##############################################################################
    # File Management and Stage-In
    # Determine whether target machine use gsissh or ssh to logon.
    # logger.debug("Detect launch method for: " + lrms_saga_url.host)
    # self.launch_method = self.__get_launch_method(lrms_saga_url.host,lrms_saga_url.username)
    self.bigjob_working_directory_url = ""
    if lrms_saga_url.scheme.startswith("gce") or lrms_saga_url.scheme.startswith("ec2") \
            or lrms_saga_url.scheme.startswith("euca") or lrms_saga_url.scheme.startswith("nova"):
        logger.debug("File Staging for Cloud Instances currently not supported.")
    elif lrms_saga_url.scheme.startswith("condor") == True:
        logger.debug("Using Condor file staging")
    else:
        # build target url for working directory
        # this will also create the remote directory for the BJ
        # Fallback if working directory is not a valid URL
        if not (self.working_directory.startswith("go:") or self.working_directory.startswith("ssh://")):
            if lrms_saga_url.username != None and lrms_saga_url.username != "":
                self.bigjob_working_directory_url = "ssh://" + lrms_saga_url.username + "@" + lrms_saga_url.host + self.__get_bigjob_working_dir()
            else:
                self.bigjob_working_directory_url = "ssh://" + lrms_saga_url.host + self.__get_bigjob_working_dir()
        elif self.working_directory.startswith("go:"):
            self.bigjob_working_directory_url = os.path.join(self.working_directory, self.uuid)
        else:
            # working directory is a valid file staging URL
            self.bigjob_working_directory_url = self.working_directory

        # initialize file manager that takes care of file movement and directory creation
        if self.__filemanager == None:
            self.__initialize_pilot_data(self.bigjob_working_directory_url)  # determines the url

        if self.__filemanager != None and not self.working_directory.startswith("/"):
            self.working_directory = self.__filemanager.get_path(self.bigjob_working_directory_url)

        # determine working directory of bigjob
        # if a remote sandbox can be created via ssh => create a own dir for each bj job id
        # otherwise use specified working directory
        logger.debug("BigJob working directory: %s" % self.bigjob_working_directory_url)
        if self.__filemanager != None and self.__filemanager.create_remote_directory(self.bigjob_working_directory_url) == True:
            self.working_directory = self.__get_bigjob_working_dir()
            self.__stage_files(filetransfers, self.bigjob_working_directory_url)
        else:
            logger.warn("No file staging adaptor found.")

    logger.debug("BJ Working Directory: %s", self.working_directory)
    # Condor manages its own sandbox; leave working_directory empty there
    if lrms_saga_url.scheme.startswith("condor") == False:
        jd.working_directory = self.working_directory
    else:
        jd.working_directory = ""

    ##############################################################################
    # Create and process BJ bootstrap script
    bootstrap_script = self.__generate_bootstrap_script(
        self.coordination.get_address(),
        self.pilot_url,   # Queue 1 used by this BJ object
        external_queue    # Queue 2 used by Pilot Compute Service
                          # or another external scheduler
    )
    logger.debug("Adaptor specific modifications: " + str(lrms_saga_url.scheme))
    # escape the inline Python so each adaptor's submission layer accepts it
    if is_bliss:
        bootstrap_script = self.__escape_bliss(bootstrap_script)
    else:
        if lrms_saga_url.scheme == "gram":
            bootstrap_script = self.__escape_rsl(bootstrap_script)
        elif lrms_saga_url.scheme == "pbspro" or lrms_saga_url.scheme == "xt5torque" or lrms_saga_url.scheme == "torque":
            bootstrap_script = self.__escape_pbs(bootstrap_script)
        elif lrms_saga_url.scheme == "ssh":
            bootstrap_script = self.__escape_ssh(bootstrap_script)
    logger.debug(bootstrap_script)

    # Define Agent Executable in Job description
    # in Condor case bootstrap script is staged
    # (Python app cannot be passed inline in Condor job description)
    if lrms_saga_url.scheme.startswith("condor") == True:
        condor_bootstrap_filename = os.path.join("/tmp", "bootstrap-" + str(self.uuid))
        condor_bootstrap_file = open(condor_bootstrap_filename, "w")
        condor_bootstrap_file.write(bootstrap_script)
        condor_bootstrap_file.close()
        logger.debug("Using Condor - bootstrap file: " + condor_bootstrap_filename)

        jd.executable = "/usr/bin/env"
        jd.arguments = ["python", os.path.basename(condor_bootstrap_filename)]
        bj_file_transfers = []
        # "local > remote": stage bootstrap script into the Condor sandbox
        file_transfer_spec = condor_bootstrap_filename + " > " + os.path.basename(condor_bootstrap_filename)
        bj_file_transfers.append(file_transfer_spec)
        output_file_name = "output-" + str(self.uuid) + ".tar.gz"
        # "local < remote": fetch the agent output archive back
        output_file_transfer_spec = os.path.join(self.working_directory, output_file_name) + " < " + output_file_name
        #output_file_transfer_spec = os.path.join(self.working_directory, "output.tar.gz") +" < output.tar.gz"
        logger.debug("Output transfer: " + output_file_transfer_spec)
        bj_file_transfers.append(output_file_transfer_spec)
        if filetransfers != None:
            for t in filetransfers:
                bj_file_transfers.append(t)
        logger.debug("Condor file transfers: " + str(bj_file_transfers))
        jd.file_transfer = bj_file_transfers
    else:
        if is_bliss:
            jd.total_cpu_count = int(number_nodes)
        else:
            jd.number_of_processes = str(number_nodes)
            jd.processes_per_host = str(processes_per_node)
            # NOTE(review): spmd_variation placed in the non-bliss branch —
            # confirm against upstream; the collapsed source is ambiguous here.
            jd.spmd_variation = "single"
        # pass bootstrap script inline via "python -c"
        jd.arguments = ["python", "-c", bootstrap_script]
        jd.executable = "/usr/bin/env"
        logger.debug("Working directory: " + jd.working_directory)
        jd.output = os.path.join(self.working_directory, "stdout-" + self.uuid + "-agent.txt")
        jd.error = os.path.join(self.working_directory, "stderr-" + self.uuid + "-agent.txt")

    ##############################################################################
    # Create and submit pilot job to job service
    logger.debug("Creating pilot job with description: %s" % str(jd))
    self.job = self.js.create_job(jd)
    logger.debug("Submit pilot job to: " + str(lrms_saga_url))
    self.job.run()
    return self.pilot_url
def get_base_url(cls, application_id):
    """Build the base URL for *application_id* from cls.BASE_URL.

    Keeps scheme and host of BASE_URL, appends the application id and a
    trailing slash.
    """
    parsed = SAGAUrl(cls.BASE_URL)
    base_url = "%s://%s/%s/" % (parsed.scheme, parsed.host, application_id)
    logger.debug(base_url)
    return base_url
def __remove_dbtype(cls, url):
    """Round-trip *url* through SAGAUrl and return its string form.

    NOTE(review): the name suggests the dbtype query is stripped by this
    normalization — confirm against SAGAUrl.__str__ behavior.
    """
    return str(SAGAUrl(url))