def monitor_jobs(self):
    """Monitor running processes. """
    #pdb.set_trace()
    logger.debug("Monitor jobs - # current jobs: %d"%len(self.jobs))
    for i in self.jobs:
        if self.processes.has_key(i): # only if job has already been started
            p = self.processes[i]
            p_state = p.poll()
            logger.debug(self.print_job(i) + " state: " + str(p_state) + " return code: " + str(p.returncode))
            if (p_state != None and (p_state==0 or p_state==255)):
                logger.debug("Job successful: " + self.print_job(i))
                self.coordination.set_job_state(i, str(bigjob.state.Done))
                #i.set_attribute("state", str(saga.job.Done))
                self.free_nodes(i)
                del self.processes[i]
            elif p_state!=0 and p_state!=255 and p_state != None:
                logger.debug(self.print_job(i) + " failed. ")
                # do not free nodes => very likely the job will fail on these nodes
                # self.free_nodes(i)
                #if self.restarted.has_key(i)==False:
                #    logger.debug("Try to restart job " + self.print_job(i))
                #    self.restarted[i]=True
                #    self.execute_job(i)
                #else:
                logger.debug("Job failed " + self.print_job(i))
                self.coordination.set_job_state(i, str(bigjob.state.Failed))
                self.free_nodes(i)
                del self.processes[i]
def __get_launch_method(self, requested_method):
    """ returns desired execution method: ssh, aprun """
    aprun_available = False
    try:
        aprun_available = (subprocess.call("aprun -n 1 /bin/date", shell=True)==0)
    except:
        pass
    ssh_available = False
    try:
        ssh_available = (subprocess.call("ssh localhost /bin/date", shell=True)==0)
    except:
        pass
    launch_method = "ssh"
    if requested_method=="aprun" and aprun_available == True:
        launch_method = "aprun"
    elif requested_method=="ssh" and ssh_available == True:
        launch_method = "ssh"
    # aprun fallback
    elif ssh_available==False and aprun_available==True:
        launch_method = "aprun"
    logger.debug("aprun: " + str(aprun_available) + " ssh: " + str(ssh_available) + " Launch method: " + str(launch_method))
    return launch_method
def __init__(self, pilot_compute_service=None, bigjob_object=None, pilot_compute_description=None, pilot_url=None): # for reconnecting
    """ Create/reconnect to a Pilot Compute.

        Keyword arguments:
        pilot_url -- restore from cp_id

        The implementation will attempt to reconnect to the PC instance
        referenced by the pilot_url.
    """
    self.__subjobs = []
    self.__pilot_compute_service = None
    if pilot_url==None:
        logger.debug("Create PilotCompute for BigJob: " + str(bigjob_object))
        self.pilot_compute_description = pilot_compute_description
        self.__pilot_compute_service = pilot_compute_service
        self.__bigjob = bigjob_object
    else:
        logger.debug("Reconnect to an existing Pilot Compute")
        self.__bigjob = bigjob(pilot_url=pilot_url)

    # Store the URL of the pilot compute service for later reference.
    # This URL is used as the central queue for a set of BJs in the
    # ComputeDataServiceDecentral.
    if self.__pilot_compute_service!=None:
        self.coordination_queue = pilot_compute_service.coordination_queue
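A minimal sketch of the reconnect path, assuming the pilot package is installed and that PilotCompute is importable from it; the import path and the Redis pilot URL below are illustrative assumptions, not taken from this code:

# Sketch only: reconnect to an existing Pilot Compute by URL.
# Import path and the pilot URL are assumptions for illustration.
from pilot import PilotCompute

existing_pilot_url = "redis://localhost/bigjob:bj-1234"  # hypothetical pilot URL
pc = PilotCompute(pilot_url=existing_pilot_url)          # takes the reconnect branch above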
def __initialize_pilot_data(self):
    if self.pilot_data_description != None:
        self.service_url = self.pilot_data_description["service_url"]
        self.size = self.pilot_data_description["size"]

        # initialize file adaptor
        if self.service_url.startswith("ssh:"):
            logger.debug("Use SSH backend")
            self.__filemanager = SSHFileAdaptor(self.service_url)
        elif self.service_url.startswith("http:"):
            logger.debug("Use WebHDFS backend")
            self.__filemanager = WebHDFSFileAdaptor(self.service_url)
        elif self.service_url.startswith("go:"):
            logger.debug("Use Globus Online backend")
            self.__filemanager = GlobusOnlineFileAdaptor(self.service_url)
        elif self.service_url.startswith("gs:"):
            logger.debug("Use Google Cloud Storage backend")
            self.__filemanager = GSFileAdaptor(self.service_url, self.security_context)
        elif (self.service_url.startswith("s3:")
              or self.service_url.startswith("walrus:")
              or self.service_url.startswith("swift:")):
            logger.debug("Use Amazon S3/Eucalyptus Walrus/SWIFT Storage backend")
            self.__filemanager = S3FileAdaptor(self.service_url, self.security_context, self.pilot_data_description)
        else:
            raise PilotError("No File Plugin found.")

        self.__filemanager.initialize_pilotdata()
        self.__filemanager.get_pilotdata_size()

        # Update security context
        self.security_context = self.__filemanager.get_security_context()
def run(self):
    jd = saga.job.description()
    jd.arguments = ["-c", self.bootstrap_script]
    jd.executable = "python"
    jd.working_directory = self.working_directory
    jd.set_attribute("Interactive", "True")

    # Submit job
    js = None
    if self.userproxy != None and self.userproxy != '':
        s = saga.session()
        os.environ["X509_USER_PROXY"] = self.userproxy
        ctx = saga.context("x509")
        ctx.set_attribute("UserProxy", self.userproxy)
        s.add_context(ctx)
        print "use proxy: " + self.userproxy
        js = saga.job.service(s, self.lrms_saga_url)
    else:
        print "use standard proxy"
        js = saga.job.service(self.lrms_saga_url)

    pbssshjob = js.create_job(jd)
    print "Submit pilot job to: " + str(self.lrms_saga_url)
    pbssshjob.run()
    pbssshjob.wait()
    outstr = pbssshjob.get_stdout().read()
    errstr = pbssshjob.get_stderr().read()
    self.job_id = outstr.split(".")[0]
    logger.debug("PBS JobID: " + str(self.job_id))
    if self.job_id==None or self.job_id=="":
        raise Exception("BigJob submission via pbs-ssh:// failed: %s %s" % (outstr, errstr))
def to_dict(self):
    pd_dict = {}
    pd_dict["id"] = self.id
    pd_dict["url"] = self.url
    pd_dict["pilot_data_description"] = self.pilot_data_description
    logger.debug("PD Dictionary: " + str(pd_dict))
    return pd_dict
def __create_remote_directory(self, target_url):
    #result = urlparse.urlparse(target_url)
    #target_host = result.netloc
    #target_path = result.path

    # Python 2.6 compatible URL parsing
    scheme = target_url[:target_url.find("://")+3]
    target_host = target_url[len(scheme):target_url.find("/", len(scheme))]
    target_path = target_url[len(scheme)+len(target_host):]
    target_user = None
    if target_host.find("@") > 1:
        comp = target_host.split("@")
        target_host = comp[1]
        target_user = comp[0]
    logger.debug("Create remote directory; scheme: %s, host: %s, path: %s"%(scheme, target_host, target_path))
    if scheme.startswith("fork") or target_host.startswith("localhost"):
        os.makedirs(target_path)
        return True
    else:
        try:
            client = self.__get_ssh_client(target_host, target_user)
            sftp = client.open_sftp()
            sftp.mkdir(target_path)
            sftp.close()
            client.close()
            return True
        except:
            self.__print_traceback()
            logger.warn("Error creating directory: " + str(target_path)
                        + " at: " + str(target_host)
                        + " SSH password-less login activated?")
            return False
def cancel(self, pilot_url=None):
    logger.debug("delete job: " + self.job_url)
    if self.pilot_url==None:
        self.pilot_url = pilot_url
        self.bj = pilot_url_dict[pilot_url]
    if str(self.bj.get_state())=="Running":
        self.bj.delete_subjob(self.job_url)
def create_data_unit_from_dict(cls, du_dict):
    du = DataUnitItem()
    logger.debug("Restore DU: " + str(du_dict))
    for i in du_dict.keys():
        logger.debug("Set attribute: %s", i)
        du.__setattr__(i, du_dict[i])
    return du
def create_pilot_data_from_dict(cls, pd_dict):
    pd = PilotData()
    for i in pd_dict.keys():
        pd.__setattr__(i, pd_dict[i])
    pd.initialize_pilot_data()
    logger.debug("created pd " + str(pd))
    return pd
def create_du(self, du_id):
    logger.debug("create iRods collection: " + du_id)
    if self.is_local:
        command = "mkdir %s"%(os.path.join(self.localpath, du_id))
    else:
        command = "imkdir %s"%(du_id)
    self.__run_command(command)
def set_pilot_state(self, pilot_url, new_state, stopped=False):
    pilot_url = self.get_url(pilot_url)
    logger.debug("create advert entry: " + pilot_url)
    pilot_dir = saga.advert.directory(saga.url(pilot_url),
                                      saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite)
    logger.debug("update state of pilot job to: " + str(new_state) + " Stopped: " + str(stopped))
    pilot_dir.set_attribute("state", str(new_state))
    pilot_dir.set_attribute("stopped", str(stopped))
def put_du(self, du):
    logging.debug("Copy DU using Globus Online")
    for i in du.list_data_unit_items():
        remote_path = os.path.join(self.path, str(du.id), os.path.basename(i.local_url))
        logging.debug("Put file: %s to %s"%(i.local_url, remote_path))
        if i.local_url.startswith("ssh://"):
            # check if remote path is directory
            if self.__is_remote_directory(i.local_url):
                logging.warning("Path %s is a directory. Ignored."%i.local_url)
                continue
            #self.__third_party_transfer(i.local_url, remote_path)
        else:
            if stat.S_ISDIR(os.stat(i.local_url).st_mode):
                logging.warning("Path %s is a directory. Ignored."%i.local_url)
                continue
            result = urlparse.urlparse(i.local_url)
            source_host = result.netloc
            source_path = result.path
            logger.debug(str((source_host, source_path, self.host, remote_path)))
            if source_host == "" or source_host==None:
                cmd = "scp " + source_path + " " + self.host + ":" + remote_path
            else:
                cmd = "scp " + source_host + ":" + source_path + " " + self.host + ":" + remote_path
            logger.debug("Command: %s"%cmd)
            os.system(cmd)
def create_pilot_data_from_dict(cls, ps_dict):
    ps = PilotData()
    for i in ps_dict.keys():
        ps.__setattr__(i, ps_dict[i])
    ps.initialize_pilot_data()
    logger.debug("created ps " + str(ps))
    return ps
def copy_du_to_url(self, du, local_url, remote_url):
    base_dir = self.__get_path_for_du(du)
    logger.debug("copy_du_to_url, source: %s remote: %s"%(base_dir, remote_url))
    if remote_url.startswith("/") and os.path.exists(base_dir):
        target_path = remote_url
        source_path = base_dir
        logger.debug("Target and source host are localhost. Processing: %s"%(source_path))
        expanded_path = glob.glob(source_path + "/*")
        logger.debug("Expanded path: " + str(expanded_path))
        for path in expanded_path:
            if os.path.isdir(path):
                logger.debug("Source path %s is directory"%path)
                files = os.listdir(path)
                for i in files:
                    try:
                        # link each file in the directory into the target directory
                        os.symlink(os.path.join(path, i), os.path.join(target_path, i))
                    except:
                        self.__print_traceback()
            else:
                try:
                    os.symlink(path, os.path.join(target_path, os.path.basename(path)))
                except:
                    self.__print_traceback()
    else:
        self.create_remote_directory(remote_url)
        for filename in self.__sftp.listdir(base_dir):
            file_url = local_url + "/" + filename
            file_remote_url = remote_url + "/" + filename
            logger.debug("Copy " + file_url + " to " + file_remote_url)
            self.__third_party_transfer_host(file_url, file_remote_url)
def wait(self):
    """ Waits for completion of all sub-jobs """
    while 1:
        if self.get_state()=="Done" or self.get_state()=="Failed":
            logger.debug("BigJob terminated. Exit Wait")
            break

        jobs = self.coordination.get_jobs_of_pilot(self.pilot_url)
        finish_counter = 0
        result_map = {}
        for i in jobs:
            # parse job id out of sj url
            surl = SAGAUrl(i)
            sj_id = surl.path
            if sj_id.startswith("/"):
                sj_id = sj_id[1:]
            state = str(self.coordination.get_job_state(sj_id))
            #logger.debug("SJ: %s : State: %s"%(sj_id, str(state)))
            #state = job_detail["state"]
            if result_map.has_key(state)==False:
                result_map[state] = 1
            else:
                result_map[state] = result_map[state] + 1
            if self.__has_finished(state)==True:
                finish_counter = finish_counter + 1
        logger.debug("Total Jobs: %s States: %s"%(len(jobs), str(result_map)))
        if finish_counter == len(jobs):
            break
        time.sleep(2)
def __run_command(self, command):
    logger.debug(command)
    child = pexpect.spawn(command, timeout=None)
    output = child.readlines()
    logger.debug("Run %s Output: %s"%(command, str(output)))
    child.close()
    return output
def schedule_pilot_data(self, data_unit_description=None):
    logger.debug("Schedule to PD - # Avail pilots: %d"%len(self.pilot_data))
    candidate_pilot_data = []
    if data_unit_description.has_key("affinity_datacenter_label") and data_unit_description.has_key("affinity_machine_label"):
        for i in self.pilot_data:
            pilot_data_description = i.pilot_data_description
            if pilot_data_description.has_key("affinity_machine_label") and pilot_data_description.has_key("affinity_datacenter_label"):
                if data_unit_description["affinity_datacenter_label"] == pilot_data_description["affinity_datacenter_label"]\
                and data_unit_description["affinity_machine_label"] == pilot_data_description["affinity_machine_label"]:
                    candidate_pilot_data.append(i)

    if len(candidate_pilot_data) == 0:
        # No PD with requested affinity found
        # move data unit into a "possibly" remote pilot data
        logger.debug("No pilot data w/ affinity found... Looking for alternative pilot.")
        candidate_pilot_data = self.pilot_data

    if len(candidate_pilot_data) > 0:
        return random.choice(candidate_pilot_data)
    return None
    #if len(self.pilot_data)!=0:
    #    return random.choice(self.pilot_data)
    #return None
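For illustration, the affinity-based matching above only considers descriptions that carry both labels. A sketch of matching description dictionaries; the key names come from the code above and from the DataUnit/PilotData initializers, while every concrete value (URL, size, labels) is a made-up example:

# Illustrative only: values are assumptions, key names are taken from the scheduler above.
pilot_data_description = {
    "service_url": "ssh://localhost/tmp/pilot-data",
    "size": 100,
    "affinity_datacenter_label": "eu-de-south",
    "affinity_machine_label": "machine-1",
}

data_unit_description = {
    "file_urls": ["test.txt"],
    "affinity_datacenter_label": "eu-de-south",
    "affinity_machine_label": "machine-1",   # must match the PD's labels to become a candidate
}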
def set_job_state(self, job_url, new_state):
    self.resource_lock.acquire()
    try:
        logger.debug("set job state to: " + str(new_state))
        timestamp = time.time()
        if new_state=="Unknown":
            #self.redis_client.hset(job_url,"start_time", str(timestamp))
            self.pipe.hset(job_url, "start_time", str(timestamp))
        elif new_state=="Staging":
            self.pipe.hset(job_url, "start_staging_time", str(timestamp))
        elif new_state=="Running":
            self.pipe.hset(job_url, "agent_start_time", str(self.redis_adaptor_start_time))
            self.pipe.hset(job_url, "end_queue_time", str(timestamp))
        elif new_state=="Done":
            self.pipe.hset(job_url, "run_host", socket.gethostname())
            self.pipe.hset(job_url, "end_time", str(timestamp))
        self.pipe.hset(job_url, "state", str(new_state))

        # update last contact time in pilot hash
        pilot_url = job_url[:job_url.index(":jobs")]
        self.pipe.hset(pilot_url, "last_contact", str(timestamp))

        # execute pipe
        self.pipe.execute()
    except:
        pass
    self.resource_lock.release()
def put_du(self, du):
    """Copy Data Unit to Pilot Data"""
    logger.debug("Put DU: %s to Pilot-Data: %s"%(du.id, self.service_url))
    self.__filemanager.create_du(du.id)
    self.__filemanager.put_du(du)
    self.data_unit_urls.append(du.get_url())
    CoordinationAdaptor.update_pd(self)
def dequeue_new_jobs(self):
    """Subscribe to new jobs from Redis. """
    job_counter = 0
    while self.is_stopped(self.base_url)==False:
        if len(self.freenodes)==0:
            time.sleep(3)
            continue
        logger.debug("Dequeue sub-job from PilotCompute queue: " + self.base_url)
        job_url = self.coordination.dequeue_job(self.base_url)
        logger.debug("Dequeued: %s"%str(job_url))
        if job_url==None:
            if self.cds_queue_url!=None:
                logger.debug("Dequeue sub-job from ComputeDataService queue: " + self.cds_queue_url)
                job_url = self.coordination.dequeue_job(self.cds_queue_url)
                logger.debug("Dequeued: %s"%str(job_url))
            if job_url==None:
                time.sleep(3)
                continue
        if job_url=="STOP":
            break

        job_counter = job_counter + 1
        if (job_counter % (THREAD_POOL_SIZE))==0: # ensure that threadpool is not overloaded
            self.threadpool.wait()

        request = WorkRequest(self.start_new_job_in_thread, [job_url])
        self.threadpool.putRequest(request)
        #time.sleep(1)

    # wait for termination of Worker Threads
    # self.threadpool.wait()
    logger.debug("Terminating Agent - Dequeue Sub-Jobs Thread")
def __store_entry(cls, entry_url, content):
    entry_url = cls.__get_url(entry_url)
    redis_client = cls.__get_redis_api_client()
    redis_client.hmset(entry_url, content)
    logger.debug("Store Redis entry at: " + entry_url + " Content: " + str(json.dumps(content)))
def __init__(self, pilot_data=None, data_unit_description=None, du_url=None):
    """
        1.) create a new Data Unit: data_unit_description required
        2.) reconnect to an existing Data Unit: du_url required
    """
    if du_url==None:
        self.id = self.DU_ID_PREFIX + str(uuid.uuid1())
        self.data_unit_description = data_unit_description
        self.pilot_data = []
        self.state = State.New
        self.data_unit_items = []
        if self.data_unit_description.has_key("file_urls"):
            self.data_unit_items = DataUnitItem.create_data_unit_list(self, self.data_unit_description["file_urls"])

        self.url = None

        # register a data unit as top-level entry in Redis
        application_url = CoordinationAdaptor.get_base_url(application_id)
        self.url = CoordinationAdaptor.add_du(application_url, self)
        CoordinationAdaptor.update_du(self)

        # Deprecated
        # old method only allowed the creation of a du if a pd existed
        #if pilot_data!=None:
        #    # Allow data units that are not connected to a resource!
        #    self.url = CoordinationAdaptor.add_du(pilot_data.url, self)
        #    CoordinationAdaptor.update_du(self)
    else:
        self.id = DataUnit._get_du_id(du_url)
        self.url = du_url
        logger.debug("Restore du: %s"%self.id)
        self.__restore_state()

    self.transfer_threads = []
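A minimal usage sketch of the reconnect branch, assuming a running Redis coordination backend; the du_url mirrors the one used in the reconnect test further below and is purely illustrative:

# Sketch only: reconnect to a previously registered Data Unit (else-branch above).
from pilot.impl.pilotdata_manager import DataUnit

du = DataUnit(du_url="redis://localhost/bigdata:du-1d1b7078-229f-11e2-834e-705681b3df0f")
print du.list()               # list the files tracked by this DU (Python 2 print, as in the rest of the code base)
du.export("/tmp/export-test") # stage the DU contents to a local directory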
def _scheduler_thread(self):
    while True and self.stop.isSet()==False:
        try:
            #logger.debug("Scheduler Thread: " + str(self.__class__) + " Pilot Data")
            du = self.du_queue.get(True, 1)
            # check whether this is a real du object
            if isinstance(du, DataUnit):
                pd = self._schedule_du(du)
                if (pd != None):
                    logger.debug("Initiate Transfer to PD.")
                    du.add_pilot_data(pd)
                    #logger.debug("Transfer to PD finished.")
                    #du._update_state(State.Running)
                    #logger.debug("Updated State to Running.")
                    self.du_queue.task_done()
                else:
                    self.du_queue.task_done()
                    self.du_queue.put(du)
        except Queue.Empty:
            pass

        if self.du_queue.empty():
            time.sleep(5)

    logger.debug("Re-Scheduler terminated")
def __init__(self, cds_url=None):
    """ Create a ComputeDataService (Decentral) object.

        @param cds_url: Reconnect to an existing CDS (optional).
    """
    # Pilot Data
    self.data_units = {}
    self.pilot_data_services = []

    # Pilot Compute
    self.compute_units = {}
    self.pilot_job_services = []

    if cds_url == None:
        self.id = self.CDS_ID_PREFIX + str(uuid.uuid1())
        application_url = CoordinationAdaptor.get_base_url(pilot.application_id)
        self.url = CoordinationAdaptor.add_cds(application_url, self)
    else:
        self.id = self.__get_cds_id(cds_url)
        self.url = cds_url

    # Background Thread for scheduling
    self.scheduler = Scheduler()
    self.du_queue = Queue.Queue()
    self.stop = threading.Event()
    self.scheduler_thread = threading.Thread(target=self._scheduler_thread)
    self.scheduler_thread.daemon = True
    self.scheduler_thread.start()
    logger.debug("Created ComputeDataServiceDecentral")
def __update_scheduler_resources(self):
    logger.debug("__update_scheduler_resources")
    pd = [s for i in self.pilot_data_services for s in i.list_pilots()]
    self.scheduler.set_pilot_data(pd)
    pj = [p for i in self.pilot_job_services for p in i.list_pilots()]
    logger.debug("Pilot-Jobs: " + str(pj))
    self.scheduler.set_pilot_jobs(pj)
def __escape_bliss(self, bootstrap_script):
    logger.debug("Escape Bliss")
    #bootstrap_script = bootstrap_script.replace("\'", "\"")
    #bootstrap_script = "\'" + bootstrap_script + "\'"
    bootstrap_script = bootstrap_script.replace('"', '\\"')
    bootstrap_script = '"' + bootstrap_script + '"'
    return bootstrap_script
def start_background_thread(self):
    self.stop = False
    logger.debug("##################################### New POLL/MONITOR cycle ##################################")
    while True and self.stop == False:
        logger.debug("Free nodes: " + str(len(self.freenodes))
                     + " Busy Nodes: " + str(len(self.busynodes))
                     + " Number of running sub-jobs: " + str(len(self.jobs)))
        if self.is_stopped(self.base_url) == True:
            logger.debug("Pilot terminated.")
            break
        else:
            logger.debug("Pilot job entry: " + str(self.base_url) + " exists. Pilot job not in state stopped.")
        try:
            #self.poll_jobs()
            self.monitor_jobs()
            time.sleep(5)
            self.failed_polls = 0
        except:
            traceback.print_exc(file=sys.stdout)
            self.failed_polls = self.failed_polls + 1
            if self.failed_polls > 3: # after 3 failed attempts exit
                break
    logger.debug("Terminating Agent - Background Thread")
def __get_launch_method(self, requested_method):
    """ returns desired execution method: ssh, aprun, ibrun, local """
    aprun_available = False
    try:
        aprun_available = (subprocess.call("aprun -n 1 /bin/date", shell=True, stdout=None, stderr=None)==0)
    except:
        self.__print_traceback()

    ibrun_available = self.__ibrun_available()

    ssh_available = False
    try:
        ssh_available = (subprocess.call("ssh -o PasswordAuthentication=no -o NumberOfPasswordPrompts=0 localhost /bin/date",
                                         shell=True, stdout=None, stderr=None)==0)
    except:
        pass

    launch_method = "local"
    if requested_method=="aprun" and aprun_available == True:
        launch_method = "aprun"
    elif ibrun_available == True:
        launch_method = "ibrun"
    elif requested_method=="ssh" and ssh_available == True:
        launch_method = "ssh"
    # aprun fallback
    elif ssh_available==False and aprun_available==True:
        launch_method = "aprun"
    logger.debug("aprun: " + str(aprun_available) + " ibrun: " + str(ibrun_available)
                 + " ssh: " + str(ssh_available) + " Launch method: " + str(launch_method))
    return launch_method
def set_job_state(self, job_url, new_state):
    self.resource_lock.acquire()
    job_url = self.get_url(job_url)
    logger.debug("Set state of job: " + str(job_url) + " to: " + str(new_state))
    job_dir = saga.advert.directory(saga.url(job_url),
                                    saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite)
    job_dir.set_attribute("state", str(new_state))
    self.resource_lock.release()
def list_pd(cls, pds_url):
    """ return a list of urls to pd managed by the PDS """
    pds_url = cls.__get_url(pds_url)
    logger.debug("List PD at %s" % pds_url)
def __initialize_pilot_data(self):
    if self.pilot_data_description != None:
        self.service_url = self.pilot_data_description["service_url"]
        if self.pilot_data_description.has_key("size"):
            self.size = self.pilot_data_description["size"]

        # initialize file adaptor
        if self.service_url.startswith("ssh:"):
            logger.debug("Use SSH backend")
            self.__filemanager = SSHFileAdaptor(self.service_url,
                                                self.security_context,
                                                self.pilot_data_description)
        elif self.service_url.startswith("http:"):
            logger.debug("Use WebHDFS backend")
            self.__filemanager = WebHDFSFileAdaptor(self.service_url)
        elif self.service_url.startswith("go:"):
            logger.debug("Use Globus Online backend")
            self.__filemanager = GlobusOnlineFileAdaptor(self.service_url)
        elif self.service_url.startswith("gs:"):
            logger.debug("Use Google Cloud Storage backend")
            self.__filemanager = GSFileAdaptor(self.service_url, self.security_context)
        elif self.service_url.startswith("irods:"):
            logger.debug("Use iRods Storage backend")
            self.__filemanager = iRodsFileAdaptor(self.service_url, self.security_context)
        elif self.service_url.startswith("s3:") \
            or self.service_url.startswith("walrus:") \
            or self.service_url.startswith("swift:"):
            logger.debug("Use Amazon S3/Eucalyptus Walrus/SWIFT Storage backend")
            self.__filemanager = S3FileAdaptor(self.service_url,
                                               self.security_context,
                                               self.pilot_data_description)
        else:
            raise PilotError("No File Plugin found.")

        self.__filemanager.initialize_pilotdata()
        self.__filemanager.get_pilotdata_size()

        # Update security context
        self.security_context = self.__filemanager.get_security_context()
def __init__(self, args):
    self.coordination_url = args[1]
    # objects to store running jobs and processes
    self.jobs = []
    self.processes = {}
    self.freenodes = []
    self.busynodes = []
    self.restarted = {}

    # read config file
    conf_file = os.path.dirname(os.path.abspath(__file__)) + "/../" + CONFIG_FILE
    if not os.path.exists(conf_file):
        conf_file = os.path.join(sys.prefix, CONFIG_FILE)
    logging.debug("read configfile: " + conf_file)
    config = ConfigParser.ConfigParser()
    config.read(conf_file)
    default_dict = config.defaults()

    self.CPR = False
    if default_dict.has_key("cpr"):
        self.CPR = default_dict["cpr"]
    self.SHELL = "/bin/bash"
    if default_dict.has_key("shell"):
        self.SHELL = default_dict["shell"]
    self.MPIRUN = "mpirun"
    if default_dict.has_key("mpirun"):
        self.MPIRUN = default_dict["mpirun"]
    self.OUTPUT_TAR = False
    if default_dict.has_key("create_output_tar"):
        self.OUTPUT_TAR = eval(default_dict["create_output_tar"])
        logger.debug("Create output tar: %r", self.OUTPUT_TAR)
    self.LAUNCH_METHOD = "ssh"
    if default_dict.has_key("launch_method"):
        self.LAUNCH_METHOD = self.__get_launch_method(default_dict["launch_method"])

    logging.debug("Launch Method: " + self.LAUNCH_METHOD + " mpi: " + self.MPIRUN + " shell: " + self.SHELL)

    # init rms (SGE/PBS)
    self.init_rms()
    self.failed_polls = 0

    ##############################################################################
    # initialization of coordination and communication subsystem
    # Redis initialization
    self.base_url = args[2]
    self.cds_queue_url = None
    if len(args) == 4:
        self.cds_queue_url = args[3]
    logger.debug("External queue: " + str(self.cds_queue_url))
    self.id = self.__get_bj_id(self.base_url)
    logger.debug("BigJob Agent arguments: " + str(args))
    logger.debug("Initialize C&C subsystem to pilot-url: " + self.base_url)
    logger.debug("BigJob ID: %s" % self.id)

    # create bj directory
    self.work_dir = os.getcwd()
    if self.work_dir.find(self.id) == -1: # working directory does not yet contain the BJ id
        self.bj_dir = os.path.join(os.getcwd(), self.id)
        logger.debug("Agent working directory: %s" % self.bj_dir)
        try:
            os.makedirs(self.bj_dir)
        except:
            logger.debug("Directory already exists.")
    else:
        self.bj_dir = os.getcwd()
    os.chdir(self.bj_dir)

    if (self.coordination_url.startswith("advert://")
            or self.coordination_url.startswith("sqlasyncadvert://")):
        try:
            from coordination.bigjob_coordination_advert import bigjob_coordination
            logging.debug("Utilizing ADVERT Backend: " + self.coordination_url)
        except:
            logger.error("Advert Backend could not be loaded")
            exc_type, exc_value, exc_traceback = sys.exc_info()
            traceback.print_exc(file=sys.stderr)
            traceback.print_tb(exc_traceback, file=sys.stderr)
    elif (self.coordination_url.startswith("redis://")):
        try:
            from coordination.bigjob_coordination_redis import bigjob_coordination
            logger.debug("Utilizing Redis Backend: " + self.coordination_url
                         + ". Please make sure Redis server is configured in bigjob_coordination_redis.py")
        except:
            logger.error("Error loading pyredis.")
    elif (self.coordination_url.startswith("tcp://")):
        try:
            from coordination.bigjob_coordination_zmq import bigjob_coordination
            logger.debug("Utilizing ZMQ Backend")
        except:
            logger.error("ZMQ Backend not found. Please install ZeroMQ "
                         "(http://www.zeromq.org/intro:get-the-software) and "
                         "PYZMQ (http://zeromq.github.com/pyzmq/)")

    ###
    # Initiate coordination sub-system of both BJ agent and Pilot Data
    self.coordination = bigjob_coordination(server_connect_url=self.coordination_url)
    self.pilot_data_service = PilotDataService(coordination_url=self.coordination_url)

    # update state of pilot job to running
    logger.debug("set state to : " + str(bigjob.state.Running))
    self.coordination.set_pilot_state(self.base_url, str(bigjob.state.Running), False)
    self.pilot_description = self.coordination.get_pilot_description(self.base_url)

    ##############################################################################
    # start background thread for polling new jobs and monitoring current jobs
    self.resource_lock = threading.RLock()
    self.threadpool = ThreadPool(THREAD_POOL_SIZE)

    self.launcher_thread = threading.Thread(target=self.dequeue_new_jobs)
    self.launcher_thread.start()

    self.monitoring_thread = threading.Thread(target=self.start_background_thread)
    self.monitoring_thread.start()
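For reference, a minimal sketch of the defaults the agent looks up in bigjob_agent.conf, expressed with ConfigParser for illustration; only the key names (cpr, shell, mpirun, create_output_tar, launch_method) come from the code above, the values are assumptions:

# Sketch only: write an illustrative bigjob_agent.conf with the keys the agent reads.
# Values are assumptions; adjust them for the target machine.
import ConfigParser

config = ConfigParser.ConfigParser({
    "cpr": "False",
    "shell": "/bin/bash",
    "mpirun": "mpirun",
    "create_output_tar": "False",
    "launch_method": "ssh",
})
with open("bigjob_agent.conf", "w") as f:
    config.write(f)   # emits a [DEFAULT] section containing the keys above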
def __get_bj_id(self, url):
    logger.debug("parsing ID out of URL: %s" % url)
    start = url.index("bj-")
    end = url.index(":", start)
    bj_id = url[start:end]
    return bj_id
def update_du_state(cls, du, state):
    logger.debug("**** Update data unit STATE at: " + du.url + " to: " + str(state))
    cls.__store_entry_item(du.url + RedisCoordinationAdaptor.SEPARATOR + "info", "state", state)
def run(self):
    """ Start VM and start BJ agent via SSH on VM """

    """ Map fields of Pilot description to EC2 API
        {
            "vm_id": "ami-d7f742be",
            "vm_ssh_username": "******",
            "vm_ssh_keyname": "MyKey",
            "vm_ssh_keyfile": "<path>",
            "vm_type": "t1.micro",
            "access_key_id": "xxx",
            "secret_access_key": "xxx"
        }
    """

    reservation = self.ec2_conn.run_instances(self.pilot_compute_description["vm_id"],
                                              key_name=self.pilot_compute_description["vm_ssh_keyname"],
                                              instance_type=self.pilot_compute_description["vm_type"],
                                              security_groups=[SECURITY_GROUP])
    self.instance = reservation.instances[0]
    self.instance_id = self.instance.id
    logger.debug("Started EC2/Eucalyptus/Nova instance: %s"%self.instance_id)
    time.sleep(5)
    self.wait_for_running()

    if self.resource_url.scheme != "euca+ssh" and self.resource_url.scheme != "nova+ssh":
        self.ec2_conn.create_tags([self.instance_id], {"Name": self.id})

    self.network_ip = self.instance.ip_address
    url = "ssh://" + str(self.network_ip)
    logger.debug("Connect to: %s"%(url))

    # Submit job
    ctx = saga.Context("SSH")
    #ctx.type = saga.Context.SSH
    ctx.user_id = self.pilot_compute_description["vm_ssh_username"]
    ctx.user_key = self.pilot_compute_description["vm_ssh_keyfile"]

    session = saga.Session()
    session.add_context(ctx)

    TRIAL_MAX = 30
    trials = 0
    while trials < TRIAL_MAX:
        try:
            js = saga.job.Service(url, session=session)
            logger.debug("Job Description Type: " + str(type(self.job_description)))
            job = js.create_job(self.job_description)
            logger.debug("Attempt: %d, submit pilot job to: %s "%(trials, str(url)))
            job.run()
            if job.get_state()==saga.job.FAILED:
                logger.warning("Submission failed.")
                trials = trials + 1
                time.sleep(30)
                continue
            else:
                break
        except:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            logger.warning("Submission failed: " + str(exc_value))
            #self.__print_traceback()
            trials = trials + 1
            time.sleep(30)

    if trials == TRIAL_MAX:
        raise Exception("Submission of agent failed.")

    logger.debug("Job State : %s" % (job.get_state()))
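An illustrative pilot compute description for this EC2/Eucalyptus/Nova adaptor; the field names follow the mapping documented in the docstring above, while all concrete values (AMI, username, key paths, credentials) are placeholders, not real settings:

# Illustrative only: keys taken from the docstring mapping above, values are assumptions.
pilot_compute_description = {
    "service_url": "ec2+ssh://aws.amazon.com",   # resource URL scheme handled by this adaptor
    "vm_id": "ami-d7f742be",
    "vm_ssh_username": "ubuntu",                 # assumed username for the image
    "vm_ssh_keyname": "MyKey",
    "vm_ssh_keyfile": "/home/user/.ssh/mykey.pem",
    "vm_type": "t1.micro",
    "access_key_id": "xxx",
    "secret_access_key": "xxx",
    "number_of_processes": 1,
}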
logging.basicConfig(level=logging.DEBUG)

try:
    import saga
except:
    logging.debug("SAGA not imported. ")

logging.debug(str(sys.path))

from threadpool import *

# BigJob/Pilot framework classes
from bigjob import logger
from pilot.impl.pilotdata_manager import PilotData, DataUnit, PilotDataService

logger.debug("Python Version: " + str(sys.version_info))
if sys.version_info < (2, 5):
    sys.stderr.write("Warning: Using unsupported Python version\n")
if sys.version_info < (2, 4):
    sys.stderr.write("Warning: Using unsupported Python version\n")
if sys.version_info < (2, 3):
    sys.stderr.write("Warning: Python versions <2.3 not supported\n")
    sys.exit(-1)

import subprocess

""" Config parameters (will move to config file in future) """
CONFIG_FILE = "bigjob_agent.conf"
THREAD_POOL_SIZE = 4
APPLICATION_NAME = "bigjob"
def list_du(cls, pd_url):
    """ return a list of urls to du managed by the PDS """
    pd_url = cls.__get_url(pd_url)
    logger.debug("List Data-Units of Pilot-Data at %s" % pd_url)
    dus = cls.__list_keys(pd_url + ":du-*")
    return dus
def execute_job(self, job_url, job_dict):
    """ obtain job attributes from c&c and execute process """
    state = str(job_dict["state"])
    if (state == str(bigjob.state.Unknown) or state == str(bigjob.state.New)):
        try:
            #job_dict["state"]=str(saga.job.New)
            job_id = job_dict["job-id"]
            logger.debug("Start job id %s specification %s: " % (job_id, str(job_dict)))

            numberofprocesses = "1"
            try:
                if (job_dict.has_key("NumberOfProcesses") == True):
                    numberofprocesses = job_dict["NumberOfProcesses"]
            except:
                pass # ignore in particular if Bliss is used

            spmdvariation = "single"
            try:
                if (job_dict.has_key("SPMDVariation") == True):
                    spmdvariation = job_dict["SPMDVariation"]
            except:
                pass # ignore in particular if Bliss is used

            arguments = ""
            if (job_dict.has_key("Arguments") == True):
                arguments_raw = job_dict['Arguments']
                if type(arguments_raw) == types.ListType:
                    arguments_list = arguments_raw
                else:
                    arguments_list = eval(job_dict["Arguments"])
                for i in arguments_list:
                    arguments = arguments + " " + i

            environment = os.environ
            envi = ""
            self.number_subjobs = 1
            if (job_dict.has_key("Environment") == True):
                env_raw = job_dict['Environment']
                if type(env_raw) == types.ListType:
                    env_list = env_raw
                else:
                    env_list = eval(job_dict["Environment"])
                logger.debug("Environment: " + str(env_list))
                for i in env_list:
                    logger.debug("Eval " + i)
                    # Hack for conducting experiments on Kraken
                    # Kraken specific support for running n sub-jobs at a time
                    if i.startswith("NUMBER_SUBJOBS"):
                        self.number_subjobs = int(i.split("=")[1].strip())
                        logger.debug("NUMBER_SUBJOBS: " + str(self.number_subjobs))
                    else:
                        envi_1 = "export " + i + "; "
                        envi = envi + envi_1
                        logger.debug(envi)

            executable = job_dict["Executable"]
            executable = self.__expand_directory(executable)

            workingdirectory = os.path.join(os.getcwd(), job_id)
            if (job_dict.has_key("WorkingDirectory") == True):
                workingdirectory = job_dict["WorkingDirectory"]
                workingdirectory = self.__expand_directory(workingdirectory)
            try:
                os.makedirs(workingdirectory)
            except:
                logger.debug("Directory %s already exists." % workingdirectory)
            logging.debug("Sub-Job: %s, Working_directory: %s" % (job_id, workingdirectory))

            output = "stdout"
            if (job_dict.has_key("Output") == True):
                output = job_dict["Output"]
            if not os.path.isabs(output):
                output = os.path.join(workingdirectory, output)

            error = os.path.join(workingdirectory, "stderr")
            if (job_dict.has_key("Error") == True):
                error = job_dict["Error"]
            if not os.path.isabs(error):
                error = os.path.join(workingdirectory, error)

            # append job to job list
            self.jobs.append(job_url)

            # File Stage-In of dependent data units
            if job_dict.has_key("InputData"):
                self.__stage_in_data_units(eval(job_dict["InputData"]), workingdirectory)

            # File Stage-In - Move pilot-level files to working directory of sub-job
            if self.pilot_description != None:
                try:
                    if self.pilot_description.has_key("description"):
                        file_list = eval(self.pilot_description["description"])
                        if file_list != None and len(file_list) > 0:
                            logger.debug("Copy %d files to SJ work dir" % len(file_list))
                            for i in file_list:
                                logger.debug("Process file: %s" % i)
                                if i.find(">") > 0:
                                    base_filename = os.path.basename(i[:i.index(">")].strip())
                                    if environment.has_key("_CONDOR_SCRATCH_DIR"):
                                        source_filename = os.path.join(environment["_CONDOR_SCRATCH_DIR"], base_filename)
                                    else:
                                        source_filename = os.path.join(self.work_dir, base_filename)
                                    target_filename = os.path.join(workingdirectory, base_filename)
                                    try:
                                        logger.debug("Copy: %s to %s" % (source_filename, target_filename))
                                        shutil.copyfile(source_filename, target_filename)
                                    except:
                                        logger.error("Error copy: %s to %s" % (source_filename, target_filename))
                except:
                    logger.debug("Moving of stage-in files failed.")

            # create stdout/stderr file descriptors
            output_file = os.path.abspath(output)
            error_file = os.path.abspath(error)
            logger.debug("stdout: " + output_file + " stderr: " + error_file)
            stdout = open(output_file, "w")
            stderr = open(error_file, "w")

            if self.LAUNCH_METHOD == "aprun":
                if (spmdvariation.lower() == "mpi"):
                    command = envi + "aprun -n " + str(numberofprocesses) + " " + executable + " " + arguments
                else:
                    #env_strip = envi.strip()
                    #env_command = env_strip[:(len(env_strip)-1)]
                    command = envi + "aprun -n " + str(self.number_subjobs) + " -d " + numberofprocesses + " " + executable + " " + arguments
                # MPMD Mode => all subjobs on Kraken fail because aprun returns 1 as returncode
                #command = "aprun"
                #for i in range(0, self.number_subjobs):
                #    command = command + " -d " + numberofprocesses + " " + executable + " " + arguments
                #    # + " 1 > "+ str(i)+ "-out.txt " + " 2 > "+ str(i)+ "-err.txt"
                #    if i != self.number_subjobs-1:
                #        command = command + " : "
            elif (spmdvariation.lower() != "mpi"):
                command = envi + executable + " " + arguments
                # In particular for Condor - if executable is staged x flag is not set
                #command = "chmod +x " + executable + ";export PATH=$PATH:" + workingdirectory + ";" + command
            else:
                # Environment variables need to be handled later!
                command = envi + executable + " " + arguments

            # special setup for MPI NAMD jobs
            machinefile = self.allocate_nodes(job_dict)
            host = "localhost"
            try:
                machine_file_handler = open(machinefile, "r")
                node = machine_file_handler.readlines()
                machine_file_handler.close()
                host = node[0].strip()
            except:
                pass

            if (machinefile == None):
                logger.debug("Not enough resources to run: " + job_url)
                self.coordination.queue_job(self.base_url, job_url)
                return # job cannot be run at the moment

            # build execution command
            if self.LAUNCH_METHOD == "aprun":
                command = "cd " + workingdirectory + "; " + command
            elif self.LAUNCH_METHOD == "local":
                command = "cd " + workingdirectory + "; " + command
            else: # ssh launch is default
                if (spmdvariation.lower() == "mpi"):
                    command = "cd " + workingdirectory + "; " + envi + self.MPIRUN + " -np " + numberofprocesses + " -machinefile " + machinefile + " " + command
                elif host == "localhost":
                    command = "cd " + workingdirectory + "; " + command
                else:
                    command = "ssh " + host + " \'cd " + workingdirectory + "; " + command + "\'"

            # start application process
            shell = self.SHELL
            logger.debug("execute: " + command + " in " + workingdirectory + " from: "
                         + str(socket.gethostname()) + " (Shell: " + shell + ")")
            # bash works fine for launching on QB but fails for Abe :-(
            p = subprocess.Popen(args=command, executable=shell, stderr=stderr,
                                 stdout=stdout, cwd=workingdirectory,
                                 env=environment, shell=True)
            logger.debug("started " + command)
            self.processes[job_url] = p
            self.coordination.set_job_state(job_url, str(bigjob.state.Running))
        except:
            traceback.print_exc(file=sys.stderr)
# create empty data unit for output data
output_data_unit_description = {"file_urls": []}
output_data_unit = pd.submit_data_unit(output_data_unit_description)
output_data_unit.wait()

# create compute unit
compute_unit_description = {
    "executable": "/bin/cat",
    "arguments": ["test.txt"],
    "number_of_processes": 1,
    "output": "stdout.txt",
    "error": "stderr.txt",
    "input_data": [input_data_unit.get_url()],
    # Put files stdout.txt and stderr.txt into output data unit
    "output_data": [{output_data_unit.get_url(): ["std*"]}]
}
compute_unit = compute_data_service.submit_compute_unit(compute_unit_description)
logger.info("Finished setup of ComputeDataService. Waiting for scheduling of PD")

compute_data_service.wait()

logger.debug("Output Data Unit: " + str(output_data_unit.list()))

logger.info("Terminate Pilot Compute/Data Service")
compute_data_service.cancel()
pilot_data_service.cancel()
pilot_compute_service.cancel()
def print_machine_file(self, filename):
    fh = open(filename, "r")
    lines = fh.readlines()
    fh.close()
    logger.debug("Machinefile: " + filename + " Hosts: " + str(lines))
def test_du_reconnect():
    du_url = "redis://localhost/bigdata:du-1d1b7078-229f-11e2-834e-705681b3df0f"
    du = DataUnit(du_url=du_url)
    logger.debug(str(du.list()))
    du.export("/tmp/export-test")
def put_progress(self, transfered_bytes, total_bytes):
    logger.debug("Bytes transferred %d/%d" % (transfered_bytes, total_bytes))
def __print_traceback(self):
    exc_type, exc_value, exc_traceback = sys.exc_info()
    logger.debug("*** print_exception:", exc_info=(exc_type, exc_value, exc_traceback))
def __escape_ssh(self, bootstrap_script):
    logger.debug("Escape SSH")
    bootstrap_script = bootstrap_script.replace("\"", "\\\"")
    bootstrap_script = bootstrap_script.replace("\'", "\\\"")
    bootstrap_script = "\"" + bootstrap_script + "\""
    return bootstrap_script
def get_pd(cls, pd_url):
    logger.debug("GET PD: " + pd_url)
    pd_dict = cls.__retrieve_entry(pd_url + RedisCoordinationAdaptor.SEPARATOR + "info")
    return pd_dict
def __initialize_pilot_data(self, service_url):
    # initialize file adaptor
    # Pilot Data API for File Management
    if service_url.startswith("ssh:"):
        logger.debug("Use SSH backend for PilotData")
        try:
            from pilot.filemanagement.ssh_adaptor import SSHFileAdaptor
            self.__filemanager = SSHFileAdaptor(service_url)
        except:
            logger.debug("SSH package not found.")
            self.__print_traceback()
    elif service_url.startswith("http:"):
        logger.debug("Use WebHDFS backend")
        try:
            from pilot.filemanagement.webhdfs_adaptor import WebHDFSFileAdaptor
            self.__filemanager = WebHDFSFileAdaptor(service_url)
        except:
            logger.debug("WebHDFS package not found.")
    elif service_url.startswith("go:"):
        logger.debug("Use Globus Online backend")
        try:
            from pilot.filemanagement.globusonline_adaptor import GlobusOnlineFileAdaptor
            self.__filemanager = GlobusOnlineFileAdaptor(service_url)
        except:
            logger.debug("Globus Online package not found.")
            self.__print_traceback()
def export_du(self, du, target_url):
    """ Export Data Unit to a local directory """
    if target_url.startswith("/") and os.path.exists(target_url) == False:
        os.mkdir(target_url)
    logger.debug("Export Data-Unit to %s" % target_url)
    self.__filemanager.get_du(du, target_url)
def __escape_rsl(self, bootstrap_script):
    logger.debug("Escape RSL")
    bootstrap_script = bootstrap_script.replace("\"", "\"\"")
    return bootstrap_script
def cancel(self):
    """ duck typing for cancel of saga.cpr.job and saga.job.job """
    logger.debug("Cancel Pilot Job")
    try:
        self.job.cancel()
    except:
        pass
        #traceback.print_stack()

    logger.debug("Cancel Job Service")
    try:
        if not self._ocache.rem_obj(self.js):
            logger.debug("Cancel Job Service")
            del (self.js)
        else:
            logger.debug("Cancel Job Service done")
        self.js = None
    except:
        pass
        #traceback.print_stack()

    try:
        self._stop_pilot_job()
        logger.debug("delete pilot job: " + str(self.pilot_url))
        if _CLEANUP:
            self.coordination.delete_pilot(self.pilot_url)
        #os.remove(os.path.join("/tmp", "bootstrap-"+str(self.uuid)))
    except:
        pass
        #traceback.print_stack()
    logger.debug("Cancel Pilot Job finished")
def _get_subjob_state(self, job_url):
    logger.debug("Get subjob state: " + str(job_url))
    return self.coordination.get_job_state(job_url)
def __escape_pbs(self, bootstrap_script):
    logger.debug("Escape bootstrap script")
    bootstrap_script = "\'" + bootstrap_script + "\'"
    return bootstrap_script
Encapsulates coordination and communication specifics of bigjob
'''

import threading
import datetime
import time
import sys
import os
import pickle
import pdb
import saga
import json
import urlparse
import logging

from bigjob import logger

logger.debug("Load Advert Coordination")

if sys.version_info < (2, 5):
    sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../ext/uuid-1.30/")
    sys.stderr.write("Warning: Using unsupported Python version\n")

logging.debug(str(sys.path))
import uuid

APPLICATION_NAME = "BigJob/BigJob"
ADVERT_URL_SCHEME = "advert://"
ADVERT_SERVER = "advert.cct.lsu.edu"
ADVERT_SERVER_PORT = 8080
def get_pd(cls, pds_url):
    logger.debug("GET PD: " + pds_url)
    pd_dict = {}
    return pd_dict
def get_du(self, du, target_url):
    #du_id = "du-7370d7b5-ed0b-11e1-95df-705681b3df0f"
    start = time.time()
    du_id = du.id
    logger.debug("Get DU: " + str(du_id))
    if self.is_local:
        command = "cp -r %s %s" % (os.path.join(self.localpath, du_id), target_url)
        source_path = os.path.join(self.localpath, du_id, "*")
        target_path = target_url
        logger.debug("Target and source host are localhost. Processing: %s" % (source_path))
        expanded_path = glob.glob(source_path)
        logger.debug("Expanded path: " + str(expanded_path))
        for path in expanded_path:
            if os.path.isdir(path):
                logger.debug("Source path %s is directory" % path)
                files = os.listdir(path)
                for i in files:
                    try:
                        # link each file in the directory into the target directory
                        os.symlink(os.path.join(path, i), os.path.join(target_path, i))
                        os.chmod(os.path.join(target_path, i), 0777)
                    except:
                        self.__print_traceback()
            else:
                try:
                    os.symlink(path, os.path.join(target_path, os.path.basename(path)))
                    os.chmod(os.path.join(target_path, os.path.basename(path)), 0777)
                except:
                    self.__print_traceback()
    else:
        command = "iget -f -r %s %s" % (du_id, target_url)
        logger.debug(command)
        self.__run_command(command)

        full_path = os.path.join(target_url, du_id)
        #logger.debug("Path: " + str(full_path) + " Exists: " + str(os.path.exists(full_path)))
        #while os.path.exists(full_path)==False:
        #    time.sleep(1)
        for i in os.listdir(full_path):
            try:
                logger.debug("chmod " + str(i))
                os.chmod(os.path.join(full_path, i), 0777)
                logger.debug("move " + str(i))
                shutil.move(os.path.join(full_path, i), target_url)
            except:
                self.__print_traceback()
        shutil.rmtree(full_path, ignore_errors=True)
        #time.sleep(2)
        #if target_url==".":
        #    target_url = os.getcwd()
        #command = "mv %s/* %s"%(os.path.join(target_url, du_id), target_url)
        #self.__run_command(command)

    logger.debug("Finished Get DU " + du.id + " in: " + str(time.time() - start) + " sec.")
def start_pilot_job(self, lrms_url, number_nodes=1, queue=None, project=None,
                    working_directory=None, userproxy=None, walltime=None,
                    processes_per_node=1, filetransfers=None, spmd_variation=None,
                    external_queue="", pilot_compute_description=None):
    """ Start a batch job (using SAGA Job API) at resource manager.
        Currently, the following resource managers are supported:
        fork://localhost/ (Default Job Adaptor)
        gram://qb1.loni.org/jobmanager-pbs (Globus Adaptor)
        pbspro://localhost (PBS Pro Adaptor)
    """
    if self.job != None:
        raise BigJobError("One BigJob already active. Please stop BigJob first.")
        return

    ##############################################################################
    # initialization of coordination and communication subsystem
    # Communication & Coordination initialization
    lrms_saga_url = SAGAUrl(lrms_url)
    self.url = lrms_saga_url
    self.pilot_url = self.app_url + ":" + lrms_saga_url.host
    self.number_nodes = int(number_nodes) * int(processes_per_node)

    # Store references to BJ in global dict
    _pilot_url_dict[self.pilot_url] = self
    _pilot_url_dict[external_queue] = self

    logger.debug("create pilot job entry on backend server: " + self.pilot_url)
    self.coordination.set_pilot_state(self.pilot_url, str(Unknown), False)
    if pilot_compute_description == None:
        pilot_compute_description = {
            "service_url": lrms_url,
            "number_of_processes": number_nodes,
            "processes_per_node": processes_per_node,
            "working_directory": working_directory
        }
    self.coordination.set_pilot_description(self.pilot_url, pilot_compute_description)
    logger.debug("set pilot state to: " + str(Unknown))

    # Create Job Service (Default: SAGA Job Service, alternative Job Services supported)
    self.js = None
    if lrms_saga_url.scheme == "gce+ssh":
        self.js = GCEService(lrms_saga_url, pilot_compute_description)
    elif lrms_saga_url.scheme=="ec2+ssh" or lrms_saga_url.scheme=="euca+ssh" \
        or lrms_saga_url.scheme=="nova+ssh":
        self.js = EC2Service(lrms_saga_url, pilot_compute_description)
    #elif lrms_saga_url.scheme=="slurm+ssh":
    #    self.js = SlurmService(lrms_saga_url, pilot_compute_description)
    else:
        self.js = self._ocache.get_obj(lrms_saga_url, lambda: SAGAJobService(lrms_saga_url))

    ##############################################################################
    # create job description
    jd = SAGAJobDescription()

    # Attempt to create working directory (e.g. in local scenario)
    if working_directory != None and working_directory != "":
        if not os.path.isdir(working_directory) \
            and (lrms_saga_url.scheme.startswith("fork") or lrms_saga_url.scheme.startswith("condor")) \
            and working_directory.startswith("go:")==False:
            os.mkdir(working_directory)
        self.working_directory = working_directory
    else:
        # if no working dir is set assume use of the home directory
        # will fail if home directory is not the same on remote machine
        # but this is just a guess to avoid failing
        self.working_directory = "~"
        #self.working_directory = ""

    if queue != None:
        jd.queue = queue
    if spmd_variation != None:
        jd.spmd_variation = spmd_variation
    if project != None:
        jd.project = project
    if walltime != None:
        logger.debug("setting walltime to: " + str(walltime))
        jd.wall_time_limit = int(walltime)

    ##############################################################################
    # File Management and Stage-In
    # Determine whether target machine uses gsissh or ssh to logon.
    # logger.debug("Detect launch method for: " + lrms_saga_url.host)
    # self.launch_method = self.__get_launch_method(lrms_saga_url.host, lrms_saga_url.username)
    self.bigjob_working_directory_url = ""
    if lrms_saga_url.scheme.startswith("gce") or lrms_saga_url.scheme.startswith("ec2") \
        or lrms_saga_url.scheme.startswith("euca") or lrms_saga_url.scheme.startswith("nova"):
        logger.debug("File Staging for Cloud Instances currently not supported.")
    elif lrms_saga_url.scheme.startswith("condor") == True:
        logger.debug("Using Condor file staging")
    else:
        # build target url for working directory
        # this will also create the remote directory for the BJ
        # Fallback if working directory is not a valid URL
        if not (self.working_directory.startswith("go:") or self.working_directory.startswith("ssh://")):
            if lrms_saga_url.username != None and lrms_saga_url.username != "":
                self.bigjob_working_directory_url = "ssh://" + lrms_saga_url.username + "@" \
                    + lrms_saga_url.host + "/" + self.__get_bigjob_working_dir()
            else:
                self.bigjob_working_directory_url = "ssh://" + lrms_saga_url.host + "/" \
                    + self.__get_bigjob_working_dir()
        elif self.working_directory.startswith("go:"):
            self.bigjob_working_directory_url = os.path.join(self.working_directory, self.uuid)
        else:
            # working directory is a valid file staging URL
            self.bigjob_working_directory_url = self.working_directory

        # initialize file manager that takes care of file movement and directory creation
        if self.__filemanager == None:
            self.__initialize_pilot_data(self.bigjob_working_directory_url) # determines the url

        if self.__filemanager != None and not self.working_directory.startswith("/"):
            self.working_directory = self.__filemanager.get_path(self.bigjob_working_directory_url)

        # determine working directory of bigjob
        # if a remote sandbox can be created via ssh => create an own dir for each bj job id
        # otherwise use specified working directory
        logger.debug("BigJob working directory: %s" % self.bigjob_working_directory_url)
        if self.__filemanager != None \
            and self.__filemanager.create_remote_directory(self.bigjob_working_directory_url) == True:
            self.working_directory = self.__get_bigjob_working_dir()
            self.__stage_files(filetransfers, self.bigjob_working_directory_url)
        else:
            logger.warn("No file staging adaptor found.")

        logger.debug("BJ Working Directory: %s", self.working_directory)

    if lrms_saga_url.scheme.startswith("condor") == False:
        jd.working_directory = self.working_directory
    else:
        jd.working_directory = ""

    ##############################################################################
    # Create and process BJ bootstrap script
    bootstrap_script = self.__generate_bootstrap_script(
        self.coordination.get_address(),
        self.pilot_url,   # Queue 1 used by this BJ object
        external_queue    # Queue 2 used by Pilot Compute Service
                          # or another external scheduler
    )
    logger.debug("Adaptor specific modifications: " + str(lrms_saga_url.scheme))
    bootstrap_script = self.__escape_pbs(bootstrap_script)
    #bootstrap_script = self.__escape_ssh(bootstrap_script)
    logger.debug(bootstrap_script)

    # Define Agent Executable in Job description
    # in Condor case bootstrap script is staged
    # (Python app cannot be passed inline in Condor job description)
    if lrms_saga_url.scheme.startswith("condor") == True:
        bootstrap_script = self.__generate_bootstrap_script_from_binary(
            self.coordination.get_address(),
            self.pilot_url,   # Queue 1 used by this BJ object
            external_queue    # Queue 2 used by Pilot Compute Service
                              # or another external scheduler
        )

        condor_bootstrap_filename = os.path.join("/tmp", "bootstrap-" + str(self.uuid))
        condor_bootstrap_file = open(condor_bootstrap_filename, "w")
        condor_bootstrap_file.write(bootstrap_script)
        condor_bootstrap_file.close()
        logger.debug("Using Condor - bootstrap file: " + condor_bootstrap_filename)

        jd.executable = "/usr/bin/env"
        jd.arguments = ["python", os.path.basename(condor_bootstrap_filename)]
        if pilot_compute_description.has_key("candidate_hosts"):
            jd.candidate_hosts = pilot_compute_description["candidate_hosts"]
        bj_file_transfers = []
        file_transfer_spec = condor_bootstrap_filename + " > " + os.path.basename(condor_bootstrap_filename)
        bj_file_transfers.append(file_transfer_spec)
        output_file_name = "output-" + str(self.uuid) + ".tar.gz"
        #output_file_transfer_spec = os.path.join(self.working_directory, output_file_name) + " < " + output_file_name
        output_file_transfer_spec = output_file_name + " < " + output_file_name
        #output_file_transfer_spec = os.path.join(self.working_directory, "output.tar.gz") + " < output.tar.gz"
        #logger.debug("Output transfer: " + output_file_transfer_spec)
        #bj_file_transfers.append(output_file_transfer_spec)
        if filetransfers != None:
            for t in filetransfers:
                bj_file_transfers.append(t)
        logger.debug("Condor file transfers: " + str(bj_file_transfers))
        jd.file_transfer = bj_file_transfers
    else:
        jd.total_cpu_count = int(number_nodes)
        jd.spmd_variation = "single"
        if pilot_compute_description != None and pilot_compute_description.has_key("spmd_variation"):
            jd.spmd_variation = pilot_compute_description["spmd_variation"]
        jd.arguments = ["python", "-c", bootstrap_script]
        jd.executable = "/usr/bin/env"

    logger.debug("Working directory: " + jd.working_directory + " Job Description: " + str(jd))

    jd.output = os.path.join(self.working_directory, "stdout-" + self.uuid + "-agent.txt")
    jd.error = os.path.join(self.working_directory, "stderr-" + self.uuid + "-agent.txt")

    ##############################################################################
    # Create and submit pilot job to job service
    logger.debug("Creating pilot job with description: %s" % str(jd))
    self.job = self.js.create_job(jd)
    logger.debug("Trying to submit pilot job to: " + str(lrms_saga_url))
    self.job.run()

    if self.job.get_state() == saga.job.FAILED:
        logger.debug("SUBMISSION FAILED. Exiting... ")
        sys.exit(-1)
    else:
        logger.debug("Submission succeeded. Job ID: %s " % self.job.id)

    return self.pilot_url
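A minimal sketch of starting a local pilot with this method, assuming a Redis coordination backend at redis://localhost and that the bigjob class accepts the coordination URL as its first constructor argument (the constructor signature is an assumption; the keyword arguments and the fork://localhost/ adaptor come from the signature and docstring above):

# Sketch only: submit a pilot to the local machine and wait for its sub-jobs.
from bigjob import bigjob

bj = bigjob("redis://localhost")                    # coordination URL (assumed constructor argument)
pilot_url = bj.start_pilot_job(lrms_url="fork://localhost/",  # Default Job Adaptor (see docstring)
                               number_nodes=1,
                               processes_per_node=1,
                               working_directory="/tmp/bigjob")
print "Pilot URL: " + pilot_url
bj.wait()      # blocks until all sub-jobs reach Done/Failed (see wait() above)
bj.cancel()    # shuts down the pilot and cleans up the backend entry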
def get_cds_url(cls, application_url, cds_id):
    cds_url = application_url + RedisCoordinationAdaptor.SEPARATOR + cds_id
    logger.debug("CDS URL: %s" % (cds_url))
    return cds_url
def add_pds(cls, application_url, pds):
    pds_url_no_dbtype = cls.get_pds_url(application_url, pds.id)
    pds_url = cls.__get_url(pds_url_no_dbtype)
    logger.debug("Create PDS directory at %s" % pds_url)
    return pds_url_no_dbtype
def get_pds_url(cls, application_url, pds_id):
    pds_url = application_url + RedisCoordinationAdaptor.SEPARATOR + pds_id
    logger.debug("PDS URL: %s" % (pds_url))
    return pds_url
def get_base_url(cls, application_id):
    surl = SAGAUrl(cls.BASE_URL)
    base_url = surl.scheme + "://" + surl.host + "/" + application_id + "/"
    logger.debug(base_url)
    return base_url