Example #1
 def monitor_jobs(self):
     """Monitor running processes. """   
     #pdb.set_trace()
     logger.debug("Monitor jobs - # current jobs: %d"%len(self.jobs))
     for i in self.jobs:
         if self.processes.has_key(i): # only if job has already been started
             p = self.processes[i]
             p_state = p.poll()
             logger.debug(self.print_job(i) + " state: " + str(p_state) + " return code: " + str(p.returncode))
             if (p_state != None and (p_state==0 or p_state==255)):
                 logger.debug("Job successful: " + self.print_job(i))
                 self.coordination.set_job_state(i, str(bigjob.state.Done))
                 #i.set_attribute("state", str(saga.job.Done))
                 self.free_nodes(i)
                 del self.processes[i]
             elif p_state!=0 and p_state!=255 and p_state != None:
                 logger.debug(self.print_job(i) + " failed.  ")
                 # do not free nodes => very likely the job will fail on these nodes
                 # self.free_nodes(i)                    
                 #if self.restarted.has_key(i)==False:
                 #    logger.debug("Try to restart job " + self.print_job(i))
                 #    self.restarted[i]=True
                 #    self.execute_job(i)                        
                 #else:
                 logger.debug("Job failed " + self.print_job(i))                    
                 self.coordination.set_job_state(i, str(bigjob.state.Failed))
                 self.free_nodes(i)
                 del self.processes[i]
Example #2
 def __get_launch_method(self, requested_method):
     """ returns desired execution method: ssh, aprun """
     
     aprun_available = False
     try:
         aprun_available = (subprocess.call("aprun -n 1 /bin/date", shell=True)==0)
     except:
         pass
     
     ssh_available = False
     try:
         ssh_available = (subprocess.call("ssh localhost /bin/date", shell=True)==0)
     except:
         pass
     
     launch_method = "ssh"
     if requested_method=="aprun" and aprun_available == True:
         launch_method="aprun"
     elif requested_method=="ssh" and ssh_available == True:
         launch_method="ssh"
     # aprun fallback
     elif ssh_available==False and aprun_available==True:
         launch_method="aprun"
     logger.debug("aprun: " + str(aprun_available) + " ssh: " + str(ssh_available) 
                  + " Launch method: " + str(launch_method))
     return launch_method
Example #3
    def __init__(self, pilot_compute_service=None, 
                       bigjob_object=None, 
                       pilot_compute_description=None,
                       pilot_url=None): # for reconnecting
        
        """ Create/reconnect to a Pilot Compute.  

            Keyword arguments:
            pilot_url   -- restore from cp_id

            The implementation will attempt to reconnect to the PC instance
            referenced by the pilot_url.  

        """        

        self.__subjobs = []
        self.__pilot_compute_service = None
        if pilot_url==None:
            logger.debug("Create PilotCompute for BigJob: " + str(bigjob_object))
            self.pilot_compute_description=pilot_compute_description
            self.__pilot_compute_service=pilot_compute_service
            self.__bigjob = bigjob_object        
        else:
            logger.debug("Reconnect to an existing Pilot Compute")
            self.__bigjob = bigjob(pilot_url=pilot_url)
        
        # Store the URL of pilot compute service for later reference
        # This URL is used as central queue for a set of BJs in the
        # ComputeDataServiceDecentral
        if self.__pilot_compute_service!=None:
            self.coordination_queue = pilot_compute_service.coordination_queue
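A minimal reconnection sketch based on the constructor above; the import path and the pilot URL are illustrative assumptions, not values from the source:

# Reconnect to an existing Pilot Compute purely from its URL; this exercises
# the else-branch of the constructor above (import path and URL are assumed).
from pilot.impl.pilotcompute_manager import PilotCompute

pilot_url = "redis://localhost/bigjob:bj-f3a1c6de-1234"   # hypothetical pilot URL
pc = PilotCompute(pilot_url=pilot_url)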
Example #4
    def __initialize_pilot_data(self):

        if self.pilot_data_description != None:
            self.service_url = self.pilot_data_description["service_url"]
            self.size = self.pilot_data_description["size"]

            # initialize file adaptor
            if self.service_url.startswith("ssh:"):
                logger.debug("Use SSH backend")
                self.__filemanager = SSHFileAdaptor(self.service_url)
            elif self.service_url.startswith("http:"):
                logger.debug("Use WebHDFS backend")
                self.__filemanager = WebHDFSFileAdaptor(self.service_url)
            elif self.service_url.startswith("go:"):
                logger.debug("Use Globus Online backend")
                self.__filemanager = GlobusOnlineFileAdaptor(self.service_url)
            elif self.service_url.startswith("gs:"):
                logger.debug("Use Google Cloud Storage backend")
                self.__filemanager = GSFileAdaptor(self.service_url, self.security_context)
            elif (
                self.service_url.startswith("s3:")
                or self.service_url.startswith("walrus:")
                or self.service_url.startswith("swift:")
            ):
                logger.debug("Use Amazon S3/Eucalyptus Walrus/SWIFT Storage backend")
                self.__filemanager = S3FileAdaptor(self.service_url, self.security_context, self.pilot_data_description)
            else:
                raise PilotError("No File Plugin found.")

            self.__filemanager.initialize_pilotdata()
            self.__filemanager.get_pilotdata_size()

            # Update security context
            self.security_context = self.__filemanager.get_security_context()
Example #5
 def run(self):
     jd = saga.job.description()
     jd.arguments = ["-c", self.bootstrap_script]
     jd.executable = "python"
     jd.working_directory =  self.working_directory
     jd.set_attribute("Interactive", "True")
     # Submit job
     js = None
     if self.userproxy != None and self.userproxy != '':
         s = saga.session()
         os.environ["X509_USER_PROXY"]=self.userproxy
         ctx = saga.context("x509")
         ctx.set_attribute ("UserProxy", self.userproxy)
         s.add_context(ctx)
         print "use proxy: " + self.userproxy
         js = saga.job.service(s, self.lrms_saga_url)
     else:
         print "use standard proxy"
         js = saga.job.service(self.lrms_saga_url)
     pbssshjob = js.create_job(jd)
     print "Submit pilot job to: " + str(self.lrms_saga_url)
     pbssshjob.run()
     pbssshjob.wait()
     outstr = pbssshjob.get_stdout().read()
     errstr = pbssshjob.get_stderr().read()
     self.job_id=(outstr).split(".")[0]
     logger.debug("PBS JobID: " + str(self.job_id))
     if self.job_id==None or self.job_id=="":
         raise Exception("BigJob submission via pbs-ssh:// failed: %s %s" % (outstr,errstr))
Example #6
 def to_dict(self):
     pd_dict = {}
     pd_dict["id"]=self.id
     pd_dict["url"]=self.url
     pd_dict["pilot_data_description"]=self.pilot_data_description
     logger.debug("PS Dictionary: " + str(pd_dict))
     return pd_dict
Example #7
 def __create_remote_directory(self, target_url):
     #result = urlparse.urlparse(target_url)
     #target_host = result.netloc
     #target_path = result.path
     
     # Python 2.6 compatible URL parsing
     scheme = target_url[:target_url.find("://")+3]
     target_host = target_url[len(scheme):target_url.find("/", len(scheme))]
     target_path = target_url[len(scheme)+len(target_host):]    
     target_user = None
     if target_host.find("@")>1:
         comp = target_host.split("@")
         target_host =comp[1]
         target_user =comp[0]
     logger.debug("Create remote directory; scheme: %s, host: %s, path: %s"%(scheme, target_host, target_path))
     if scheme.startswith("fork") or target_host.startswith("localhost"):
         os.makedirs(target_path)
         return True
     else:
         try:
             client = self.__get_ssh_client(target_host, target_user)
             sftp = client.open_sftp()            
             sftp.mkdir(target_path)
             sftp.close()
             client.close()
             return True
         except:
             self.__print_traceback()	
             logger.warn("Error creating directory: " + str(target_path) 
                          + " at: " + str(target_host) + " SSH password-less login activated?" )
             return False
Example #8
 def cancel(self, pilot_url=None):
     logger.debug("delete job: " + self.job_url)
     if self.pilot_url==None:
         self.pilot_url = pilot_url
         self.bj=pilot_url_dict[pilot_url]  
     if str(self.bj.get_state())=="Running":
         self.bj.delete_subjob(self.job_url)        
Example #9
 def create_data_unit_from_dict(cls, du_dict):
     du = DataUnitItem()
     logger.debug("Restore DU: " + str(du_dict))
     for i in du_dict.keys():
         logger.debug("Set attribute: %s", i)
         du.__setattr__(i, du_dict[i])
     return du
Example #10
 def create_pilot_data_from_dict(cls, pd_dict):
     pd = PilotData()
     for i in pd_dict.keys():
         pd.__setattr__(i, pd_dict[i])
     pd.initialize_pilot_data()
     logger.debug("created pd " + str(pd))
     return pd
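Example #6 (to_dict) and Example #10 (create_pilot_data_from_dict) together give a dictionary round trip for PilotData. A small sketch, assuming create_pilot_data_from_dict is exposed as a classmethod and that pd is an already initialized PilotData instance (both assumptions):

# Serialize a PilotData object to a plain dict and restore it again.
pd_dict = pd.to_dict()          # {"id": ..., "url": ..., "pilot_data_description": ...}
restored = PilotData.create_pilot_data_from_dict(pd_dict)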
Example #11
 def create_du(self, du_id):
     logger.debug("create iRods collection: " + du_id)
     if self.is_local:
         command = "mkdir %s"%(os.path.join(self.localpath, du_id))
     else:
         command = "imkdir %s"%(du_id)
     self.__run_command(command)
Example #12
 def set_pilot_state(self, pilot_url, new_state, stopped=False):
     pilot_url = self.get_url(pilot_url)
     logger.debug("create advert entry: " + pilot_url)
     pilot_dir = saga.advert.directory(saga.url(pilot_url), saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite)
     logger.debug("update state of pilot job to: " + str(new_state) + " Stopped: " + str(stopped))
     pilot_dir.set_attribute("state", str(new_state)) 
     pilot_dir.set_attribute("stopped", str(stopped))
Example #13
 def put_du(self, du):
     logging.debug("Copy DU using Globus Online")
     for i in du.list_data_unit_items():     
         remote_path = os.path.join(self.path, str(du.id), os.path.basename(i.local_url))
         logging.debug("Put file: %s to %s"%(i.local_url, remote_path))                        
         if i.local_url.startswith("ssh://"):
             # check if remote path is directory
             if self.__is_remote_directory(i.local_url):
                 logging.warning("Path %s is a directory. Ignored."%i.local_url)                
                 continue
             
            
             #self.__third_party_transfer(i.local_url, remote_path)                
         else:
             if stat.S_ISDIR(os.stat(i.local_url).st_mode):
                 logging.warning("Path %s is a directory. Ignored."%i.local_url)                
                 continue         
         result = urlparse.urlparse(i.local_url)
         source_host = result.netloc
         source_path = result.path
         logger.debug(str((source_host, source_path, self.host, remote_path)))
         if source_host == "" or source_host==None:
             cmd = "scp "+ source_path + " " + self.host + ":" + remote_path
         else:
             cmd = "scp "+ source_host+":"+source_path + " " + self.host + ":" + remote_path
         logger.debug("Command: %s"%cmd)
         os.system(cmd)                   
Example #14
 def create_pilot_data_from_dict(cls, ps_dict):
     ps = PilotData()
     for i in ps_dict.keys():
         ps.__setattr__(i, ps_dict[i])
     ps.initialize_pilot_data()
     logger.debug("created ps " + str(ps))
     return ps
Example #15
 def copy_du_to_url(self, du,  local_url, remote_url):
     base_dir = self.__get_path_for_du(du)
     logger.debug("copy_du_to_url, source: %s remote: %s"%(base_dir, remote_url))
     if remote_url.startswith("/") and os.path.exists(base_dir):
         target_path = remote_url
         source_path = base_dir
         logger.debug("Target and source host are localhost. Processing: %s" %(source_path))
         expanded_path = glob.glob(source_path + "/*")
         logger.debug("Expanded path: " + str(expanded_path))
         for path in expanded_path:
             if os.path.isdir(path):
                 logger.debug("Source path %s is directory"%path)
                 files = os.listdir(path)
                 for i in files:
                     try:
                         os.symlink(os.path.join(path, i), os.path.join(target_path, i))
                     except:
                         self.__print_traceback()
             else:
                 try:
                     os.symlink(path, os.path.join(target_path, os.path.basename(path)))
                 except:
                     self.__print_traceback()
     else:
         self.create_remote_directory(remote_url)  
         for filename in self.__sftp.listdir(base_dir):
             file_url = local_url + "/" + filename
             file_remote_url = remote_url + "/" + filename
             logger.debug("Copy " + file_url + " to " + file_remote_url)
             self.__third_party_transfer_host(file_url, file_remote_url)
Example #16
 def wait(self):
     """ Waits for completion of all sub-jobs """        
     while 1:
         if self.get_state()=="Done" or self.get_state()=="Failed":
             logger.debug("BigJob terminated. Exit Wait")
             break
         
         jobs = self.coordination.get_jobs_of_pilot(self.pilot_url)
         finish_counter=0
         result_map = {}
         for i in jobs:
             # parse job id out of sj url
             surl = SAGAUrl(i)
             sj_id = surl.path
             if sj_id.startswith("/"): sj_id = sj_id[1:]
             state = str(self.coordination.get_job_state(sj_id))
             #logger.debug("SJ: %s : State: %s"%(sj_id, str(state)))   
             #state = job_detail["state"]                
             if result_map.has_key(state)==False:
                 result_map[state]=1
             else:
                 result_map[state] = result_map[state]+1
             if self.__has_finished(state)==True:
                 finish_counter = finish_counter + 1                   
         logger.debug("Total Jobs: %s States: %s"%(len(jobs), str(result_map)))
         if finish_counter == len(jobs):
             break
         time.sleep(2)
Example #17
 def __run_command(self, command):
     logger.debug(command)
     child = pexpect.spawn(command, timeout=None)
     output = child.readlines()
     logger.debug("Run %s Output: %s"%(command, str(output)))
     child.close()
     return output 
Example #18
 def schedule_pilot_data(self, data_unit_description=None):
     logger.debug("Schedule to PD - # Avail pilots: %d"%len(self.pilot_data))     
     candidate_pilot_data = []  
     if data_unit_description.has_key("affinity_datacenter_label") and data_unit_description.has_key("affinity_machine_label"):
         for i in self.pilot_data: 
             pilot_data_description = i.pilot_data_description
             if pilot_data_description.has_key("affinity_machine_label") and pilot_data_description.has_key("affinity_datacenter_label"):
                 if data_unit_description["affinity_datacenter_label"] == pilot_data_description["affinity_datacenter_label"]\
                 and data_unit_description["affinity_machine_label"] == pilot_data_description["affinity_machine_label"]:
                     candidate_pilot_data.append(i)
     
     if len(candidate_pilot_data) == 0:
         # No PD with requested affinity found
         # move data unit into a "possibly" remote pilot data
         logger.debug("No pilot data w/ affinity found... Looking for alternative pilot.")
         candidate_pilot_data = self.pilot_data
         
     if len(candidate_pilot_data)>0:
         return random.choice(candidate_pilot_data)
     
     return None
     
     #if len(self.pilot_data)!=0:
     #    return random.choice(self.pilot_data)
     return None
Example #19
 def set_job_state(self, job_url, new_state):
     self.resource_lock.acquire()        
     try:
         logger.debug("set job state to: " + str(new_state))
         timestamp =time.time() 
         if new_state=="Unknown":
             #self.redis_client.hset(job_url,"start_time", str(timestamp))
             self.pipe.hset(job_url,"start_time", str(timestamp))
         elif new_state=="Staging":
             self.pipe.hset(job_url,"start_staging_time", str(timestamp))
         elif new_state=="Running":
             self.pipe.hset(job_url,"agent_start_time", str(self.redis_adaptor_start_time))
             self.pipe.hset(job_url,"end_queue_time", str(timestamp))
         elif new_state=="Done":
             self.pipe.hset(job_url, "run_host", socket.gethostname())
             self.pipe.hset(job_url, "end_time", str(timestamp))       
         self.pipe.hset(job_url, "state", str(new_state))
         
         # update last contact time in pilot hash        
         pilot_url = job_url[:job_url.index(":jobs")]
         self.pipe.hset(pilot_url, "last_contact", str(timestamp))
         # execute pipe
         self.pipe.execute()
     except:
         pass
     self.resource_lock.release()
Example #20
 def put_du(self, du):
     """Copy Data Unit to Pilot Data"""
     logger.debug("Put DU: %s to Pilot-Data: %s"%(du.id,self.service_url))
     self.__filemanager.create_du(du.id)
     self.__filemanager.put_du(du)
     self.data_unit_urls.append(du.get_url())
     CoordinationAdaptor.update_pd(self)
Example #21
 def dequeue_new_jobs(self):	    
     """Subscribe to new jobs from Redis. """ 
     job_counter = 0               
     while self.is_stopped(self.base_url)==False:     
         if len(self.freenodes)==0:
             time.sleep(3)
             continue
         logger.debug("Dequeue sub-job from PilotCompute queue: " + self.base_url)       
         job_url=self.coordination.dequeue_job(self.base_url)
         logger.debug("Dequed:%s"%str(job_url))
         if job_url==None:
             if self.cds_queue_url!=None:
                 logger.debug("Dequeue sub-job from ComputeDataServicequeue: " + self.cds_queue_url)       
                 job_url=self.coordination.dequeue_job(self.cds_queue_url)
                 logger.debug("Dequed:%s"%str(job_url))
             if job_url==None:
                 time.sleep(3)
                 continue
         if job_url=="STOP":
             break
         
         job_counter = job_counter + 1            
         if (job_counter % (THREAD_POOL_SIZE))==0: # ensure that threadpool is not too overloaded
             self.threadpool.wait()
         
         request = WorkRequest(self.start_new_job_in_thread, [job_url])
         self.threadpool.putRequest(request)
         #time.sleep(1)
         
     # wait for termination of Worker Threads
     # self.threadpool.wait()   
     logger.debug("Terminating Agent - Dequeue Sub-Jobs Thread")   
Example #22
 def __store_entry(cls, entry_url, content):
     entry_url = cls.__get_url(entry_url)
     redis_client = cls.__get_redis_api_client()
     redis_client.hmset(entry_url, content)
     
     logger.debug("Store Redis entry at: " + entry_url 
                   + " Content: " + str(json.dumps(content)))
Example #23
    def __init__(self, pilot_data=None, data_unit_description=None, du_url=None):
        """
            1.) create a new Data Unit: data_unit_description required
            2.) reconnect to an existing Data Unit: du_url required
            
        """
        if du_url==None:
            self.id = self.DU_ID_PREFIX + str(uuid.uuid1())
            self.data_unit_description = data_unit_description        
            self.pilot_data=[]
            self.state = State.New
            self.data_unit_items=[]
            if self.data_unit_description.has_key("file_urls"):
                self.data_unit_items = DataUnitItem.create_data_unit_list(self, self.data_unit_description["file_urls"]) 

            self.url = None

            # register a data unit as top-level entry in Redis
            application_url = CoordinationAdaptor.get_base_url(application_id)
            self.url = CoordinationAdaptor.add_du(application_url, self)
            CoordinationAdaptor.update_du(self)
            
            # Deprecated
            # old method only allowed the creation of a du if a pd existed
            #if pilot_data!=None:
            #    # Allow data units that are not connected to a resource!
            #    self.url = CoordinationAdaptor.add_du(pilot_data.url, self)
            #    CoordinationAdaptor.update_du(self)
        else:
            self.id = DataUnit._get_du_id(du_url)
            self.url = du_url   
            logger.debug("Restore du: %s"%self.id)         
            self.__restore_state()
            
        self.transfer_threads=[]
Example #24
    def _scheduler_thread(self):
        while True and self.stop.isSet()==False:            
            try:
                #logger.debug("Scheduler Thread: " + str(self.__class__) + " Pilot Data")
                du = self.du_queue.get(True, 1)  
                # check whether this is a real du object  
                if isinstance(du, DataUnit):
                    pd=self._schedule_du(du)                
                    if(pd!=None):                        
                        logger.debug("Initiate Transfer to PD.")
                        du.add_pilot_data(pd)
                        #logger.debug("Transfer to PD finished.")
                        #du._update_state(State.Running) 
                        #logger.debug("Updated State to Running.")
                        self.du_queue.task_done()                   
                    else:
                        self.du_queue.task_done() 
                        self.du_queue.put(du)
            except Queue.Empty:
                pass
               
            if self.du_queue.empty():
                time.sleep(5)        

        logger.debug("Re-Scheduler terminated")
Example #25
    def __init__(self, cds_url=None):
        """ Create a ComputeDataService (Decentral) object.

            @param cds_url: Reconnect to an existing CDS (optional).
        """
        # Pilot Data
        self.data_units={}
        self.pilot_data_services=[]
        
        # Pilot Compute
        self.compute_units={}
        self.pilot_job_services=[]
            
        if cds_url == None:
            self.id=self.CDS_ID_PREFIX + str(uuid.uuid1())
            application_url = CoordinationAdaptor.get_base_url(pilot.application_id)
            self.url = CoordinationAdaptor.add_cds(application_url, self)            
        else:
            self.id = self.__get_cds_id(cds_url)
            self.url = cds_url
           
        # Background Thread for scheduling
        self.scheduler = Scheduler()
        self.du_queue = Queue.Queue()
        
        self.stop=threading.Event()
        self.scheduler_thread=threading.Thread(target=self._scheduler_thread)
        self.scheduler_thread.daemon=True
        self.scheduler_thread.start()
        logger.debug("Created ComputeDataServiceDecentral")
Example #26
 def __update_scheduler_resources(self):
     logger.debug("__update_scheduler_resources")        
     pd = [s for i in self.pilot_data_services for s in i.list_pilots()]
     self.scheduler.set_pilot_data(pd)
     pj = [p for i in self.pilot_job_services for p in i.list_pilots()]
     logger.debug("Pilot-Jobs: " + str(pj))
     self.scheduler.set_pilot_jobs(pj)
Example #27
 def __escape_bliss(self, bootstrap_script):
     logger.debug("Escape Bliss")
     #bootstrap_script = bootstrap_script.replace("\'", "\"")
     #bootstrap_script = "\'" + bootstrap_script+ "\'"
     bootstrap_script = bootstrap_script.replace('"','\\"')
     bootstrap_script = '"' + bootstrap_script+ '"'
     return bootstrap_script
Example #28
    def start_background_thread(self):
        self.stop = False
        logger.debug("##################################### New POLL/MONITOR cycle ##################################")
        while True and self.stop == False:
            logger.debug(
                "Free nodes: "
                + str(len(self.freenodes))
                + " Busy Nodes: "
                + str(len(self.busynodes))
                + " Number of running sub-jobs: "
                + str(len(self.jobs))
            )
            if self.is_stopped(self.base_url) == True:
                logger.debug("Pilot terminated.")
                break
            else:
                logger.debug("Pilot job entry: " + str(self.base_url) + " exists. Pilot job not in state stopped.")
            try:
                # self.poll_jobs()
                self.monitor_jobs()
                time.sleep(5)
                self.failed_polls = 0
            except:
                traceback.print_exc(file=sys.stdout)
                self.failed_polls = self.failed_polls + 1
                if self.failed_polls > 3:  # after 3 failed attempts exit
                    break

        logger.debug("Terminating Agent - Background Thread")
Example #29
    def __get_launch_method(self, requested_method):
        """ returns desired execution method: ssh, aprun """
        
        aprun_available = False
        try:
            aprun_available = (subprocess.call("aprun -n 1 /bin/date", shell=True, stdout=None, stderr=None)==0)
        except:
            self.__print_traceback()

        ibrun_available = self.__ibrun_available()
        
        ssh_available = False
        try:
            ssh_available = (subprocess.call("ssh -o PasswordAuthentication=no -o NumberOfPasswordPrompts=0 localhost /bin/date", shell=True, stdout=None, stderr=None)==0)
        except:
            pass
        
        launch_method = "local"
        if requested_method=="aprun" and aprun_available == True:
            launch_method="aprun"
        elif ibrun_available == True:
            launch_method="ibrun"
        elif requested_method=="ssh" and ssh_available == True:
            launch_method="ssh"
        # aprun fallback
        elif ssh_available==False and aprun_available==True:
            launch_method="aprun"


        logger.debug("aprun: " + str(aprun_available) + " ibrun: " + str(ibrun_available) 
                     + " ssh: " + str(ssh_available) 
                     + " Launch method: " + str(launch_method))
        return launch_method
Example #30
 def set_job_state(self, job_url, new_state):
     self.resource_lock.acquire()     
     job_url = self.get_url(job_url)
     logger.debug("Set state of job: " + str(job_url) + " to: " + str(new_state))
     job_dir = saga.advert.directory(saga.url(job_url), saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite)
     job_dir.set_attribute("state", str(new_state))
     self.resource_lock.release()
Example #31
 def list_pd(cls, pds_url):
     """ return a list of urls to pd managed by the PDS """
     pds_url = cls.__get_url(pds_url)
     logger.debug("List PD at %s" % pds_url)
Example #32
    def __initialize_pilot_data(self):

        if self.pilot_data_description != None:
            self.service_url = self.pilot_data_description["service_url"]
            if self.pilot_data_description.has_key("size"):
                self.size = self.pilot_data_description["size"]

            # initialize file adaptor
            if self.service_url.startswith("ssh:"):
                logger.debug("Use SSH backend")
                self.__filemanager = SSHFileAdaptor(
                    self.service_url, self.security_context,
                    self.pilot_data_description)
            elif self.service_url.startswith("http:"):
                logger.debug("Use WebHDFS backend")
                self.__filemanager = WebHDFSFileAdaptor(self.service_url)
            elif self.service_url.startswith("go:"):
                logger.debug("Use Globus Online backend")
                self.__filemanager = GlobusOnlineFileAdaptor(self.service_url)
            elif self.service_url.startswith("gs:"):
                logger.debug("Use Google Cloud Storage backend")
                self.__filemanager = GSFileAdaptor(self.service_url,
                                                   self.security_context)
            elif self.service_url.startswith("irods:"):
                logger.debug("Use iRods Storage backend")
                self.__filemanager = iRodsFileAdaptor(self.service_url,
                                                      self.security_context)
            elif self.service_url.startswith("s3:") \
                or self.service_url.startswith("walrus:") \
                or self.service_url.startswith("swift:"):
                logger.debug(
                    "Use Amazon S3/Eucalyptus Walrus/SWIFT Storage backend")
                self.__filemanager = S3FileAdaptor(self.service_url,
                                                   self.security_context,
                                                   self.pilot_data_description)
            else:
                raise PilotError("No File Plugin found.")

            self.__filemanager.initialize_pilotdata()
            self.__filemanager.get_pilotdata_size()

            # Update security context
            self.security_context = self.__filemanager.get_security_context()
Example #33
    def __init__(self, args):

        self.coordination_url = args[1]
        # objects to store running jobs and processes
        self.jobs = []
        self.processes = {}
        self.freenodes = []
        self.busynodes = []
        self.restarted = {}

        # read config file
        conf_file = os.path.dirname(
            os.path.abspath(__file__)) + "/../" + CONFIG_FILE
        if not os.path.exists(conf_file):
            conf_file = os.path.join(sys.prefix, CONFIG_FILE)
        logging.debug("read configfile: " + conf_file)
        config = ConfigParser.ConfigParser()
        config.read(conf_file)
        default_dict = config.defaults()
        self.CPR = False
        if default_dict.has_key("cpr"):
            self.CPR = default_dict["cpr"]
        self.SHELL = "/bin/bash"
        if default_dict.has_key("shell"):
            self.SHELL = default_dict["shell"]
        self.MPIRUN = "mpirun"
        if default_dict.has_key("mpirun"):
            self.MPIRUN = default_dict["mpirun"]
        self.OUTPUT_TAR = False
        if default_dict.has_key("create_output_tar"):
            self.OUTPUT_TAR = eval(default_dict["create_output_tar"])
            logger.debug("Create output tar: %r", self.OUTPUT_TAR)

        self.LAUNCH_METHOD = "ssh"
        if default_dict.has_key("launch_method"):
            self.LAUNCH_METHOD = self.__get_launch_method(
                default_dict["launch_method"])

        logging.debug("Launch Method: " + self.LAUNCH_METHOD + " mpi: " +
                      self.MPIRUN + " shell: " + self.SHELL)

        # init rms (SGE/PBS)
        self.init_rms()
        self.failed_polls = 0

        ##############################################################################
        # initialization of coordination and communication subsystem
        # Redis initialization
        self.base_url = args[2]
        self.cds_queue_url = None
        if len(args) == 4:
            self.cds_queue_url = args[3]
        logger.debug("External queue: " + str(self.cds_queue_url))
        self.id = self.__get_bj_id(self.base_url)
        logger.debug("BigJob Agent arguments: " + str(args))
        logger.debug("Initialize C&C subsystem to pilot-url: " + self.base_url)
        logger.debug("BigJob ID: %s" % self.id)

        # create bj directory
        self.work_dir = os.getcwd()
        if self.work_dir.find(
                self.id) == -1:  # working directory does not yet contain BJ id
            self.bj_dir = os.path.join(os.getcwd(), self.id)
            logger.debug("Agent working directory: %s" % self.bj_dir)
            try:
                os.makedirs(self.bj_dir)
            except:
                logger.debug("Directory already exists.")
        else:
            self.bj_dir = os.getcwd()

        os.chdir(self.bj_dir)

        if (self.coordination_url.startswith("advert://")
                or self.coordination_url.startswith("sqlasyncadvert://")):
            try:
                from coordination.bigjob_coordination_advert import bigjob_coordination
                logging.debug("Utilizing ADVERT Backend: " +
                              self.coordination_url)
            except:
                logger.error("Advert Backend could not be loaded")
                exc_type, exc_value, exc_traceback = sys.exc_info()
                traceback.print_exc(file=sys.stderr)
                traceback.print_tb(exc_traceback, file=sys.stderr)
        elif (self.coordination_url.startswith("redis://")):
            try:
                from coordination.bigjob_coordination_redis import bigjob_coordination
                logger.debug(
                    "Utilizing Redis Backend: " + self.coordination_url +
                    ". Please make sure Redis server is configured in bigjob_coordination_redis.py"
                )
            except:
                logger.error("Error loading pyredis.")
        elif (self.coordination_url.startswith("tcp://")):
            try:
                from coordination.bigjob_coordination_zmq import bigjob_coordination
                logger.debug("Utilizing ZMQ Backend")
            except:
                logger.error(
                    "ZMQ Backend not found. Please install ZeroMQ (http://www.zeromq.org/intro:get-the-software) and "
                    + "PYZMQ (http://zeromq.github.com/pyzmq/)")

        ###
        # Initiate coordination sub-system of both BJ agent and Pilot Data
        self.coordination = bigjob_coordination(
            server_connect_url=self.coordination_url)
        self.pilot_data_service = PilotDataService(
            coordination_url=self.coordination_url)

        # update state of pilot job to running
        logger.debug("set state to : " + str(bigjob.state.Running))
        self.coordination.set_pilot_state(self.base_url,
                                          str(bigjob.state.Running), False)
        self.pilot_description = self.coordination.get_pilot_description(
            self.base_url)

        ##############################################################################
        # start background thread for polling new jobs and monitoring current jobs
        self.resource_lock = threading.RLock()
        self.threadpool = ThreadPool(THREAD_POOL_SIZE)

        self.launcher_thread = threading.Thread(target=self.dequeue_new_jobs)
        self.launcher_thread.start()

        self.monitoring_thread = threading.Thread(
            target=self.start_background_thread)
        self.monitoring_thread.start()
Example #34
 def __get_bj_id(self, url):
     logger.debug("parsing ID out of URL: %s" % url)
     start = url.index("bj-")
     end = url.index(":", start)
     bj_id = url[start:end]
     return bj_id
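An illustration of the parsing above with a hypothetical pilot URL (the URL format is an assumption):

# Everything between "bj-" and the following ":" is taken as the BigJob id.
url = "redis://localhost:6379/bigjob:bj-f3a1c6de-1234:agent"   # hypothetical URL
start = url.index("bj-")
end = url.index(":", start)
print url[start:end]   # -> bj-f3a1c6de-1234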
Example #35
 def update_du_state(cls, du, state):
     logger.debug("**** Update data unit STATE at: " + du.url + " to: " +
                  str(state))
     cls.__store_entry_item(
         du.url + RedisCoordinationAdaptor.SEPARATOR + "info", "state",
         state)
Example #36
    def run(self):
        """ Start VM and start BJ agent via SSH on VM """
        
        """ Map fields of Pilot description to EC2 API
            { "vm_id":"ami-d7f742be",
              "vm_ssh_username":"******",
              "vm_ssh_keyname":"MyKey",
              "vm_ssh_keyfile":"<path>",
              "vm_type":"t1.micro",
              "access_key_id":"xxx",
              "secret_access_key":"xxx"
            }
        """    
            
        reservation = self.ec2_conn.run_instances(self.pilot_compute_description["vm_id"],
                                    key_name=self.pilot_compute_description["vm_ssh_keyname"],
                                    instance_type=self.pilot_compute_description["vm_type"],
                                    security_groups=[SECURITY_GROUP])
                
        self.instance = reservation.instances[0]
        self.instance_id = self.instance.id
        logger.debug("Started EC2/Eucalyptus/Nova instance: %s"%self.instance_id)
        time.sleep(5)
        self.wait_for_running()
        
        if self.resource_url.scheme != "euca+ssh" and self.resource_url.scheme != "nova+ssh":
            self.ec2_conn.create_tags([self.instance_id], {"Name": self.id})
      
        
        self.network_ip = self.instance.ip_address 
        url = "ssh://" + str(self.network_ip)
        logger.debug("Connect to: %s"%(url))

        
        # Submit job
        ctx = saga.Context("SSH")
        #ctx.type = saga.Context.SSH
        ctx.user_id = self.pilot_compute_description["vm_ssh_username"]
        ctx.user_key = self.pilot_compute_description["vm_ssh_keyfile"]

        session = saga.Session()
        session.add_context(ctx)
                
        TRIAL_MAX=30
        trials=0
        while trials < TRIAL_MAX:
            try:
                js = saga.job.Service(url, session=session)
                logger.debug("Job Description Type: " + str(type(self.job_description)))
                job = js.create_job(self.job_description)
                logger.debug("Attempt: %d, submit pilot job to: %s "%(trials,str(url)))
                job.run()
                if job.get_state()==saga.job.FAILED:
                    logger.warning("Submission failed.")
                    trials = trials + 1 
                    time.sleep(30)
                    continue
                else:
                    break
            except:
                exc_type, exc_value, exc_traceback = sys.exc_info()
                logger.warning("Submission failed: " + str(exc_value))
                #self.__print_traceback()
                trials = trials + 1 
                time.sleep(30)
                if trials == TRIAL_MAX:
                    raise Exception("Submission of agent failed.") 
                
        logger.debug("Job State : %s" % (job.get_state())) 
Example #37
logging.basicConfig(level=logging.DEBUG)

try:
    import saga
except:
    logging.debug("SAGA not imported. ")

logging.debug(str(sys.path))
from threadpool import *

# BigJob/Pilot framework classes
from bigjob import logger
from pilot.impl.pilotdata_manager import PilotData, DataUnit, PilotDataService

logger.debug("Python Version: " + str(sys.version_info))
if sys.version_info < (2, 5):
    sys.stderr.write("Warning: Using unsupported Python version\n")
if sys.version_info < (2, 4):
    sys.stderr.write("Warning: Using unsupported Python version\n")
if sys.version_info < (2, 3):
    sys.stderr.write("Warning: Python versions <2.3 not supported\n")
    sys.exit(-1)

import subprocess
""" Config parameters (will move to config file in future) """
CONFIG_FILE = "bigjob_agent.conf"
THREAD_POOL_SIZE = 4
APPLICATION_NAME = "bigjob"

Example #38
 def list_du(cls, pd_url):
     """ return a list of urls to du managed by the PDS """
     pd_url = cls.__get_url(pd_url)
     logger.debug("List Data-Units of Pilot-Data at %s" % pd_url)
     dus = cls.__list_keys(pd_url + ":du-*")
     return dus
Example #39
    def execute_job(self, job_url, job_dict):
        """ obtain job attributes from c&c and execute process """
        state = str(job_dict["state"])

        if (state == str(bigjob.state.Unknown)
                or state == str(bigjob.state.New)):
            try:
                #job_dict["state"]=str(saga.job.New)
                job_id = job_dict["job-id"]
                logger.debug("Start job id %s specification %s: " %
                             (job_id, str(job_dict)))
                numberofprocesses = "1"
                try:
                    if (job_dict.has_key("NumberOfProcesses") == True):
                        numberofprocesses = job_dict["NumberOfProcesses"]
                except:
                    pass  # ignore in particular if Bliss is used

                spmdvariation = "single"
                try:
                    if (job_dict.has_key("SPMDVariation") == True):
                        spmdvariation = job_dict["SPMDVariation"]
                except:
                    pass  # ignore in particular if Bliss is used

                arguments = ""
                if (job_dict.has_key("Arguments") == True):
                    arguments_raw = job_dict['Arguments']
                    if type(arguments_raw) == types.ListType:
                        arguments_list = arguments_raw
                    else:
                        arguments_list = eval(job_dict["Arguments"])
                    for i in arguments_list:
                        arguments = arguments + " " + i

                environment = os.environ
                envi = ""
                self.number_subjobs = 1
                if (job_dict.has_key("Environment") == True):
                    env_raw = job_dict['Environment']
                    if type(env_raw) == types.ListType:
                        env_list = env_raw
                    else:
                        env_list = eval(job_dict["Environment"])

                    logger.debug("Environment: " + str(env_list))
                    for i in env_list:
                        logger.debug("Eval " + i)
                        # Hack for conduction experiments on Kraken
                        # Kraken specific support for running n sub-jobs at a time
                        if i.startswith("NUMBER_SUBJOBS"):
                            self.number_subjobs = int(i.split("=")[1].strip())
                            logger.debug("NUMBER_SUBJOBS: " +
                                         str(self.number_subjobs))
                        else:
                            envi_1 = "export " + i + "; "
                            envi = envi + envi_1
                            logger.debug(envi)

                executable = job_dict["Executable"]
                executable = self.__expand_directory(executable)

                workingdirectory = os.path.join(os.getcwd(), job_id)
                if (job_dict.has_key("WorkingDirectory") == True):
                    workingdirectory = job_dict["WorkingDirectory"]
                    workingdirectory = self.__expand_directory(
                        workingdirectory)
                try:
                    os.makedirs(workingdirectory)
                except:
                    logger.debug("Directory %s already exists." %
                                 workingdirectory)
                logging.debug("Sub-Job: %s, Working_directory: %s" %
                              (job_id, workingdirectory))

                output = "stdout"
                if (job_dict.has_key("Output") == True):
                    output = job_dict["Output"]
                if not os.path.isabs(output):
                    output = os.path.join(workingdirectory, output)

                error = os.path.join(workingdirectory, "stderr")
                if (job_dict.has_key("Error") == True):
                    error = job_dict["Error"]
                if not os.path.isabs(error):
                    error = os.path.join(workingdirectory, error)

                # append job to job list
                self.jobs.append(job_url)

                # File Stage-In of dependent data units
                if job_dict.has_key("InputData"):
                    self.__stage_in_data_units(eval(job_dict["InputData"]),
                                               workingdirectory)

                # File Stage-In - Move pilot-level files to working directory of sub-job
                if self.pilot_description != None:
                    try:
                        if self.pilot_description.has_key("description"):
                            file_list = eval(
                                self.pilot_description["description"])
                            if file_list != None and len(file_list) > 0:
                                logger.debug("Copy %d files to SJ work dir" %
                                             len(file_list))
                                for i in file_list:
                                    logger.debug("Process file: %s" % i)
                                    if i.find(">") > 0:
                                        base_filename = os.path.basename(
                                            i[:i.index(">")].strip())
                                        if environment.has_key(
                                                "_CONDOR_SCRATCH_DIR"):
                                            source_filename = os.path.join(
                                                environment[
                                                    "_CONDOR_SCRATCH_DIR"],
                                                base_filename)
                                        else:
                                            source_filename = os.path.join(
                                                self.work_dir, base_filename)
                                        target_filename = os.path.join(
                                            workingdirectory, base_filename)
                                        try:
                                            logger.debug("Copy: %s to %s" %
                                                         (source_filename,
                                                          target_filename))
                                            shutil.copyfile(
                                                source_filename,
                                                target_filename)
                                        except:
                                            logger.error(
                                                "Error copy: %s to %s" %
                                                (source_filename,
                                                 target_filename))
                    except:
                        logger.debug("Moving of stage-in files failed.")

                # create stdout/stderr file descriptors
                output_file = os.path.abspath(output)
                error_file = os.path.abspath(error)
                logger.debug("stdout: " + output_file + " stderr: " +
                             error_file)
                stdout = open(output_file, "w")
                stderr = open(error_file, "w")
                if self.LAUNCH_METHOD == "aprun":
                    if (spmdvariation.lower() == "mpi"):
                        command = envi + "aprun  -n " + str(
                            numberofprocesses
                        ) + " " + executable + " " + arguments
                    else:
                        #env_strip = envi.strip()
                        #env_command = env_strip[:(len(env_strip)-1)]
                        command = envi + "aprun  -n " + str(
                            self.number_subjobs
                        ) + " -d " + numberofprocesses + " " + executable + " " + arguments

                    # MPMD Mode => all subjobs on Kraken fail because aprun returns 1 as returncode
                    #command = "aprun"
                    #for i in range(0, self.number_subjobs):
                    #    command = command +   " -d " + numberofprocesses + " " + executable + " " + arguments
                    #    # + " 1 > "+ str(i)+ "-out.txt " + " 2 > "+ str(i)+ "-err.txt"
                    #    if i != self.number_subjobs-1:
                    #        command = command + " : "
                elif (spmdvariation.lower() != "mpi"):
                    command = envi + executable + " " + arguments
                    # In particular for Condor - if executable is staged x flag is not set
                    #command ="chmod +x " + executable +";export PATH=$PATH:" + workingdirectory + ";" +command
                else:
                    # Environment variables need to be handled later!
                    command = envi + executable + " " + arguments

                # special setup for MPI NAMD jobs
                machinefile = self.allocate_nodes(job_dict)
                host = "localhost"
                try:
                    machine_file_handler = open(machinefile, "r")
                    node = machine_file_handler.readlines()
                    machine_file_handler.close()
                    host = node[0].strip()
                except:
                    pass

                if (machinefile == None):
                    logger.debug("Not enough resources to run: " + job_url)
                    self.coordination.queue_job(self.base_url, job_url)
                    return  # job cannot be run at the moment

                # build execution command
                if self.LAUNCH_METHOD == "aprun":
                    command = "cd " + workingdirectory + "; " + command
                elif self.LAUNCH_METHOD == "local":
                    command = "cd " + workingdirectory + "; " + command
                else:  # ssh launch is default
                    if (spmdvariation.lower() == "mpi"):
                        command = "cd " + workingdirectory + "; " + envi + self.MPIRUN + " -np " + numberofprocesses + " -machinefile " + machinefile + " " + command
                    elif host == "localhost":
                        command = "cd " + workingdirectory + "; " + command
                    else:
                        command = "ssh  " + host + " \'cd " + workingdirectory + "; " + command + "\'"

                # start application process
                shell = self.SHELL
                logger.debug("execute: " + command + " in " +
                             workingdirectory + " from: " +
                             str(socket.gethostname()) + " (Shell: " + shell +
                             ")")
                # bash works fine for launching on QB but fails for Abe :-(
                p = subprocess.Popen(args=command,
                                     executable=shell,
                                     stderr=stderr,
                                     stdout=stdout,
                                     cwd=workingdirectory,
                                     env=environment,
                                     shell=True)
                logger.debug("started " + command)
                self.processes[job_url] = p
                self.coordination.set_job_state(job_url,
                                                str(bigjob.state.Running))
            except:
                traceback.print_exc(file=sys.stderr)
Example #40
    # create empty data unit for output data
    output_data_unit_description = {"file_urls": []}
    output_data_unit = pd.submit_data_unit(output_data_unit_description)
    output_data_unit.wait()

    # create compute unit
    compute_unit_description = {
        "executable": "/bin/cat",
        "arguments": ["test.txt"],
        "number_of_processes": 1,
        "output": "stdout.txt",
        "error": "stderr.txt",
        "input_data": [input_data_unit.get_url()],
        # Put files stdout.txt and stderr.txt into output data unit
        "output_data": [{
            output_data_unit.get_url(): ["std*"]
        }]
    }

    compute_unit = compute_data_service.submit_compute_unit(
        compute_unit_description)
    logger.info(
        "Finished setup of ComputeDataService. Waiting for scheduling of PD")
    compute_data_service.wait()

    logger.debug("Output Data Unit: " + str(output_data_unit.list()))

    logger.info("Terminate Pilot Compute/Data Service")
    compute_data_service.cancel()
    pilot_data_service.cancel()
    pilot_compute_service.cancel()
Example #41
 def print_machine_file(self, filename):
     fh = open(filename, "r")
     lines = fh.readlines()
     fh.close()
     logger.debug("Machinefile: " + filename + " Hosts: " + str(lines))
Example #42
def test_du_reconnect():
    du_url = "redis://localhost/bigdata:du-1d1b7078-229f-11e2-834e-705681b3df0f"
    du = DataUnit(du_url=du_url)
    logger.debug(str(du.list()))
    du.export("/tmp/export-test")
Example #43
 def put_progress(self, transfered_bytes, total_bytes):
     logger.debug("Bytes transfered %d/%d" %
                  (transfered_bytes, total_bytes))
Example #44
 def __print_traceback(self):
     exc_type, exc_value, exc_traceback = sys.exc_info()
     logger.debug("*** print_exception:",
                  exc_info=(exc_type, exc_value, exc_traceback))
Example #45
 def __escape_ssh(self, bootstrap_script):
     logger.debug("Escape SSH")
     bootstrap_script = bootstrap_script.replace("\"", "\\\"")
     bootstrap_script = bootstrap_script.replace("\'", "\\\"")
     bootstrap_script = "\"" + bootstrap_script + "\""
     return bootstrap_script
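The effect of the escaping above on a hypothetical bootstrap command, applying the same three transformations:

# Double and single quotes both become \" and the result is wrapped in "...".
s = 'python -c "import sys; print sys.version"'
s = s.replace("\"", "\\\"")
s = s.replace("\'", "\\\"")
s = "\"" + s + "\""
print s   # "python -c \"import sys; print sys.version\""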
Example #46
 def get_pd(cls, pd_url):
     logger.debug("GET PD: " + pd_url)
     pd_dict = cls.__retrieve_entry(pd_url +
                                    RedisCoordinationAdaptor.SEPARATOR +
                                    "info")
     return pd_dict
Example #47
 def __initialize_pilot_data(self, service_url):
     # initialize file adaptor
     # Pilot Data API for File Management
     if service_url.startswith("ssh:"):
         logger.debug("Use SSH backend for PilotData")
         try:
             from pilot.filemanagement.ssh_adaptor import SSHFileAdaptor
             self.__filemanager = SSHFileAdaptor(service_url)
         except:
             logger.debug("SSH package not found.")
             self.__print_traceback()
     elif service_url.startswith("http:"):
         logger.debug("Use WebHDFS backend")
         try:
             from pilot.filemanagement.webhdfs_adaptor import WebHDFSFileAdaptor
             self.__filemanager = WebHDFSFileAdaptor(service_url)
         except:
             logger.debug("WebHDFS package not found.")
     elif service_url.startswith("go:"):
         logger.debug("Use Globus Online backend")
         try:
             from pilot.filemanagement.globusonline_adaptor import GlobusOnlineFileAdaptor
             self.__filemanager = GlobusOnlineFileAdaptor(service_url)
         except:
             logger.debug("Globus Online package not found.")
             self.__print_traceback()
Example #48
 def export_du(self, du, target_url):
     """ Export Data Unit to a local directory """
     if target_url.startswith("/") and os.path.exists(target_url) == False:
         os.mkdir(target_url)
     logger.debug("Export Data-Unit to %s" % target_url)
     self.__filemanager.get_du(du, target_url)
Example #49
 def __escape_rsl(self, bootstrap_script):
     logger.debug("Escape RSL")
     bootstrap_script = bootstrap_script.replace("\"", "\"\"")
     return bootstrap_script
Example #50
    def cancel(self):
        """ duck typing for cancel of saga.cpr.job and saga.job.job  """
        logger.debug("Cancel Pilot Job")
        try:
            self.job.cancel()
        except:
            pass
            #traceback.print_stack()

        logger.debug("Cancel Job Service")
        try:
            if not self._ocache.rem_obj(self.js):
                logger.debug("Cancel Job Service")
                del (self.js)
            else:
                logger.debug("Cancel Job Service done")

            self.js = None
        except:
            pass
            #traceback.print_stack()

        try:
            self._stop_pilot_job()
            logger.debug("delete pilot job: " + str(self.pilot_url))
            if _CLEANUP:
                self.coordination.delete_pilot(self.pilot_url)
            #os.remove(os.path.join("/tmp", "bootstrap-"+str(self.uuid)))
        except:
            pass
            #traceback.print_stack()
        logger.debug("Cancel Pilot Job finished")
Example #51
 def _get_subjob_state(self, job_url):
     logger.debug("Get subjob state: " + str(job_url))
     return self.coordination.get_job_state(job_url)
Example #52
 def __escape_pbs(self, bootstrap_script):
     logger.debug("Escape bootstrap script")
     bootstrap_script = "\'" + bootstrap_script + "\'"
     return bootstrap_script
Example #53
'''
Encapsulates coordination and communication specifics of bigjob
'''
import threading
import datetime
import time
import sys
import os
import pickle
import pdb
import saga
import json
import urlparse
import logging

from bigjob import logger
logger.debug("Load Advert Coordination")

if sys.version_info < (2, 5):
    sys.path.append(
        os.path.dirname(os.path.abspath(__file__)) + "/../ext/uuid-1.30/")
    sys.stderr.write("Warning: Using unsupported Python version\n")

logging.debug(str(sys.path))
import uuid

APPLICATION_NAME = "BigJob/BigJob"
ADVERT_URL_SCHEME = "advert://"
ADVERT_SERVER = "advert.cct.lsu.edu"
ADVERT_SERVER_PORT = 8080

Example #54
 def get_pd(cls, pds_url):
     logger.debug("GET PD: " + pds_url)     
     pd_dict={}        
Beispiel #55
0
    def get_du(self, du, target_url):
        #du_id = "du-7370d7b5-ed0b-11e1-95df-705681b3df0f"
        start = time.time()
        du_id = du.id
        logger.debug("Get DU: " + str(du_id))
        if self.is_local:
            command = "cp -r %s %s" % (os.path.join(self.localpath,
                                                    du_id), target_url)
            source_path = os.path.join(self.localpath, du_id, "*")
            target_path = target_url
            logger.debug(
                "Target and source host are localhost. Processing: %s" %
                (source_path))
            expanded_path = glob.glob(source_path)
            logger.debug("Expanded path: " + str(expanded_path))
            for path in expanded_path:
                if os.path.isdir(path):
                    logger.debug("Source path %s is directory" % path)
                    files = os.listdir(path)
                    for i in files:
                        try:
                            # link each file inside the directory into the target
                            os.symlink(os.path.join(path, i),
                                       os.path.join(target_path, i))
                            os.chmod(os.path.join(target_path, i), 0777)
                        except:
                            self.__print_traceback()
                else:
                    try:
                        os.symlink(
                            path,
                            os.path.join(target_path, os.path.basename(path)))
                        os.chmod(
                            os.path.join(target_path, os.path.basename(path)),
                            0777)
                    except:
                        self.__print_traceback()

        else:
            command = "iget -f -r %s %s" % (du_id, target_url)
            logger.debug(command)
            self.__run_command(command)

            full_path = os.path.join(target_url, du_id)
            #logger.debug("Path: " + str(full_path) + " Exists: " + str(os.path.exists(full_path)))
            #while os.path.exists(full_path)==False:
            #    time.sleep(1)

            for i in os.listdir(full_path):
                try:
                    logger.debug("chmod " + str(i))
                    os.chmod(os.path.join(full_path, i), 0777)
                    logger.debug("move " + str(i))
                    shutil.move(os.path.join(full_path, i), target_url)
                except:
                    self.__print_traceback()

            shutil.rmtree(full_path, ignore_errors=True)
            #time.sleep(2)
            #if target_url==".":
            #    target_url = os.getcwd()
            #command = "mv %s/* %s"%(os.path.join(target_url, du_id), target_url)
            #self.__run_command(command)
            logger.debug("Finished Get DU " + du.id + " in: " +
                         str(time.time() - start) + " sec.")
Beispiel #56
0
    def start_pilot_job(self,
                        lrms_url,
                        number_nodes=1,
                        queue=None,
                        project=None,
                        working_directory=None,
                        userproxy=None,
                        walltime=None,
                        processes_per_node=1,
                        filetransfers=None,
                        spmd_variation=None,
                        external_queue="",
                        pilot_compute_description=None):
        """ Start a batch job (using SAGA Job API) at resource manager. Currently, the following resource manager are supported:
            fork://localhost/ (Default Job Adaptor
            gram://qb1.loni.org/jobmanager-pbs (Globus Adaptor)
            pbspro://localhost (PBS Pro Adaptor)
        
        """
        if self.job != None:
            raise BigJobError(
                "A BigJob is already active. Please stop it first.")

        ##############################################################################
        # Initialization of the coordination & communication subsystem
        lrms_saga_url = SAGAUrl(lrms_url)
        self.url = lrms_saga_url
        self.pilot_url = self.app_url + ":" + lrms_saga_url.host
        self.number_nodes = int(number_nodes) * int(processes_per_node)

        # Store references to BJ in global dict
        _pilot_url_dict[self.pilot_url] = self
        _pilot_url_dict[external_queue] = self

        logger.debug("create pilot job entry on backend server: " +
                     self.pilot_url)
        self.coordination.set_pilot_state(self.pilot_url, str(Unknown), False)
        if pilot_compute_description == None:
            pilot_compute_description = {
                "service_url": lrms_url,
                "number_of_processes": number_nodes,
                "processes_per_node": processes_per_node,
                "working_directory": working_directory
            }
        self.coordination.set_pilot_description(self.pilot_url,
                                                pilot_compute_description)
        logger.debug("set pilot state to: " + str(Unknown))

        # Create Job Service (Default: SAGA Job Service, alternative Job Services supported)
        self.js = None
        if lrms_saga_url.scheme == "gce+ssh":
            self.js = GCEService(lrms_saga_url, pilot_compute_description)
        elif lrms_saga_url.scheme=="ec2+ssh" or lrms_saga_url.scheme=="euca+ssh" \
            or lrms_saga_url.scheme=="nova+ssh":
            self.js = EC2Service(lrms_saga_url, pilot_compute_description)
        #elif lrms_saga_url.scheme=="slurm+ssh":
        #    self.js = SlurmService(lrms_saga_url, pilot_compute_description)
        else:
            self.js = self._ocache.get_obj(
                lrms_saga_url, lambda: SAGAJobService(lrms_saga_url))
        ##############################################################################
        # create job description
        jd = SAGAJobDescription()

        #  Attempt to create working directory (e.g. in local scenario)
        if working_directory != None and working_directory != "":
            if not os.path.isdir(working_directory) \
                and (lrms_saga_url.scheme.startswith("fork") or lrms_saga_url.scheme.startswith("condor")) \
                and working_directory.startswith("go:")==False:
                os.mkdir(working_directory)
            self.working_directory = working_directory
        else:
            # if no working directory is set, default to the home directory;
            # this may fail if the home directory differs on the remote machine,
            # but it is a reasonable guess to avoid failing outright
            self.working_directory = "~"
            #self.working_directory = ""

        if queue != None:
            jd.queue = queue
        if spmd_variation != None:
            jd.spmd_variation = spmd_variation
        if project != None:
            jd.project = project
        if walltime != None:
            logger.debug("setting walltime to: " + str(walltime))
            jd.wall_time_limit = int(walltime)

        ##############################################################################
        # File Management and Stage-In
        # Determine whether the target machine uses gsissh or ssh for login.
        # logger.debug("Detect launch method for: " + lrms_saga_url.host)
        # self.launch_method = self.__get_launch_method(lrms_saga_url.host,lrms_saga_url.username)
        self.bigjob_working_directory_url = ""
        if lrms_saga_url.scheme.startswith("gce") or lrms_saga_url.scheme.startswith("ec2")\
            or lrms_saga_url.scheme.startswith("euca") or lrms_saga_url.scheme.startswith("nova"):
            logger.debug(
                "File Staging for Cloud Instances currently not supported.")
        elif lrms_saga_url.scheme.startswith("condor") == True:
            logger.debug("Using Condor file staging")
        else:
            # build target url for working directory
            # this will also create the remote directory for the BJ
            # Fallback if working directory is not a valid URL
            if not (self.working_directory.startswith("go:")
                    or self.working_directory.startswith("ssh://")):
                if lrms_saga_url.username != None and lrms_saga_url.username != "":
                    self.bigjob_working_directory_url = "ssh://" + lrms_saga_url.username + "@" + lrms_saga_url.host + "/" + self.__get_bigjob_working_dir(
                    )
                else:
                    self.bigjob_working_directory_url = "ssh://" + lrms_saga_url.host + "/" + self.__get_bigjob_working_dir(
                    )
            elif self.working_directory.startswith("go:"):
                self.bigjob_working_directory_url = os.path.join(
                    self.working_directory, self.uuid)
            else:
                # working directory is a valid file staging URL
                self.bigjob_working_directory_url = self.working_directory

            # initialize file manager that takes care of file movement and directory creation
            if self.__filemanager == None:
                self.__initialize_pilot_data(
                    self.bigjob_working_directory_url)  # determines the url

            if self.__filemanager != None and not self.working_directory.startswith(
                    "/"):
                self.working_directory = self.__filemanager.get_path(
                    self.bigjob_working_directory_url)

            # determine working directory of bigjob
            # if a remote sandbox can be created via ssh => create a separate directory per BJ job id;
            # otherwise use specified working directory
            logger.debug("BigJob working directory: %s" %
                         self.bigjob_working_directory_url)
            if self.__filemanager != None and self.__filemanager.create_remote_directory(
                    self.bigjob_working_directory_url) == True:
                self.working_directory = self.__get_bigjob_working_dir()
                self.__stage_files(filetransfers,
                                   self.bigjob_working_directory_url)
            else:
                logger.warn("No file staging adaptor found.")

            logger.debug("BJ Working Directory: %s", self.working_directory)

        if lrms_saga_url.scheme.startswith("condor") == False:
            jd.working_directory = self.working_directory
        else:
            jd.working_directory = ""

        ##############################################################################
        # Create and process BJ bootstrap script
        bootstrap_script = self.__generate_bootstrap_script(
            self.coordination.get_address(),
            self.pilot_url,  # Queue 1 used by this BJ object 
            external_queue  # Queue 2 used by Pilot Compute Service 
            # or another external scheduler
        )
        logger.debug("Adaptor specific modifications: " +
                     str(lrms_saga_url.scheme))
        bootstrap_script = self.__escape_pbs(bootstrap_script)
        #bootstrap_script = self.__escape_ssh(bootstrap_script)
        logger.debug(bootstrap_script)

        # Define Agent Executable in Job description
        # in Condor case bootstrap script is staged
        # (Python app cannot be passed inline in Condor job description)
        if lrms_saga_url.scheme.startswith("condor") == True:

            bootstrap_script = self.__generate_bootstrap_script_from_binary(
                self.coordination.get_address(),
                self.pilot_url,  # Queue 1 used by this BJ object 
                external_queue  # Queue 2 used by Pilot Compute Service 
                # or another external scheduler
            )

            condor_bootstrap_filename = os.path.join(
                "/tmp", "bootstrap-" + str(self.uuid))
            condor_bootstrap_file = open(condor_bootstrap_filename, "w")
            condor_bootstrap_file.write(bootstrap_script)
            condor_bootstrap_file.close()
            logger.debug("Using Condor - bootstrap file: " +
                         condor_bootstrap_filename)

            jd.executable = "/usr/bin/env"
            jd.arguments = [
                "python",
                os.path.basename(condor_bootstrap_filename)
            ]
            if pilot_compute_description.has_key("candidate_hosts"):
                jd.candidate_hosts = pilot_compute_description[
                    "candidate_hosts"]
            bj_file_transfers = []
            file_transfer_spec = condor_bootstrap_filename + " > " + os.path.basename(
                condor_bootstrap_filename)
            bj_file_transfers.append(file_transfer_spec)
            output_file_name = "output-" + str(self.uuid) + ".tar.gz"
            #output_file_transfer_spec = os.path.join(self.working_directory, output_file_name) +" < " + output_file_name
            output_file_transfer_spec = output_file_name + " < " + output_file_name
            #output_file_transfer_spec = os.path.join(self.working_directory, "output.tar.gz") +" < output.tar.gz"
            #logger.debug("Output transfer: " + output_file_transfer_spec)
            #bj_file_transfers.append(output_file_transfer_spec)
            if filetransfers != None:
                for t in filetransfers:
                    bj_file_transfers.append(t)
            logger.debug("Condor file transfers: " + str(bj_file_transfers))
            jd.file_transfer = bj_file_transfers
        else:
            jd.total_cpu_count = int(number_nodes)
            jd.spmd_variation = "single"
            if pilot_compute_description != None and pilot_compute_description.has_key(
                    "spmd_variation"):
                jd.spmd_variation = pilot_compute_description["spmd_variation"]
            jd.arguments = ["python", "-c", bootstrap_script]
            jd.executable = "/usr/bin/env"

        logger.debug("Working directory: " + jd.working_directory +
                     " Job Description: " + str(jd))

        jd.output = os.path.join(self.working_directory,
                                 "stdout-" + self.uuid + "-agent.txt")
        jd.error = os.path.join(self.working_directory,
                                "stderr-" + self.uuid + "-agent.txt")

        ##############################################################################
        # Create and submit pilot job to job service
        logger.debug("Creating pilot job with description: %s" % str(jd))
        self.job = self.js.create_job(jd)
        logger.debug("Trying to submit pilot job to: " + str(lrms_saga_url))
        self.job.run()

        if self.job.get_state() == saga.job.FAILED:
            logger.debug("SUBMISSION FAILED. Exiting... ")
            sys.exit(-1)
        else:
            logger.debug("Submission succeeded. Job ID: %s " % self.job.id)

        return self.pilot_url
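
A minimal usage sketch for start_pilot_job, assuming `bj` is a bigjob instance already constructed with a coordination backend (all argument values are illustrative):

# Hypothetical call: start a local pilot via the fork adaptor
pilot_url = bj.start_pilot_job(
    lrms_url="fork://localhost/",
    number_nodes=1,
    processes_per_node=2,
    working_directory="/tmp/bigjob-sandbox",
    walltime=10)
print "Pilot submitted, URL:", pilot_url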
Beispiel #57
0
 def get_cds_url(cls, application_url, cds_id):
     cds_url = application_url + RedisCoordinationAdaptor.SEPARATOR + cds_id
     logger.debug("CDS URL: %s" % (cds_url))
     return cds_url
Beispiel #58
0
 def add_pds(cls, application_url, pds):
     pds_url_no_dbtype = cls.get_pds_url(application_url, pds.id)
     pds_url = cls.__get_url(pds_url_no_dbtype)
     logger.debug("Create PDS directory at %s" % pds_url)
     return pds_url_no_dbtype
Beispiel #59
0
 def get_pds_url(cls, application_url, pds_id):
     pds_url = application_url + RedisCoordinationAdaptor.SEPARATOR + pds_id
     logger.debug("PDS URL: %s" % (pds_url))
     return pds_url
Beispiel #60
0
 def get_base_url(cls, application_id):
     surl = SAGAUrl(cls.BASE_URL)
     base_url = surl.scheme + "://" + surl.host + "/" + application_id + "/"
     logger.debug(base_url)
     return base_url