Example #1
 def export(self, target_url):
     """ simple implementation of export: 
             copies file from first pilot data to local machine
     """
     if self.get_state()!=State.Running:
         self.wait()
     
     if len(self.pilot_data) > 0:
         # Search for PD that is close to local machine
         local_hostname=socket.getfqdn()
         max_score=0
         best_pd=None
         for pd in self.pilot_data:
             pd_host = SAGAUrl(pd.service_url).host
             pd_score = difflib.SequenceMatcher(a=pd_host, b=local_hostname).ratio()
             logger.debug("Export locality compute score: Localhost: %s PD at: %s Score: %s"%(local_hostname, pd_host, pd_score))
             if pd_score > max_score:
                 best_pd=pd
                 max_score=pd_score
             
             #pd_domain = tldextract.extract(pd.service_url).domain
             #local_domain = tldextract.extract(socket.getfqdn()).domain
             
         if best_pd!=None:
             logger.debug("Export from: %s"%(best_pd.service_url))
             best_pd.export_du(self, target_url)
             return
             
         # No PD found. Utilize default PD
         logger.debug("Export from random PD")
         self.pilot_data[0].export_du(self, target_url)
     else:
         logger.error("No Pilot Data for PD found")
Example #2
 def __parse_url(self, url):
     try:
         surl = saga.url(url)
         host = surl.host
         port = surl.port
         username = surl.username
         password = surl.password
         query = surl.query
         scheme = "%s://"%surl.scheme
     except:
         """ Fallback URL parser based on Python urlparse library """
         logger.error("URL %s could not be parsed")
         traceback.print_exc(file=sys.stderr)
         result = urlparse.urlparse(url)
         host = result.hostname
         port = result.port
         username = result.username
         password = result.password
         if url.find("?")>0:
             query = url[url.find("?")+1:]
         else:
             query = None
         scheme = "%s://"%result.scheme
         
     return scheme, username, password, host, port, query     
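For reference, a hedged sketch of what the urlparse-based fallback yields for a coordination URL of the form used elsewhere in this code. The URL is a placeholder; on Python 3 the module lives in urllib.parse, which is only imported here so the sketch runs on either version.

    try:
        import urlparse                      # Python 2, as in the code above
    except ImportError:
        import urllib.parse as urlparse      # Python 3 equivalent

    url = "redis://secret@localhost:6379/?dbtype=sqlite3"
    result = urlparse.urlparse(url)
    print(result.scheme, result.hostname, result.port, result.username)
    # The query string after '?' is extracted manually, as in __parse_url above.
    query = url[url.find("?") + 1:] if url.find("?") > 0 else None
    print(query)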
Example #3
 def get_base_url(cls, application_id):
     if cls.BASE_URL==None:
         logger.error("Coordination URL not set. Exiting Pilot-Data.")
         raise Exception("Coordination URL not set. Exiting Pilot-Data.")
     surl = saga.Url(cls.BASE_URL)
     base_url = surl.scheme + "://" + surl.host + "/" + application_id 
     logger.debug(base_url)
     return base_url
Example #4
 def get_base_url(cls, application_id):
     if cls.BASE_URL == None:
         logger.error("Coordination URL not set. Exiting Pilot-Data.")
         raise Exception("Coordination URL not set. Exiting Pilot-Data.")
     surl = SAGAUrl(cls.BASE_URL)
     base_url = surl.scheme + "://" + surl.host + "/" + application_id
     logger.debug(base_url)
     return base_url
Example #5
 def export(self, target_url):
     """ simple implementation of export: 
             copies file from first pilot store to local machine
     """
     if len(self.pilot_stores) > 0:
         self.pilot_stores[0].export_pd(self, target_url)
     else:
         logger.error("No Pilot Store for PD found")
Example #6
 def export(self, target_url):
     """ simple implementation of export: 
             copies file from first pilot data to local machine
     """
     if len(self.pilot_data) > 0:
         self.pilot_data[0].export_du(self, target_url)
     else:
         logger.error("No Pilot Data for PD found")
Example #7
    def __init__(self,
                 coordination_url="advert://localhost/?dbtype=sqlite3",
                 pilot_url=None):
        """ Initializes BigJob's coordination system
            advert://localhost (SAGA/Advert SQLITE)
            advert://advert.cct.lsu.edu:8080 (SAGA/Advert POSTGRESQL)
            redis://localhost:6379 (Redis at localhost)
            tcp://localhost (ZMQ)
            
            The following formats for pilot_url are supported:
            
            
            1.) Including root path at distributed coordination service:
            redis://localhost/bigjob:bj-1c3816f0-ad5f-11e1-b326-109addae22a3:localhost
            
            This path is returned by a call to bigjob.get_url()
            
            2.) BigJob unique ID:
            bigjob:bj-1c3816f0-ad5f-11e1-b326-109addae22a3:localhost
            
            
        """

        self.coordination_url = coordination_url
        if self.coordination_url == None:
            logger.error("Coordination URL not set. Exiting BigJob.")
        #self.launch_method=""
        self.__filemanager = None
        self._ocache = ObjectCache()

        # restore existing BJ or initialize new BJ
        if pilot_url != None:
            logger.debug("Reconnect to BJ: %s" % pilot_url)
            if pilot_url.startswith("bigjob:"):
                self.pilot_url = pilot_url
            else:
                self.coordination_url, self.pilot_url = self.__parse_pilot_url(
                    pilot_url)

            self.uuid = self.__get_bj_id(pilot_url)
            self.app_url = self.__APPLICATION_NAME + ":" + str(self.uuid)
            self.job = None
            self.working_directory = None
            # Coordination subsystem must be initialized before get_state_detail
            self.coordination = self.__init_coordination(self.coordination_url)
            self.state = self.get_state_detail()
            _pilot_url_dict[self.pilot_url] = self
        else:
            self.coordination = self.__init_coordination(self.coordination_url)
            self.uuid = "bj-" + str(get_uuid())
            logger.debug("init BigJob w/: " + coordination_url)
            self.app_url = self.__APPLICATION_NAME + ":" + str(self.uuid)
            self.state = Unknown
            self.pilot_url = ""
            self.job = None
            self.working_directory = None
            logger.debug("initialized BigJob: " + self.app_url)
Example #8
 def __init__(self, 
              coordination_url="advert://localhost/?dbtype=sqlite3", 
              pilot_url=None):    
     """ Initializes BigJob's coordination system
         advert://localhost (SAGA/Advert SQLITE)
         advert://advert.cct.lsu.edu:8080 (SAGA/Advert POSTGRESQL)
         redis://localhost:6379 (Redis at localhost)
         tcp://localhost (ZMQ)
         
         The following formats for pilot_url are supported:
         
         
         1.) Including root path at distributed coordination service:
         redis://localhost/bigjob:bj-1c3816f0-ad5f-11e1-b326-109addae22a3:localhost
         
         This path is returned by a call to bigjob.get_url()
         
         2.) BigJob unique ID:
         bigjob:bj-1c3816f0-ad5f-11e1-b326-109addae22a3:localhost
         
         
     """  
     
     self.coordination_url = coordination_url
     if self.coordination_url==None:
         logger.error("Coordination URL not set. Exiting BigJob.")
     #self.launch_method=""
     self.__filemanager=None
     self._ocache = ObjectCache ()
     
     # restore existing BJ or initialize new BJ
     if pilot_url!=None:
         logger.debug("Reconnect to BJ: %s"%pilot_url)
         if pilot_url.startswith("bigjob:"):
             self.pilot_url=pilot_url
         else:
             self.coordination_url, self.pilot_url = self.__parse_pilot_url(pilot_url)
             
         self.uuid = self.__get_bj_id(pilot_url)
         self.app_url = self.__APPLICATION_NAME +":" + str(self.uuid)
         self.job = None
         self.working_directory = None
         # Coordination subsystem must be initialized before get_state_detail
         self.coordination = self.__init_coordination(self.coordination_url)
         self.state=self.get_state_detail()
         _pilot_url_dict[self.pilot_url]=self
     else:
         self.coordination = self.__init_coordination(self.coordination_url)
         self.uuid = "bj-" + str(get_uuid())        
         logger.debug("init BigJob w/: " + coordination_url)
         self.app_url = self.__APPLICATION_NAME + ":" + str(self.uuid)
         self.state=Unknown
         self.pilot_url=""
         self.job = None
         self.working_directory = None
         logger.debug("initialized BigJob: " + self.app_url)
Example #9
 def export(self, target_url):
     """ simple implementation of export: 
             copies file from first pilot data to local machine
     """
     if self.state != State.Running:
         self.wait()
     if len(self.pilot_data) > 0:
         self.pilot_data[0].export_du(self, target_url)
     else:
         logger.error("No Pilot Data for PD found")
Example #10
    def add_pilot_compute_service(self, pcs):
        """ Add a PilotComputeService to this CDS.

            @param pcs: The PilotComputeService to which this ComputeDataService will connect.

        """
        self.pilot_job_services.append(pcs)
        CoordinationAdaptor.update_cds(self.url, self)
        if len(self.pilot_job_services)>1:
            logger.error("Decentral ComputeDataService only supports 1 PilotComputeService")
            raise PilotError("Decentral ComputeDataService only supports 1 PilotComputeService")
Example #11
    def add_pilot_compute_service(self, pcs):
        """ Add a PilotComputeService to this CDS.

            @param pcs: The PilotComputeService to which this ComputeDataService will connect.

        """
        self.pilot_job_services.append(pcs)
        CoordinationAdaptor.update_cds(self.url, self)
        if len(self.pilot_job_services)>1:
            logger.error("Decentral ComputeDataService only supports 1 PilotComputeService")
            raise PilotError("Decentral ComputeDataService only supports 1 PilotComputeService")
Example #12
 def _get_du_id(cls, du_url):
     try:
         start = du_url.index(cls.DU_ID_PREFIX)
         end = du_url.find("/", start)
         if end == -1:
             end = du_url.find("?", start)
         if end == -1:
             end = len(du_url)
         return du_url[start:end]
     except:
         logger.error("No valid PD URL")
     return None
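A self-contained sketch of the ID extraction above. DU_ID_PREFIX and the URL are assumptions for illustration; the real prefix is whatever cls.DU_ID_PREFIX is set to.

    DU_ID_PREFIX = "du-"   # assumed value; the class attribute defines the real prefix

    def get_du_id(du_url):
        start = du_url.index(DU_ID_PREFIX)
        end = du_url.find("/", start)
        if end == -1:
            end = du_url.find("?", start)
        if end == -1:
            end = len(du_url)
        return du_url[start:end]

    print(get_du_id("redis://localhost/bigdata:du-2c6b7e4a/data?dbtype=redis"))
    # -> du-2c6b7e4a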
Example #13
 def put_pd(self, pd):
     for i in pd.list_data_units():     
         remote_path = os.path.join(self.__get_pd_path(pd.id), os.path.basename(i.local_url))
         logger.debug("Put file: %s to %s"%(i.local_url, remote_path))
                     
         if i.local_url.startswith("file://") or i.local_url.startswith("/"):
             if stat.S_ISDIR(os.stat(i.local_url).st_mode):
                 logger.warning("Path %s is a directory. Ignored."%i.local_url)                
                 continue            
             self.__webhdfs.copyFromLocal(i.local_url, remote_path)
         else:
             logger.error("File URLs: %s not supported"%i.local_url)
Example #14
 def __init__(self, server=REDIS_SERVER, server_port=REDIS_SERVER_PORT, server_connect_url=None,
              username=None, password=None, dbtype=None, url_prefix=None):
     '''
     Constructor
     '''
     if server_port==None:
         server_port=6379
         
     self.username = None
     self.password = None   
     
     self.address = "%s%s:%i"%(REDIS_URL_SCHEME, server, server_port)
     self.dbtype=""
     #self.redis_adaptor_start_time = datetime.datetime.utcnow().strftime("%s") 
     self.redis_adaptor_start_time = time.time()
     
     if server_connect_url!=None:
         self.address=server_connect_url    
         start_index = self.address.find(REDIS_URL_SCHEME)+len(REDIS_URL_SCHEME)
         server_and_port = self.address[start_index:]
         password_end = server_and_port.find("@") 
         # parse out password
         if password_end != -1:
             self.password = server_and_port[:password_end]
             start_index=password_end
             server_and_port= server_and_port[(password_end+1):]
         
         # port and hostname
         if server_and_port.find(":")==-1:
             server=server_and_port
             server_port = REDIS_SERVER_PORT
         else:
             server = server_and_port.split(":")[0]
             server_port = int(server_and_port.split(":")[1])
     else:
         self.password = username
         if self.password != None and self.password!="":
             self.address = "%s%s@%s:%i"%(REDIS_URL_SCHEME, self.password, server, server_port)
     
     logger.debug("Connect to Redis: " + server + " Port: " + str(server_port))
     
     if self.password==None:
         self.redis_client = redis.Redis(host=server, port=server_port, db=0)
     else:
         self.redis_client = redis.Redis(host=server, port=server_port, password=self.password, db=0)
     #self.redis_client_pubsub = self.redis_client.pubsub() # redis pubsub client       
     self.resource_lock = threading.RLock()
     self.pipe = self.redis_client.pipeline()
     try:
         self.redis_client.ping()
     except Exception, ex:
         logger.error("Cannot connect to Redis server: %s" % str(ex))
         raise Exception("Cannot connect to Redis server: %s" % str(ex))
Example #15
 def __get_pd_id(self, pd_url):
     try:
         start = pd_url.index(self.PD_ID_PREFIX)
         end = pd_url.find("/", start)
         if end==-1:
             end = pd_url.find("?", start)
         if end==-1:
             end = len(pd_url)
         return pd_url[start:end]
     except:
         logger.error("No valid PD URL")
     return None
Example #16
 def __get_du_id(self, du_url):
     try:
         start = du_url.index(self.DU_ID_PREFIX)
         end = du_url.find("/", start)
         if end==-1:
             end = du_url.find("?", start)
         if end==-1:
             end = len(du_url)
         return du_url[start:end]
     except:
         logger.error("No valid PD URL")
     return None
Example #17
    def _scheduler_thread(self):
        while not self.stop.isSet():
            try:
                #logger.debug("Scheduler Thread: " + str(self.__class__) + " Pilot Data")
                du = self.du_queue.get(True, 1)
                # check whether this is a real du object
                if isinstance(du, DataUnit):
                    pd = self._schedule_du(du)
                    if (pd != None):
                        logger.debug("Initiate Transfer to PD.")
                        du.add_pilot_data(pd)
                        logger.debug("Transfer to PD finished.")
                        du._update_state(State.Running)
                        self.du_queue.task_done()
                    else:
                        self.du_queue.task_done()
                        self.du_queue.put(du)
            except Queue.Empty:
                pass

            try:
                #logger.debug("Scheduler Thread: " + str(self.__class__) + " Pilot Job")
                cu = self.cu_queue.get(True, 1)
                if isinstance(cu, ComputeUnit):
                    self.__wait_for_du(cu)
                    pj = self._schedule_cu(cu)
                    if pj != None:
                        cu = self.__expand_working_directory(cu, pj)
                        pj._submit_cu(cu)
                        self.cu_queue.task_done()
                    else:
                        logger.debug("No resource found.")
                        self.cu_queue.task_done()
                        self.cu_queue.put(cu)
            except Queue.Empty:
                pass
            except:
                exc_type, exc_value, exc_traceback = sys.exc_info()
                logger.error("*** print_tb:")
                traceback.print_tb(exc_traceback, limit=1, file=sys.stderr)
                logger.error("*** print_exception:")
                traceback.print_exception(exc_type,
                                          exc_value,
                                          exc_traceback,
                                          limit=2,
                                          file=sys.stderr)

            if self.cu_queue.empty() and self.du_queue.empty():
                time.sleep(5)

        logger.debug("Re-Scheduler terminated")
Example #18
    def __init__(self, service_url):

        self.service_url = service_url

        try:
            result = urlparse.urlparse(service_url)
            self.host = result.netloc
            self.path = result.path
        except:
            logger.error("Error parsing URL.")

        self.__state = State.New
        self.__webhdfs = WebHDFS(self.HDFS_SERVICE_HOST,
                                 self.HDFS_SERVICE_PORT, self.HDFS_USER_NAME)
Example #19
 def __stage_in_data_units(self, input_data=[], target_directory="."):
     """ stage in data units specified in input_data field """
     try:
         logger.debug("Stage in input files to: %s" % target_directory)
         for i in input_data:
             du = DataUnit(du_url=i)
             logger.debug("Restored DU... call get state()")
             logger.debug("DU State: " + du.get_state())
             du.wait()
             logger.debug("Reconnected to DU. Exporting it now...")
             du.export(target_directory)
     except:
         logger.error("Stage-in of files failed.")
         self.__print_traceback()
Example #20
 def __stage_in_data_units(self, input_data=[], target_directory="."):
     """ stage in data units specified in input_data field """
     try:
         logger.debug("Stage in input files to: %s"%target_directory)
         for i in input_data:
             du = DataUnit(du_url=i)
             logger.debug("Restored DU... call get state()")
             logger.debug("DU State: " + du.get_state())
             du.wait()
             logger.debug("Reconnected to DU. Exporting it now...")
             du.export(target_directory)
     except:
         logger.error("Stage-in of files failed.")
         self.__print_traceback()
Example #21
    def __init__(self, pjs_url=None):
        """ Create a PilotJobService object.

            Keyword arguments:
            pjs_id -- Don't create a new, but connect to an existing (optional)
        """
        self.__mjs = None
        self.pilot_computes=[]
        
        if pjs_url==None:      # new pjs          
            self.id = self.PJS_ID_PREFIX+str(uuid.uuid1())
            self.url = "pilotjob://localhost/"+self.id
        else:
            logger.error("Reconnect to PJS currently not supported.")
Example #22
 def __init_coordination(self, coordination_url):        
     if(coordination_url.startswith("advert://") or coordination_url.startswith("sqlasyncadvert://")):
         try:
             from coordination.bigjob_coordination_advert import bigjob_coordination
             logger.debug("Utilizing ADVERT Backend")
         except:
             logger.error("Advert Backend could not be loaded")
     elif (coordination_url.startswith("redis://")):
         try:
             from coordination.bigjob_coordination_redis import bigjob_coordination      
             logger.debug("Utilizing Redis Backend")
         except:
             logger.error("Error loading pyredis.")
     elif (coordination_url.startswith("tcp://")):
         try:
             from coordination.bigjob_coordination_zmq import bigjob_coordination
             logger.debug("Utilizing ZMQ Backend")
         except:
             logger.error("ZMQ Backend not found. Please install ZeroMQ (http://www.zeromq.org/intro:get-the-software) and " 
                   +"PYZMQ (http://zeromq.github.com/pyzmq/)")
     else:
         logger.error("No suitable coordination backend found.")
     
     logger.debug("Parsing URL: " + coordination_url)
     scheme, username, password, host, port, dbtype  = self.__parse_url(coordination_url) 
     
     if port == -1:
         port = None
     coordination = bigjob_coordination(server=host, server_port=port, username=username, 
                                        password=password, dbtype=dbtype, url_prefix=scheme)
     return coordination
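The dispatch above boils down to a scheme-to-module mapping. A small sketch follows; the module paths are the ones imported in the original code, and whether they import successfully depends on the installation.

    def backend_module_for(coordination_url):
        if coordination_url.startswith(("advert://", "sqlasyncadvert://")):
            return "coordination.bigjob_coordination_advert"
        elif coordination_url.startswith("redis://"):
            return "coordination.bigjob_coordination_redis"
        elif coordination_url.startswith("tcp://"):
            return "coordination.bigjob_coordination_zmq"
        return None

    print(backend_module_for("redis://localhost:6379"))
    # -> coordination.bigjob_coordination_redis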
Example #23
    def put_pd(self, pd):
        for i in pd.list_data_units():
            remote_path = os.path.join(self.__get_pd_path(pd.id),
                                       os.path.basename(i.local_url))
            logger.debug("Put file: %s to %s" % (i.local_url, remote_path))

            if i.local_url.startswith("file://") or i.local_url.startswith(
                    "/"):
                if stat.S_ISDIR(os.stat(i.local_url).st_mode):
                    logger.warning("Path %s is a directory. Ignored." %
                                   i.local_url)
                    continue
                self.__webhdfs.copyFromLocal(i.local_url, remote_path)
            else:
                logger.error("File URLs: %s not supported" % i.local_url)
Example #24
 def __init__(self, service_url):     
     
     self.service_url = service_url
     
     try:
         result = urlparse.urlparse(service_url)
         self.host = result.netloc
         self.path = result.path        
     except:
         logger.error("Error parsing URL.")
         
     self.__state=State.New
     self.__webhdfs= WebHDFS(self.HDFS_SERVICE_HOST, 
                            self.HDFS_SERVICE_PORT,
                            self.HDFS_USER_NAME)
Example #25
    def _scheduler_thread(self):
        while not self.stop.isSet():
            try:
                #logger.debug("Scheduler Thread: " + str(self.__class__) + " Pilot Data")
                du = self.du_queue.get(True, 1)  
                # check whether this is a real du object  
                if isinstance(du, DataUnit):
                    pd=self._schedule_du(du)                
                    if(pd!=None):                        
                        logger.debug("Initiate Transfer to PD.")
                        du.add_pilot_data(pd)
                        logger.debug("Transfer to PD finished.")
                        du._update_state(State.Running) 
                        self.du_queue.task_done()                   
                    else:
                        self.du_queue.task_done() 
                        self.du_queue.put(du)
            except Queue.Empty:
                pass
                    
            try:    
                #logger.debug("Scheduler Thread: " + str(self.__class__) + " Pilot Job")
                cu = self.cu_queue.get(True, 1)                
                if isinstance(cu, ComputeUnit):  
                    self.__wait_for_du(cu)                  
                    pj=self._schedule_cu(cu) 
                    if pj !=None:
                        cu = self.__expand_working_directory(cu, pj)                        
                        pj._submit_cu(cu)           
                        self.cu_queue.task_done()         
                    else:
                        logger.debug("No resource found.")
                        self.cu_queue.task_done() 
                        self.cu_queue.put(cu)
            except Queue.Empty:
                pass
            except:
                exc_type, exc_value, exc_traceback = sys.exc_info()
                logger.error("*** print_tb:")
                traceback.print_tb(exc_traceback, limit=1, file=sys.stderr)
                logger.error("*** print_exception:")
                traceback.print_exception(exc_type, exc_value, exc_traceback,
                              limit=2, file=sys.stderr)
            
            if self.cu_queue.empty() and self.du_queue.empty():
                time.sleep(5)        

        logger.debug("Re-Scheduler terminated")
Example #26
    def __init__(self, coordination_url=COORDINATION_URL, pcs_url=None):
        """ Create a PilotJobService object.

            Keyword arguments:
            pcs_id -- Don't create a new, but connect to an existing (optional)
        """
        self.pilot_computes=[]
        self.coordination_url=coordination_url
        self.coordination_queue=""
        if pcs_url==None:      # new pcs          
            self.id = self.PJS_ID_PREFIX+str(uuid.uuid1())
            self.url = os.path.join(self.coordination_url, "pcs", self.id)
            self.coordination_queue = "PilotComputeServiceQueue-" + str(self.id)
            logger.debug("Created Pilot Compute Service: %s"%self.url)
        else:
            logger.error("Reconnect to PilotComputeService currently not supported.")
Example #27
 def __create_remote_directory(self, target_url):
     result = urlparse.urlparse(target_url)
     target_host = result.netloc
     target_path = result.path
     try:
         client = paramiko.SSHClient()
         client.load_system_host_keys()
         client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
         client.connect(target_host)
         sftp = client.open_sftp()            
         sftp.mkdir(target_path)
         sftp.close()
         client.close()
     except:
         logger.error("Error creating directory: " + str(target_path) 
                      + " at: " + str(target_host))
         self.__print_traceback()
Example #28
 def __get_redis_api_client(cls):
     ''' Initialize Redis API Client '''
     import redis
     saga_url = saga.Url(RedisCoordinationAdaptor.BASE_URL)
     username = saga_url.username
     server = saga_url.host
     server_port = saga_url.port
     if username==None or username=="":
         redis_client = redis.Redis(host=server, port=server_port, db=0)
     else:
         redis_client = redis.Redis(host=server, port=server_port, password=username, db=0)
     
     try:
         redis_client.ping()
     except:
         logger.error("Please start Redis server!")
         raise Exception("Please start Redis server!")
     return redis_client
Example #29
    def remove_pilot_compute_service(self, pcs):
        """ Remove a PilotJobService from this CDS.

            Note that it won't cancel the PilotJobService, it will just no
            longer be connected to this WUS.

            Keyword arguments:
            pilotjob_services -- The PilotJob Service(s) to remove from this
                                 Work Unit Service. 

            Return:
            Result
        """
        self.pilot_job_services.remove(pcs)
        CoordinationAdaptor.update_cds(self.url, self)
        if len(self.pilot_job_services)>1:
            logger.error("Decentral ComputeDataService only supports 1 PilotComputeService")
            raise PilotError("Decentral ComputeDataService only supports 1 PilotComputeService")
Example #30
    def __init__(self, coordination_url=COORDINATION_URL, pcs_url=None):
        """ Create a PilotJobService object.

            Keyword arguments:
            pcs_id -- Don't create a new, but connect to an existing (optional)
        """
        self.pilot_computes = []
        self.coordination_url = coordination_url
        self.coordination_queue = ""
        if pcs_url == None:  # new pcs
            self.id = self.PJS_ID_PREFIX + str(uuid.uuid1())
            self.url = os.path.join(self.coordination_url, "pcs", self.id)
            self.coordination_queue = "PilotComputeServiceQueue-" + str(
                self.id)
            logger.debug("Created Pilot Compute Service: %s" % self.url)
        else:
            logger.error(
                "Reconnect to PilotComputeService currently not supported.")
Example #31
    def remove_pilot_compute_service(self, pcs):
        """ Remove a PilotJobService from this CDS.

            Note that it won't cancel the PilotJobService, it will just no
            longer be connected to this WUS.

            Keyword arguments:
            pilotjob_services -- The PilotJob Service(s) to remove from this
                                 Work Unit Service. 

            Return:
            Result
        """
        self.pilot_job_services.remove(pcs)
        CoordinationAdaptor.update_cds(self.url, self)
        if len(self.pilot_job_services)>1:
            logger.error("Decentral ComputeDataService only supports 1 PilotComputeService")
            raise PilotError("Decentral ComputeDataService only supports 1 PilotComputeService")
Example #32
 def __stage_files(self, filetransfers, target_url):
     logger.debug("Stage: %s to %s"%(filetransfers, target_url))
     if filetransfers==None:
         return
     if self.__filemanager:
         self.__filemanager.create_remote_directory(target_url)
     for i in filetransfers:
         source_file=i
         if i.find(">")>0:
             source_file = i[:i.find(">")].strip()
         if source_file.startswith("ssh://")==False and source_file.startswith("go://")==False:
             logger.error("Staging of file: %s not supported. Please use URL in form ssh://<filename>"%source_file)
             continue
         target_url_full = os.path.join(target_url, os.path.basename(source_file))
         logger.debug("Stage: %s to %s"%(source_file, target_url_full))
         #self.__third_party_transfer(source_file, target_url_full)
         if self.__filemanager:
             self.__filemanager.transfer(source_file, target_url_full)
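A hedged sketch of how a single filetransfer entry is interpreted above: an optional '>' separates the source URL from a target name (only the source part is used here), and only ssh:// or go:// sources are accepted. The URLs are placeholders.

    import os

    entry = "ssh://localhost/tmp/input.txt > input.txt"
    source_file = entry
    if entry.find(">") > 0:
        source_file = entry[:entry.find(">")].strip()
    assert source_file.startswith(("ssh://", "go://"))

    target_url = "ssh://remote.example.org/work/agent-dir"   # placeholder target
    target_url_full = os.path.join(target_url, os.path.basename(source_file))
    print(target_url_full)
    # -> ssh://remote.example.org/work/agent-dir/input.txt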
Example #33
    def __init__(self, redis_url):
        '''
        Constructor
        '''
        server_port = 6379
        self.redis_url = redis_url
        self.password = None
        start_index = self.redis_url.find(REDIS_URL_SCHEME) + len(
            REDIS_URL_SCHEME)
        server_and_port = self.redis_url[start_index:]
        password_end = server_and_port.find("@")
        # parse out password
        if password_end != -1:
            self.password = server_and_port[:password_end]
            start_index = password_end
            server_and_port = server_and_port[(password_end + 1):]

        # port and hostname
        if server_and_port.find(":") == -1:
            server = server_and_port
            server_port = REDIS_SERVER_PORT
        else:
            server = server_and_port.split(":")[0]
            server_port = int(server_and_port.split(":")[1])

        logger.debug("Connect to Redis: " + server + " Port: " +
                     str(server_port))

        if self.password == None:
            self.redis_client = redis.Redis(host=server,
                                            port=server_port,
                                            db=0)
        else:
            self.redis_client = redis.Redis(host=server,
                                            port=server_port,
                                            password=self.password,
                                            db=0)

        self.pipe = self.redis_client.pipeline()
        try:
            self.redis_client.ping()
        except:
            logger.error("Please start Redis server!")
            raise Exception("Please start Redis server!")
Example #34
    def __stage_out_data_units(self, output_data=[], workingdirectory=None):
        """ stage out data to a specified data unit pilot data """
        logger.debug("Stage out output files")
        """ Parsing output data field of job description:
            {
            ...
             "output_data": [
                            {
                             output_data_unit.get_url(): 
                             ["stdout.txt", "stderr.txt"]
                            }
                            ]
            }    
        """
        try:
            for data_unit_dict in output_data:
                logger.debug("Process: " + str(data_unit_dict))
                for du_url in data_unit_dict.keys(
                ):  # go through all dicts (each representing 1 PD)
                    #pd_url = self.__get_pd_url(du_url)
                    #pilot_data = PilotData(pd_url=pd_url)
                    #du = pilot_data.get_du(du_url)
                    du = DataUnit(du_url=du_url)
                    file_list = data_unit_dict[du_url]
                    logger.debug("Add files: " + str(file_list))
                    all_files = []
                    for output_file in file_list:
                        expanded_files = [output_file]
                        if output_file.find("*") >= 0 or output_file.find(
                                "?") >= 0:
                            expanded_files = self.__expand_file_pattern(
                                output_file, workingdirectory)
                            logger.debug("Expanded files: " +
                                         str(expanded_files))

                        for f in expanded_files:
                            all_files.append(os.path.join(workingdirectory, f))

                    du.add_files(all_files)
                    for f in all_files:
                        os.remove(f)
        except:
            logger.error("Stage out of files failed.")
            self.__print_traceback()
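For clarity, a hedged sketch of the output_data structure the docstring above describes, as it might appear in a compute unit description. The DU URL and file names are placeholders; '*' and '?' patterns are expanded against the working directory.

    output_data = [
        {
            "redis://localhost/bigdata:du-1a2b3c4d": ["stdout.txt", "stderr.txt", "*.log"]
        }
    ]
    for data_unit_dict in output_data:
        for du_url, file_list in data_unit_dict.items():
            print(du_url)
            print(file_list)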
Example #35
 def __parse_url(self, url):
     try:
         surl = SAGAUrl(url)
         host = surl.host
         port = surl.port
         username = surl.username
         password = surl.password
         query = surl.query
         if query!=None and query.endswith("/"):
             query = query[:-1]
         scheme = "%s://"%surl.scheme
     except:
         """ Fallback URL parser based on Python urlparse library """
         logger.error("URL %s could not be parsed"%(url))
         traceback.print_exc(file=sys.stderr)
         result = urlparse.urlparse(url)
         logger.debug("Result: " + str(result))
         host = result.hostname
         #host = None
         port = result.port
         username = result.username
         password = result.password
         scheme = "%s://"%result.scheme 
         if host==None:
             logger.debug("Python 2.6 fallback")
             if url.find("/", len(scheme)) > 0:
                 host = url[len(scheme):url.find("/", len(scheme))]
             else:
                 host = url[len(scheme):]
             if host.find(":")>1:
                 logger.debug(host)
                 comp = host.split(":")
                 host = comp[0]
                 port = int(comp[1])
                 
         if url.find("?")>0:
             query = url[url.find("?")+1:]
         else:
             query = None
         
     
     logger.debug("%s %s %s"%(scheme, host, port))
     return scheme, username, password, host, port, query     
Example #36
    def __parse_url(self, url):
        try:
            surl = SAGAUrl(url)
            host = surl.host
            port = surl.port
            username = surl.username
            password = surl.password
            query = surl.query
            if query != None and query.endswith("/"):
                query = query[:-1]
            scheme = "%s://" % surl.scheme
        except:
            """ Fallback URL parser based on Python urlparse library """
            logger.error("URL %s could not be parsed" % (url))
            traceback.print_exc(file=sys.stderr)
            result = urlparse.urlparse(url)
            logger.debug("Result: " + str(result))
            host = result.hostname
            #host = None
            port = result.port
            username = result.username
            password = result.password
            scheme = "%s://" % result.scheme
            if host == None:
                logger.debug("Python 2.6 fallback")
                if url.find("/", len(scheme)) > 0:
                    host = url[len(scheme):url.find("/", len(scheme))]
                else:
                    host = url[len(scheme):]
                if host.find(":") > 1:
                    logger.debug(host)
                    comp = host.split(":")
                    host = comp[0]
                    port = int(comp[1])

            if url.find("?") > 0:
                query = url[url.find("?") + 1:]
            else:
                query = None

        logger.debug("%s %s %s" % (scheme, host, port))
        return scheme, username, password, host, port, query
Example #37
 def start_new_job_in_thread(self, job_url):
     """evaluates job dir, sanity checks, executes job """
     #pdb.set_trace()
     if job_url != None:
         failed = False
         job_dict = None
         try:
             logger.debug("Get job description")
             job_dict = self.coordination.get_job(job_url)
         except:
             logger.error("Failed to get job description")
             failed=True
             
         if job_dict==None or failed==True:
             self.coordination.queue_job(self.pilot_url, job_url)
             return
             
         logger.debug("start job: " + job_url + " data: " + str(job_dict))
         if(job_dict["state"]==str(bigjob.state.Unknown)):
             job_dict["state"]=str(bigjob.state.New)
             self.coordination.set_job_state(job_url, str(bigjob.state.New))
         self.execute_job(job_url, job_dict)
Example #38
    def start_new_job_in_thread(self, job_url):
        """evaluates job dir, sanity checks, executes job """
        #pdb.set_trace()
        if job_url != None:
            failed = False
            job_dict = None
            try:
                logger.debug("Get job description")
                job_dict = self.coordination.get_job(job_url)
            except:
                logger.error("Failed to get job description")
                failed = True

            if job_dict == None or failed == True:
                self.coordination.queue_job(self.pilot_url, job_url)
                return

            logger.debug("start job: " + job_url + " data: " + str(job_dict))
            if (job_dict["state"] == str(bigjob.state.Unknown)):
                job_dict["state"] = str(bigjob.state.New)
                self.coordination.set_job_state(job_url, str(bigjob.state.New))
            self.execute_job(job_url, job_dict)
Example #39
 def copy_pd_to_url(self, pd,  local_url, remote_url):
     
     if not remote_url.startswith("file://") and not remote_url.startswith("/"):
         logger.error("Only local URLs supported")
         return
     
     result = urlparse.urlparse(remote_url)
     path = result.path    
     # create directory
     try:
         os.makedirs(path)
     except:
         logger.debug("Directory: %s already exists."%path)
         
     base_dir = self.__get_pd_path(pd.id)
     for filename in self.__webhdfs.listdir(base_dir):
         file_url = local_url + "/" + filename
         file_remote_url = remote_url + "/" + filename
         logger.debug("GET " + file_url + " to " + file_remote_url)
         self.__webhdfs.copyToLocal(file_url, file_remote_url)
Example #40
 def __stage_out_data_units(self, output_data=[], workingdirectory=None):
     """ stage out data to a specified data unit pilot data """
     logger.debug("Stage out output files")
     
     """ Parsing output data field of job description:
         {
         ...
          "output_data": [
                         {
                          output_data_unit.get_url(): 
                          ["stdout.txt", "stderr.txt"]
                         }
                         ]
         }    
     """
     try:
         for data_unit_dict in output_data: 
             logger.debug("Process: " + str(data_unit_dict))
             for du_url in data_unit_dict.keys(): # go through all dicts (each representing 1 PD) 
                 #pd_url = self.__get_pd_url(du_url)
                 #pilot_data = PilotData(pd_url=pd_url)
                 #du = pilot_data.get_du(du_url)
                 du = DataUnit(du_url=du_url)
                 file_list = data_unit_dict[du_url]
                 logger.debug("Add files: " + str(file_list))
                 all_files=[]
                 for output_file in file_list:
                     expanded_files = [output_file]
                     if output_file.find("*")>=0 or output_file.find("?")>=0:
                         expanded_files = self.__expand_file_pattern(output_file, workingdirectory)
                         logger.debug("Expanded files: " + str(expanded_files))
                         
                     for f in expanded_files:
                         all_files.append(os.path.join(workingdirectory, f))
                  
                 du.add_files(all_files)                        
                 for f in all_files:       
                     os.remove(f)
     except:
         logger.error("Stage out of files failed.")
         self.__print_traceback()
Example #41
    def __get_redis_api_client(cls):
        ''' Initialize Redis API Client '''
        import redis
        saga_url = SAGAUrl(RedisCoordinationAdaptor.BASE_URL)
        username = saga_url.username
        server = saga_url.host
        server_port = saga_url.port
        if username == None or username == "":
            redis_client = redis.Redis(host=server, port=server_port, db=0)
        else:
            redis_client = redis.Redis(host=server,
                                       port=server_port,
                                       password=username,
                                       db=0)

        try:
            redis_client.ping()
        except:
            logger.error("Please start Redis server!")
            raise Exception("Please start Redis server!")
        return redis_client
Example #42
    def copy_pd_to_url(self, pd, local_url, remote_url):

        if not remote_url.startswith("file://") and not remote_url.startswith(
                "/"):
            logger.error("Only local URLs supported")
            return

        result = urlparse.urlparse(remote_url)
        path = result.path
        # create directory
        try:
            os.makedirs(path)
        except:
            logger.debug("Directory: %s already exists." % path)

        base_dir = self.__get_pd_path(pd.id)
        for filename in self.__webhdfs.listdir(base_dir):
            file_url = local_url + "/" + filename
            file_remote_url = remote_url + "/" + filename
            logger.debug("GET " + file_url + " to " + file_remote_url)
            self.__webhdfs.copyToLocal(file_url, file_remote_url)
Example #43
 def export(self, target_url):
     """ simple implementation of export: 
             copies file from first pilot data to local machine
     """
     if self.get_state()!=State.Running:
         self.wait()
     
     if len(self.pilot_data) > 0:
         # Search for PD that is close to local machine
         for pd in self.pilot_data:
             pd_domain = tldextract.extract(pd.service_url).domain
             local_domain = tldextract.extract(socket.getfqdn()).domain
             logger.debug("Export to %s... checking PD at: %s"%(local_domain, pd_domain))
             if pd_domain == local_domain:
                 logger.debug("Export from: %s"%(pd_domain))
                 pd.export_du(self, target_url)
                 return
         # No PD found. Utilize default PD
         logger.debug("Export from random PD")
         self.pilot_data[0].export_du(self, target_url)
     else:
         logger.error("No Pilot Data for PD found")
Example #44
 def add_subjob(self, jd, job_url, job_id):
     logger.debug("Stage input files for sub-job")
     if jd.attribute_exists ("filetransfer"):
         try:
             self.__stage_files(jd.filetransfer, self.__get_subjob_working_dir(job_id))
         except:
             logger.error("File Stagein failed. Is Paramiko installed?")
     logger.debug("add subjob to queue of PJ: " + str(self.pilot_url))        
     for i in range(0,3):
         try:
             logger.debug("create dictionary for job description. Job-URL: " + job_url)
             # put job description attributes to Redis
             job_dict = {}
             # to accommodate a current bug in Bliss (NumberOfProcesses is not returned from list_attributes)
             job_dict["NumberOfProcesses"] = "1" 
             attributes = jd.list_attributes()   
             logger.debug("SJ Attributes: " + str(attributes))             
             for attr in attributes:
                 if jd.attribute_is_vector(attr):
                     #logger.debug("Add attribute: " + str(attr) + " Value: " + str(jd.get_vector_attribute(attr)))
                     vector_attr = []
                     for j in jd.get_vector_attribute(attr):
                         vector_attr.append(j)
                     job_dict[attr] = vector_attr
                 else:
                     #logger.debug("Add attribute: " + str(attr) + " Value: " + jd.get_attribute(attr))
                     job_dict[attr] = jd.get_attribute(attr)
             
             job_dict["state"] = str(Unknown)
             job_dict["job-id"] = str(job_id)
             
             #logger.debug("update job description at communication & coordination sub-system")
             self.coordination.set_job(job_url, job_dict)                                                
             self.coordination.queue_job(self.pilot_url, job_url)
             break
         except:
             traceback.print_exc(file=sys.stdout)
             time.sleep(2)
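A hedged sketch of the dictionary that add_subjob writes to the coordination service. The keys mirror SAGA job-description attributes referenced in this code; the values are placeholders.

    job_dict = {
        "Executable": "/bin/echo",
        "Arguments": ["Hello", "BigJob"],   # vector attributes are stored as lists
        "NumberOfProcesses": "1",
        "SPMDVariation": "single",
        "state": "Unknown",
        "job-id": "sj-1a2b3c4d",
    }
    print(job_dict)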
Example #45
 def __stage_files(self, filetransfers, target_url):
     logger.debug("Stage: %s to %s" % (filetransfers, target_url))
     if filetransfers == None:
         return
     if self.__filemanager:
         self.__filemanager.create_remote_directory(target_url)
     for i in filetransfers:
         source_file = i
         if i.find(">") > 0:
             source_file = i[:i.find(">")].strip()
         if source_file.startswith(
                 "ssh://") == False and source_file.startswith(
                     "go://") == False:
             logger.error(
                 "Staging of file: %s not supported. Please use URL in form ssh://<filename>"
                 % source_file)
             continue
         target_url_full = os.path.join(target_url,
                                        os.path.basename(source_file))
         logger.debug("Stage: %s to %s" % (source_file, target_url_full))
         #self.__third_party_transfer(source_file, target_url_full)
         if self.__filemanager:
             self.__filemanager.transfer(source_file, target_url_full)
Example #46
    def __init__(self, redis_url):
        '''
        Constructor
        '''
        server_port=6379
        self.redis_url=redis_url  
        self.password=None  
        start_index =  self.redis_url.find(REDIS_URL_SCHEME)+len(REDIS_URL_SCHEME)
        server_and_port = self.redis_url[start_index:]
        password_end = server_and_port.find("@") 
        # parse out password
        if password_end != -1:
            self.password = server_and_port[:password_end]
            start_index=password_end
            server_and_port= server_and_port[(password_end+1):]
            
        # port and hostname
        if server_and_port.find(":")==-1:
            server=server_and_port
            server_port = REDIS_SERVER_PORT
        else:
            server = server_and_port.split(":")[0]
            server_port = int(server_and_port.split(":")[1])
        
        logger.debug("Connect to Redis: " + server + " Port: " + str(server_port))
        
        if self.password==None:
            self.redis_client = redis.Redis(host=server, port=server_port, db=0)
        else:
            self.redis_client = redis.Redis(host=server, port=server_port, password=self.password, db=0)

        self.pipe = self.redis_client.pipeline()
        try:
            self.redis_client.ping()
        except:
            logger.error("Please start Redis server!")
            raise Exception("Please start Redis server!")
Example #47
 def create_remote_directory(self, target_url):
     result = urlparse.urlparse(target_url)
     target_host = result.hostname
     target_path = result.path
     target_user = result.username
     try:
         if not self.__is_remote_directory(target_url):
             client = paramiko.SSHClient()
             client.load_system_host_keys()
             client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
             logger.debug("Create directory at: %s" % (target_host))
             client.connect(target_host, username=target_user)
             sftp = client.open_sftp()
             sftp.mkdir(target_path)
             sftp.close()
             client.close()
             return True
     except KeyboardInterrupt:
         raise KeyboardInterrupt
     except:
         logger.error("Error creating directory: " + str(target_path) +
                      " at: " + str(target_host))
         self.__print_traceback()
     return False
Example #48
    def export(self, target_url):
        """ simple implementation of export: 
                copies file from first pilot data to local machine
        """
        if self.get_state() != State.Running:
            self.wait()

        if len(self.pilot_data) > 0:
            # Search for PD that is close to local machine
            local_hostname = socket.getfqdn()
            max_score = 0
            best_pd = None
            for pd in self.pilot_data:
                pd_host = SAGAUrl(pd.service_url).host
                pd_score = difflib.SequenceMatcher(a=pd_host,
                                                   b=local_hostname).ratio()
                logger.debug(
                    "Export locality compute score: Localhost: %s PD at: %s Score: %s"
                    % (local_hostname, pd_host, pd_score))
                if pd_score > max_score:
                    best_pd = pd
                    max_score = pd_score

                #pd_domain = tldextract.extract(pd.service_url).domain
                #local_domain = tldextract.extract(socket.getfqdn()).domain

            if best_pd != None:
                logger.debug("Export from: %s" % (best_pd.service_url))
                best_pd.export_du(self, target_url)
                return

            # No PD found. Utilize default PD
            logger.debug("Export from random PD")
            self.pilot_data[0].export_du(self, target_url)
        else:
            logger.error("No Pilot Data for PD found")
Example #49
    def __init_coordination(self, coordination_url):

        bigjob_coordination = None
        if (coordination_url.startswith("advert://")
                or coordination_url.startswith("sqlasyncadvert://")):
            try:
                from coordination.bigjob_coordination_advert import bigjob_coordination
                logger.debug("Utilizing ADVERT Backend")
            except:
                logger.error("Advert Backend could not be loaded")
        elif (coordination_url.startswith("redis://")):
            try:
                from coordination.bigjob_coordination_redis import bigjob_coordination
                logger.debug("Utilizing Redis Backend")
            except:
                logger.error("Error loading pyredis.")
                self.__print_traceback()
        elif (coordination_url.startswith("tcp://")):
            try:
                from coordination.bigjob_coordination_zmq import bigjob_coordination
                logger.debug("Utilizing ZMQ Backend")
            except:
                logger.error(
                    "ZMQ Backend not found. Please install ZeroMQ (http://www.zeromq.org/intro:get-the-software) and "
                    + "PYZMQ (http://zeromq.github.com/pyzmq/)")
        else:
            logger.error("No suitable coordination backend found.")

        # check whether coordination subsystem could be initialized
        if bigjob_coordination == None:
            raise BigJobError(
                "Could not initialize coordination subsystem (Redis)")

        logger.debug("Parsing URL: " + coordination_url)
        scheme, username, password, host, port, dbtype = self.__parse_url(
            coordination_url)

        if port == -1:
            port = None
        coordination = bigjob_coordination(server=host,
                                           server_port=port,
                                           username=username,
                                           password=password,
                                           dbtype=dbtype,
                                           url_prefix=scheme)
        return coordination
Example #50
 def create_remote_directory(self, target_url):
     result = urlparse.urlparse(target_url)
     target_host = result.hostname
     target_path = result.path
     target_user = result.username
     try:
         if not self.__is_remote_directory(target_url):
             client = paramiko.SSHClient()
             client.load_system_host_keys()
             client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
             logger.debug("Create directory at: %s"%(target_host))
             client.connect(target_host,  username=target_user)
             sftp = client.open_sftp()  
             sftp.mkdir(target_path)
             sftp.close()
             client.close()
             return True
     except KeyboardInterrupt:
         raise KeyboardInterrupt
     except:
         logger.error("Error creating directory: " + str(target_path) 
                      + " at: " + str(target_host))
         self.__print_traceback()
     return False
Example #51
    def __init__(self, args):
        
        self.coordination_url = args[1]
        # objects to store running jobs and processes
        self.jobs = []
        self.processes = {}
        self.freenodes = []
        self.busynodes = []
        self.restarted = {}

        # read config file
        conf_file = os.path.dirname(os.path.abspath( __file__ )) + "/../" + CONFIG_FILE
        if not os.path.exists(conf_file):
            conf_file = os.path.join(sys.prefix, CONFIG_FILE)
        logging.debug ("read configfile: " + conf_file)
        config = ConfigParser.ConfigParser()
        config.read(conf_file)
        default_dict = config.defaults()        
        self.CPR=False
        if default_dict.has_key("cpr"):
            self.CPR = default_dict["cpr"]
        self.SHELL="/bin/bash"
        if default_dict.has_key("shell"):
            self.SHELL=default_dict["shell"]
        self.MPIRUN="mpirun"
        # On TACC resources the default MPICH is 
        # linked under mpirun_rsh
        if default_dict.has_key("mpirun"):
            self.MPIRUN=default_dict["mpirun"]
        self.OUTPUT_TAR=False
        if default_dict.has_key("create_output_tar"):
            self.OUTPUT_TAR=eval(default_dict["create_output_tar"])
            logger.debug("Create output tar: %r", self.OUTPUT_TAR)
        
        self.LAUNCH_METHOD="ssh"                    
        if default_dict.has_key("launch_method"):
            self.LAUNCH_METHOD=self.__get_launch_method(default_dict["launch_method"])
        
        logging.debug("Launch Method: " + self.LAUNCH_METHOD + " mpi: " + self.MPIRUN + " shell: " + self.SHELL)
        
        # init rms (SGE/PBS)
        self.init_rms()
        self.failed_polls = 0
        
        ##############################################################################
        # initialization of coordination and communication subsystem
        # Redis initialization
        self.base_url = args[2]
        self.cds_queue_url = None
        if len(args)==4:
            self.cds_queue_url = args[3]
        logger.debug("External queue: " + str(self.cds_queue_url))
        self.id = self.__get_bj_id(self.base_url)
        logger.debug("BigJob Agent arguments: " + str(args))
        logger.debug("Initialize C&C subsystem to pilot-url: " + self.base_url)
        logger.debug("BigJob ID: %s"%self.id)
        
        # create bj directory
        self.work_dir = os.getcwd()
        if self.work_dir.find(self.id)==-1: # working directory does not yet contain the BJ id
            self.bj_dir = os.path.join(os.getcwd(), self.id)
            logger.debug("Agent working directory: %s"%self.bj_dir)
            try:
                os.makedirs(self.bj_dir)
            except:
                logger.debug("Directory already exists.")
        else:
            self.bj_dir = os.getcwd()
        
        os.chdir(self.bj_dir)
        
        if(self.coordination_url.startswith("advert://") or self.coordination_url.startswith("sqlasyncadvert://")):
            try:
                from coordination.bigjob_coordination_advert import bigjob_coordination
                logging.debug("Utilizing ADVERT Backend: " + self.coordination_url)
            except:
                logger.error("Advert Backend could not be loaded")
                exc_type, exc_value, exc_traceback = sys.exc_info()
                traceback.print_exc(file=sys.stderr)
                traceback.print_tb(exc_traceback, file=sys.stderr)
        elif (self.coordination_url.startswith("redis://")):
            try:
                from coordination.bigjob_coordination_redis import bigjob_coordination      
                logger.debug("Utilizing Redis Backend: " + self.coordination_url + ". Please make sure Redis server is configured in bigjob_coordination_redis.py")
            except:
                logger.error("Error loading pyredis.")
        elif (self.coordination_url.startswith("tcp://")):
            try:
                from coordination.bigjob_coordination_zmq import bigjob_coordination
                logger.debug("Utilizing ZMQ Backend")
            except:
                logger.error("ZMQ Backend not found. Please install ZeroMQ (http://www.zeromq.org/intro:get-the-software) and " 
                      +"PYZMQ (http://zeromq.github.com/pyzmq/)")

        ###
        # Initiate coordination sub-system of both BJ agent and Pilot Data
        self.coordination = bigjob_coordination(server_connect_url=self.coordination_url)
        try:
            # initialize coordination subsystem of pilot data
            self.pilot_data_service = PilotDataService(coordination_url=self.coordination_url)
        except:
            logger.warn("Pilot-Data could not be initialized.")
            
        # update state of pilot job to running
        logger.debug("set state to : " +  str(bigjob.state.Running))
        self.coordination.set_pilot_state(self.base_url, str(bigjob.state.Running), False)
        self.pilot_description = self.coordination.get_pilot_description(self.base_url)
        
        ##############################################################################
        # start background thread for polling new jobs and monitoring current jobs
        self.resource_lock=threading.RLock()
        self.threadpool = ThreadPool(THREAD_POOL_SIZE)
        
        self.launcher_thread=threading.Thread(target=self.dequeue_new_jobs)
        self.launcher_thread.start()
        
        self.monitoring_thread=threading.Thread(target=self.start_background_thread)
        self.monitoring_thread.start()
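For orientation, a minimal sketch of how the positional arguments consumed by this constructor line up; only the argument positions (args[1] through args[3]) come from the code above, while the example URL values in the comments are illustrative assumptions.

import sys

# Hedged sketch: argument positions as read by the constructor above.
# The URL values shown in the comments are assumptions, not taken from the source.
args = sys.argv
coordination_url = args[1]   # e.g. "redis://localhost:6379"
pilot_base_url = args[2]     # pilot url; the BigJob id is later derived from it
cds_queue_url = args[3] if len(args) == 4 else None   # optional external queue url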
Example #52
0
    def execute_job(self, job_url, job_dict):
        """ obtain job attributes from c&c and execute process """
        state = str(job_dict["state"])

        if (state == str(bigjob.state.Unknown)
                or state == str(bigjob.state.New)):
            try:
                #job_dict["state"]=str(saga.job.New)
                job_id = job_dict["job-id"]
                logger.debug("Start job id %s specification %s: " %
                             (job_id, str(job_dict)))
                numberofprocesses = "1"
                try:
                    if (job_dict.has_key("NumberOfProcesses") == True):
                        numberofprocesses = job_dict["NumberOfProcesses"]
                except:
                    pass  # ignore in particular if Bliss is used

                spmdvariation = "single"
                try:
                    if (job_dict.has_key("SPMDVariation") == True):
                        spmdvariation = job_dict["SPMDVariation"]
                except:
                    pass  # ignore in particular if Bliss is used

                arguments = ""
                if (job_dict.has_key("Arguments") == True):
                    arguments_raw = job_dict['Arguments']
                    if type(arguments_raw) == types.ListType:
                        arguments_list = arguments_raw
                    else:
                        arguments_list = eval(job_dict["Arguments"])
                    for i in arguments_list:
                        arguments = arguments + " " + str(i)

                environment = os.environ
                envi = ""
                self.number_subjobs = 1
                if (job_dict.has_key("Environment") == True):
                    env_raw = job_dict['Environment']
                    if type(env_raw) == types.ListType:
                        env_list = env_raw
                    else:
                        env_list = eval(job_dict["Environment"])

                    logger.debug("Environment: " + str(env_list))
                    for i in env_list:
                        logger.debug("Eval " + i)
                        # Hack for conducting experiments on Kraken
                        # Kraken specific support for running n sub-jobs at a time
                        if i.startswith("NUMBER_SUBJOBS"):
                            self.number_subjobs = int(i.split("=")[1].strip())
                            logger.debug("NUMBER_SUBJOBS: " +
                                         str(self.number_subjobs))
                        else:
                            envi_1 = "export " + i + "; "
                            envi = envi + envi_1
                            logger.debug(envi)

                executable = job_dict["Executable"]
                executable = self.__expand_directory(executable)

                workingdirectory = os.path.join(os.getcwd(), job_id)
                if (job_dict.has_key("WorkingDirectory") == True):
                    workingdirectory = job_dict["WorkingDirectory"]
                    workingdirectory = self.__expand_directory(
                        workingdirectory)
                try:
                    os.makedirs(workingdirectory)
                except:
                    logger.debug("Directory %s already exists." %
                                 workingdirectory)
                logging.debug("Sub-Job: %s, Working_directory: %s" %
                              (job_id, workingdirectory))

                output = "stdout"
                if (job_dict.has_key("Output") == True):
                    output = job_dict["Output"]
                if not os.path.isabs(output):
                    output = os.path.join(workingdirectory, output)

                error = os.path.join(workingdirectory, "stderr")
                if (job_dict.has_key("Error") == True):
                    error = job_dict["Error"]
                if not os.path.isabs(error):
                    error = os.path.join(workingdirectory, error)

                # append job to job list
                self.jobs.append(job_url)

                #######################################################################################################
                # special setup for MPI NAMD jobs
                machinefile = self.allocate_nodes(job_dict)
                host = "localhost"
                try:
                    machine_file_handler = open(machinefile, "r")
                    node = machine_file_handler.readlines()
                    machine_file_handler.close()
                    host = node[0].strip()
                except:
                    pass

                if (machinefile == None):
                    logger.debug("Not enough resources to run: " + job_url)
                    self.coordination.set_job_state(job_url,
                                                    str(bigjob.state.New))
                    self.coordination.queue_job(self.base_url, job_url)
                    return  # job cannot be run at the moment

                #######################################################################################################
                # File Stage-In of dependent data units
                if job_dict.has_key("InputData"):
                    self.coordination.set_job_state(job_url,
                                                    str(bigjob.state.Staging))
                    self.__stage_in_data_units(eval(job_dict["InputData"]),
                                               workingdirectory)

                # File Stage-In - Move pilot-level files to working directory of sub-job
                if self.pilot_description != None:
                    try:
                        if self.pilot_description.has_key("description"):
                            file_list = eval(
                                self.pilot_description["description"])
                            if file_list != None and len(file_list) > 0:
                                logger.debug("Copy %d files to SJ work dir" %
                                             len(file_list) > 0)
                                for i in file_list:
                                    logger.debug("Process file: %s" % i)
                                    if i.find(">") > 0:
                                        base_filename = os.path.basename(
                                            i[:i.index(">")].strip())
                                        if environment.has_key(
                                                "_CONDOR_SCRATCH_DIR"):
                                            source_filename = os.path.join(
                                                environment[
                                                    "_CONDOR_SCRATCH_DIR"],
                                                base_filename)
                                        else:
                                            source_filename = os.path.join(
                                                self.work_dir, base_filename)
                                        target_filename = os.path.join(
                                            workingdirectory, base_filename)
                                        try:
                                            logger.debug("Copy: %s to %s" %
                                                         (source_filename,
                                                          target_filename))
                                            shutil.copyfile(
                                                source_filename,
                                                target_filename)
                                        except:
                                            logger.error(
                                                "Error copy: %s to %s" %
                                                (source_filename,
                                                 target_filename))
                    except:
                        logger.debug("Moving of stage-in files failed.")

                # create stdout/stderr file descriptors
                output_file = os.path.abspath(output)
                error_file = os.path.abspath(error)
                logger.debug("stdout: " + output_file + " stderr: " +
                             error_file)
                stdout = open(output_file, "w")
                stderr = open(error_file, "w")
                # build execution command
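                # Launch-method dispatch handled below (summary):
                #   aprun          -> Cray ALPS: "-n <ranks>" for MPI, "-n <subjobs> -d <depth>" otherwise
                #   ibrun + "mpi"  -> mpirun_rsh with the allocated machinefile
                #   local          -> run directly in the sub-job working directory
                #   ssh (default)  -> MPIRUN -np/-machinefile for MPI, plain or ssh-wrapped command otherwise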
                if self.LAUNCH_METHOD == "aprun":
                    if (spmdvariation.lower() == "mpi"):
                        command = envi + "aprun  -n " + str(
                            numberofprocesses
                        ) + " " + executable + " " + arguments
                    else:
                        #env_strip = envi.strip()
                        #env_command = env_strip[:(len(env_strip)-1)]
                        command = envi + "aprun  -n " + str(
                            self.number_subjobs
                        ) + " -d " + numberofprocesses + " " + executable + " " + arguments

                    # MPMD Mode => all subjobs on Kraken fail because aprun returns 1 as returncode
                    #command = "aprun"
                    #for i in range(0, self.number_subjobs):
                    #    command = command +   " -d " + numberofprocesses + " " + executable + " " + arguments
                    #    # + " 1 > "+ str(i)+ "-out.txt " + " 2 > "+ str(i)+ "-err.txt"
                    #    if i != self.number_subjobs-1:
                    #        command = command + " : "
                elif self.LAUNCH_METHOD == "ibrun" and spmdvariation.lower(
                ) == "mpi":
                    # Non MPI launch is handled via standard SSH
                    command = envi + "mpirun_rsh   -np " + str(
                        numberofprocesses
                    ) + " -hostfile " + machinefile + "  `build_env.pl` " + executable + " " + arguments
                elif (spmdvariation.lower() != "mpi"):
                    command = envi + executable + " " + arguments
                    # In particular for Condor - if executable is staged x flag is not set
                    #command ="chmod +x " + executable +";export PATH=$PATH:" + workingdirectory + ";" +command
                else:
                    # Environment variables need to be handled later!
                    command = envi + executable + " " + arguments

                # add working directory and ssh command
                if self.LAUNCH_METHOD == "aprun" or (
                        self.LAUNCH_METHOD == "ibrun"
                        and spmdvariation.lower() == "mpi"):
                    command = "cd " + workingdirectory + "; " + command
                elif self.LAUNCH_METHOD == "local":
                    command = "cd " + workingdirectory + "; " + command
                else:  # ssh launch is default
                    if (spmdvariation.lower() == "mpi"):
                        command = "cd " + workingdirectory + "; " + envi + self.MPIRUN + " -np " + numberofprocesses + " -machinefile " + machinefile + " " + executable + " " + arguments
                    elif host == "localhost":
                        command = "cd " + workingdirectory + "; " + command
                    else:
                        command = "ssh  " + host + " \'cd " + workingdirectory + "; " + command + "\'"

                # start application process
                shell = self.SHELL
                logger.debug("execute: " + command + " in " +
                             workingdirectory + " from: " +
                             str(socket.gethostname()) + " (Shell: " + shell +
                             ")")
                # bash works fine for launching on QB but fails for Abe :-(
                p = subprocess.Popen(args=command,
                                     executable=shell,
                                     stderr=stderr,
                                     stdout=stdout,
                                     cwd=workingdirectory,
                                     env=environment,
                                     shell=True)
                logger.debug("started " + command)
                self.processes[job_url] = p
                self.coordination.set_job_state(job_url,
                                                str(bigjob.state.Running))
            except:
                traceback.print_exc(file=sys.stderr)
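For reference, a hedged sketch of the kind of job dictionary execute_job() consumes; the key names mirror the has_key() lookups above, while the concrete values and the state string are illustrative assumptions.

# Illustrative sub-job description as expected by execute_job(); only the key
# names are taken from the code above -- every value here is an assumption.
job_dict = {
    "job-id": "sj-00001",
    "state": "New",                      # assumed str(bigjob.state.New)
    "Executable": "/bin/echo",
    "Arguments": ["hello", "world"],     # list, or a string repr that gets eval'ed
    "NumberOfProcesses": "4",
    "SPMDVariation": "single",           # "mpi" selects the mpirun/aprun branches
    "Environment": ["FOO=bar"],          # "NUMBER_SUBJOBS=<n>" entries are treated specially
    "WorkingDirectory": "/tmp/sj-00001",
    "Output": "stdout",                  # relative paths resolve against the working directory
    "Error": "stderr",
    # optional: "InputData" triggers stage-in of data units before execution
}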
Example #53
0
    def __init__(self, args):

        self.coordination_url = args[1]
        # objects to store running jobs and processes
        self.jobs = []
        self.processes = {}
        self.freenodes = []
        self.busynodes = []
        self.restarted = {}

        # read config file
        conf_file = os.path.dirname(
            os.path.abspath(__file__)) + "/../" + CONFIG_FILE
        if not os.path.exists(conf_file):
            conf_file = os.path.join(sys.prefix, CONFIG_FILE)
        logging.debug("read configfile: " + conf_file)
        config = ConfigParser.ConfigParser()
        config.read(conf_file)
        default_dict = config.defaults()
        self.CPR = False
        if default_dict.has_key("cpr"):
            self.CPR = default_dict["cpr"]
        self.SHELL = "/bin/bash"
        if default_dict.has_key("shell"):
            self.SHELL = default_dict["shell"]
        self.MPIRUN = "mpirun"
        # On TACC resources the default MPICH is
        # linked under mpirun_rsh
        if default_dict.has_key("mpirun"):
            self.MPIRUN = default_dict["mpirun"]

        if default_dict.has_key("number_executor_threads"):
            THREAD_POOL_SIZE = int(default_dict["number_executor_threads"])

        self.OUTPUT_TAR = False
        if default_dict.has_key("create_output_tar"):
            self.OUTPUT_TAR = eval(default_dict["create_output_tar"])
            logger.debug("Create output tar: %r", self.OUTPUT_TAR)

        self.failed_polls = 0

        ##############################################################################
        # initialization of coordination and communication subsystem
        # Redis initialization
        self.base_url = args[2]
        self.cds_queue_url = None
        if len(args) == 4:
            self.cds_queue_url = args[3]
        logger.debug("External queue: " + str(self.cds_queue_url))
        self.id = self.__get_bj_id(self.base_url)
        logger.debug("BigJob Agent arguments: " + str(args))
        logger.debug("Initialize C&C subsystem to pilot-url: " + self.base_url)
        logger.debug("BigJob ID: %s" % self.id)

        # create bj directory
        self.work_dir = os.getcwd()
        if self.work_dir.find(
                self.id) == -1:  # working directory does not yet contain BJ id
            self.bj_dir = os.path.join(os.getcwd(), self.id)
            logger.debug("Agent working directory: %s" % self.bj_dir)
            try:
                os.makedirs(self.bj_dir)
            except:
                logger.debug("Directory already exists.")
        else:
            self.bj_dir = os.getcwd()

        os.chdir(self.bj_dir)

        if (self.coordination_url.startswith("advert://")
                or self.coordination_url.startswith("sqlasyncadvert://")):
            try:
                from coordination.bigjob_coordination_advert import bigjob_coordination
                logging.debug("Utilizing ADVERT Backend: " +
                              self.coordination_url)
            except:
                logger.error("Advert Backend could not be loaded")
                exc_type, exc_value, exc_traceback = sys.exc_info()
                traceback.print_exc(file=sys.stderr)
                traceback.print_tb(exc_traceback, file=sys.stderr)
        elif (self.coordination_url.startswith("redis://")):
            try:
                from coordination.bigjob_coordination_redis import bigjob_coordination
                logger.debug("Utilizing Redis Backend: " +
                             self.coordination_url + ".")
            except:
                logger.error(
                    "Error loading pyredis. Check configuration in bigjob_coordination_redis.py."
                )
        elif (self.coordination_url.startswith("tcp://")):
            try:
                from coordination.bigjob_coordination_zmq import bigjob_coordination
                logger.debug("Utilizing ZMQ Backend")
            except:
                logger.error(
                    "ZMQ Backend not found. Please install ZeroMQ (http://www.zeromq.org/intro:get-the-software) and "
                    + "PYZMQ (http://zeromq.github.com/pyzmq/)")

        ###
        # Initiate coordination sub-system of both BJ agent and Pilot Data
        self.coordination = bigjob_coordination(
            server_connect_url=self.coordination_url)
        try:
            # initialize coordination subsystem of pilot data
            self.pilot_data_service = PilotDataService(
                coordination_url=self.coordination_url)
        except:
            logger.warn("Pilot-Data could not be initialized.")

        # update state of pilot job to running
        logger.debug("set state to : " + str(bigjob.state.Running))
        self.coordination.set_pilot_state(self.base_url,
                                          str(bigjob.state.Running), False)
        self.pilot_description = self.coordination.get_pilot_description(
            self.base_url)
        try:
            self.pilot_description = ast.literal_eval(self.pilot_description)
        except:
            logger.warn("Unable to parse pilot description")
            self.pilot_description = None

        ############################################################################
        # Detect launch method
        self.LAUNCH_METHOD = "ssh"
        if default_dict.has_key("launch_method"):
            self.LAUNCH_METHOD = default_dict["launch_method"]

        self.LAUNCH_METHOD = self.__get_launch_method(self.LAUNCH_METHOD)

        logging.debug("Launch Method: " + self.LAUNCH_METHOD + " mpi: " +
                      self.MPIRUN + " shell: " + self.SHELL)

        # init rms (SGE/PBS)
        self.init_rms()

        ##############################################################################
        # start background thread for polling new jobs and monitoring current jobs
        # check whether user requested a certain threadpool size
        if self.pilot_description != None and self.pilot_description.has_key(
                "number_executor_threads"):
            THREAD_POOL_SIZE = int(
                self.pilot_description["number_executor_threads"])
        logger.debug("Creating executor thread pool of size: %d" %
                     (THREAD_POOL_SIZE))
        self.resource_lock = threading.RLock()
        self.threadpool = ThreadPool(THREAD_POOL_SIZE)

        self.launcher_thread = threading.Thread(target=self.dequeue_new_jobs)
        self.launcher_thread.start()

        self.monitoring_thread = threading.Thread(
            target=self.start_background_thread)
        self.monitoring_thread.start()
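Finally, a hedged sketch of a [DEFAULT] configuration section that would exercise every option this constructor looks up; the values and the in-memory file are assumptions, only the option names come from the code above.

import ConfigParser
import StringIO

# Illustrative agent configuration; only the option names ("cpr", "shell",
# "mpirun", "number_executor_threads", "create_output_tar", "launch_method")
# are taken from the constructor above -- the values are assumptions.
sample_conf = """
[DEFAULT]
cpr = False
shell = /bin/bash
mpirun = mpirun
number_executor_threads = 4
create_output_tar = False
launch_method = ssh
"""

config = ConfigParser.ConfigParser()
config.readfp(StringIO.StringIO(sample_conf))
print config.defaults()   # same dict shape the constructor reads via config.defaults()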