Esempio n. 1
0
    def __init__(self, conf_file="/default/conf/file/"):
        """Remember the job configuration path and parse the file.

        conf_file -- path of the configuration file to load.

        Raises RuntimeError when ``conf_file`` does not exist.
        """
        self.conf_file = conf_file
        darelogger.info('Loading conf file %s' % conf_file)

        # Check for the file explicitly so a wrong path is reported here
        # rather than surfacing later as missing configuration values.
        conf_present = os.path.exists(self.conf_file)
        if not conf_present:
            raise RuntimeError("Cannot find %s " % self.conf_file)

        # Parse the job configuration file.
        parser = ConfigParser.ConfigParser()
        self.config = parser
        parser.read(self.conf_file)
Esempio n. 2
0
 def check_to_start_step(self, step_id):
     """Tell whether the step identified by ``step_id`` may be started.

     A step in state ``StepUnitStates.New`` may start only when every step
     listed in its ``start_after_steps`` is ``StepUnitStates.Done``.  A step
     in any other state is reported as startable without looking at its
     dependencies.

     Returns True when the step may start, False otherwise.
     """
     step_unit = self.workflow.step_units_repo[step_id]
     dependencies = step_unit.UnitInfo['start_after_steps']
     darelogger.info(dependencies)

     ready = True
     if step_unit.get_status() == StepUnitStates.New:
         for dep_step_id in dependencies:
             # Read the status once so the logged value is exactly the one
             # the decision is based on, even if another thread updates the
             # dependency in the meantime.
             dep_status = self.workflow.step_units_repo[dep_step_id].get_status()
             if dep_status != StepUnitStates.Done:
                 ready = False
             # Keep visiting the remaining dependencies so each status is
             # logged.
             darelogger.info(dep_status)
     return ready
Esempio n. 3
0
    def __init__(self, conf_file="/default/conf/file/"):
        """Store the configuration file path and load its contents.

        conf_file -- location of the job configuration file.

        Raises RuntimeError if no file exists at ``conf_file``.
        """
        self.conf_file = conf_file
        darelogger.info('Loading conf file %s' % conf_file)

        if os.path.exists(self.conf_file):
            # Parse the job configuration file.
            job_config = configparser.ConfigParser()
            self.config = job_config
            job_config.read(self.conf_file)
            return

        raise RuntimeError("Cannot find %s " % self.conf_file)
Esempio n. 4
0
    def create_static_workflow(self):
        """Process the configuration file, then run the preparation stages.

        The stages run in a fixed order: pilot units, step units, compute
        units and finally data units.
        """
        self.process_config_file()
        darelogger.info("Done Reading DARE Config File")

        # Each stage is looked up and invoked in turn, in the listed order.
        stage_names = (
            "prepare_pilot_units",
            "prepare_step_units",
            "prepare_compute_units",
            "prepare_data_units",
        )
        for stage_name in stage_names:
            getattr(self, stage_name)()
Esempio n. 5
0
    def start_step(self, step_id=None):
        """Wait until a step may start, then run it.

        step_id -- id of the step to run.  When omitted, the id is read from
            ``self.start_thread_step_id`` while holding
            ``self.step_start_lock``.  Passing the id explicitly avoids
            depending on that shared attribute still holding the intended
            value when this method gets to read it.

        Polls ``check_to_start_step`` every 10 seconds until it returns a
        true value, then calls ``run_step`` once.
        """
        if step_id is None:
            # ``with`` releases the lock even if the attribute read raises.
            with self.step_start_lock:
                step_id = self.start_thread_step_id

        while True:
            darelogger.info(" Checking to start step %s " % step_id)
            if self.check_to_start_step(step_id):
                break
            darelogger.info(" Cannot start this step %s sleeping..." % step_id)
            time.sleep(10)

        self.run_step(step_id)
Esempio n. 6
0
 def check_to_start_step(self, step_id):
     """Report whether step ``step_id`` is allowed to start.

     For a step whose status is ``StepUnitStates.New`` the answer is True
     only if every entry of its ``start_after_steps`` has status
     ``StepUnitStates.Done``.  For a step in any other status the
     dependencies are not inspected and True is returned.
     """
     repo = self.workflow.step_units_repo
     candidate = repo[step_id]
     prerequisites = candidate.UnitInfo['start_after_steps']
     darelogger.info(prerequisites)

     can_start = True
     if candidate.get_status() == StepUnitStates.New:
         for dep_step_id in prerequisites:
             # One status read per dependency: the value that is logged is
             # the value that was tested, even while other threads update
             # step statuses.
             dep_status = repo[dep_step_id].get_status()
             if dep_status != StepUnitStates.Done:
                 can_start = False
             # No early exit, so the status of every dependency is logged.
             darelogger.info(dep_status)
     return can_start
Esempio n. 7
0
    def start_step(self, step_id=None):
        """Block until the step's prerequisites allow it, then run the step.

        step_id -- optional id of the step to run.  If it is None the id is
            taken from ``self.start_thread_step_id`` under
            ``self.step_start_lock``; supplying it directly removes the
            reliance on that shared attribute.

        ``check_to_start_step`` is polled at 10 second intervals; once it
        returns a true value ``run_step`` is called exactly once.
        """
        if step_id is None:
            # The context manager guarantees the lock is released even when
            # reading the attribute fails.
            with self.step_start_lock:
                step_id = self.start_thread_step_id

        while True:
            darelogger.info(" Checking to start step %s " % step_id)
            if self.check_to_start_step(step_id):
                self.run_step(step_id)
                return
            darelogger.info(" Cannot start this step %s sleeping..." %
                            step_id)
            time.sleep(10)
Esempio n. 8
0
    def prepare_compute_units(self):
        """Create one compute-unit description per input name of every step.

        For each step listed in ``self.dare_conf_main['steps']`` the step's
        section is read from ``self.dare_conf_full``, the compute-unit
        template named by ``cu_type`` is loaded from the step's
        ``step_cfg_file`` (or the bundled ``daredb/echo_hello.cu`` when that
        option is missing or ``default``), and a description is stored in
        ``self.compute_units_repo`` and attached to the owning step unit for
        each comma-separated entry of ``input_names``.

        Raises SystemExit (status 1) when a step has no readable section.
        """
        darelogger.info("Starting to prepare Compute Units ")

        for step in self.dare_conf_main['steps'].split(','):
            darelogger.info("Preparing compute Units: %s" % step)

            try:
                step_info_from_main_cfg = self.dare_conf_full.SectionDict(step.strip())
            except Exception:
                # Only genuine lookup failures are handled here; a bare
                # ``except`` would also swallow KeyboardInterrupt.  Exit with
                # a non-zero status so the failure is visible to the caller.
                darelogger.info("step description section not found for step %s" % step)
                sys.exit(1)

            step_cfg_file = step_info_from_main_cfg.get('step_cfg_file', 'undefined_step_file').strip()
            if step_cfg_file.lower() in ('default', 'undefined_step_file'):
                step_cfg_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'daredb', 'echo_hello.cu')

            all_cu_confs = CfgParser(step_cfg_file)
            cu_conf = all_cu_confs.SectionDict(step_info_from_main_cfg.get('cu_type', 'default').strip())

            # These values do not depend on the input file, so compute them
            # once per step.  The resource defaults to the first used pilot.
            cu_step_id = "step-%s-%s" % (step_info_from_main_cfg.get('step_name').strip(), self.dare_id)
            resource = step_info_from_main_cfg.get('resource', self.dare_conf_main['used_pilots'].split(',')[0]).strip()

            # NOTE: ''.split(',') yields [''], so a step without input_names
            # still gets exactly one compute unit (with an empty input name).
            for input_file in step_info_from_main_cfg.get('input_names', '').split(','):
                input_file = input_file.strip()
                cu_uuid = "cu-%s" % (uuid.uuid1(),)

                compute_unit_description = {
                    "executable": cu_conf["executable"],
                    "arguments": self.prepare_cu_arguments(input_file, cu_conf),
                    "total_core_count": 1,
                    "number_of_processes": 1,
                    "output": "dare-cu-stdout-" + cu_uuid + ".txt",
                    "error": "dare-cu-stderr-" + cu_uuid + ".txt",
                    "affinity_datacenter_label": "%s-adl" % resource,
                    "affinity_machine_label": "%s-aml" % resource,
                }

                self.compute_units_repo[cu_uuid] = compute_unit_description
                # Register this compute unit with its step.
                self.step_units_repo[cu_step_id].add_cu(cu_uuid)

        darelogger.info("Done preparing compute Units ")
Esempio n. 9
0
    def start(self):
        """Create the pilot services, run every step in a thread, then quit.

        Compute and data pilots are created from the workflow's pilot
        repositories and attached to a ``ComputeDataServiceDecentral``.  One
        thread running ``start_step`` is launched per entry of
        ``self.workflow.step_units_repo``; this method then polls every 10
        seconds until no step thread is alive and finally calls
        ``self.quit``.
        """
        darelogger.info("Creating Compute Engine service ")
        self.pilot_compute_service = PilotComputeService(
            coordination_url=COORDINATION_URL)
        self.pilot_data_service = PilotDataService(
            coordination_url=COORDINATION_URL)

        for desc in self.workflow.compute_pilot_repo.values():
            self.pilot_compute_service.create_pilot(
                pilot_compute_description=desc)

        for desc in self.workflow.data_pilot_repo.values():
            self.data_pilot_service_repo.append(
                self.pilot_data_service.create_pilot(
                    pilot_data_description=desc))

        self.compute_data_service = ComputeDataServiceDecentral()
        self.compute_data_service.add_pilot_compute_service(
            self.pilot_compute_service)
        self.compute_data_service.add_pilot_data_service(
            self.pilot_data_service)

        ### run the steps
        self.step_start_lock = threading.RLock()
        self.step_run_lock = threading.RLock()

        # Iterate over a snapshot of the ids: step threads are already
        # running while later steps are still being submitted.
        for step_id in list(self.workflow.step_units_repo.keys()):
            darelogger.info(" Sumitted step %s " % step_id)
            # NOTE(review): the id is handed to the thread through
            # self.start_thread_step_id.  The lock only guards the individual
            # write; nothing here waits for the new thread to read the value
            # before the next iteration overwrites it -- confirm whether two
            # threads can end up with the same step id.
            with self.step_start_lock:
                self.start_thread_step_id = step_id
            worker = threading.Thread(target=self.start_step)
            self.step_threads[step_id] = worker
            worker.start()

        while True:
            count_step = [
                worker.is_alive() for worker in self.step_threads.values()
            ]
            darelogger.info('count_step %s' % count_step)
            # Also leave the loop when there are no step threads at all;
            # otherwise an empty workflow would wait here forever.
            if not any(count_step):
                break
            time.sleep(10)

        darelogger.info(" All Steps Done processing")

        self.quit(message='quit gracefully')
Esempio n. 10
0
    def start(self):
        """Create the compute pilots, run every step in a thread, then cancel.

        Compute pilots are created from ``self.workflow.compute_pilot_repo``
        and attached to a ``ComputeDataService``.  A ``PilotDataService`` is
        instantiated, but no data pilots are created or attached in this
        version.  One thread running ``start_step`` is launched per step;
        this method polls every 10 seconds until no step thread is alive and
        then calls ``self.cancel``.
        """
        from pilot import PilotComputeService, PilotDataService, ComputeDataService, State

        darelogger.info("Create Compute Engine service ")

        self.pilot_compute_service = PilotComputeService(coordination_url=COORDINATION_URL)
        self.pilot_data_service = PilotDataService()

        for desc in self.workflow.compute_pilot_repo.values():
            self.compute_pilot_service_repo.append(
                self.pilot_compute_service.create_pilot(pilot_compute_description=desc))

        self.compute_data_service = ComputeDataService()
        self.compute_data_service.add_pilot_compute_service(self.pilot_compute_service)

        self.step_thread = {}

        ### run the steps
        self.step_start_lock = threading.RLock()
        self.step_run_lock = threading.RLock()

        # Iterate over a snapshot of the ids: step threads are already
        # running while later steps are still being submitted.
        for step_id in list(self.workflow.step_units_repo.keys()):
            darelogger.info(" Sumitted step %s " % step_id)
            # NOTE(review): the id reaches the thread through
            # self.start_thread_step_id; nothing here waits for the thread to
            # read it before the next iteration overwrites it -- confirm
            # whether two threads can pick up the same step id.
            with self.step_start_lock:
                self.start_thread_step_id = step_id

            worker = threading.Thread(target=self.start_step)
            self.step_thread[step_id] = worker
            worker.start()

        while True:
            count_step = [worker.is_alive() for worker in self.step_thread.values()]
            darelogger.info('count_step %s' % count_step)
            # Also leave the loop when no step thread exists; otherwise a
            # workflow without steps would wait here forever.
            if not any(count_step):
                break
            time.sleep(10)

        darelogger.info(" All Steps Done processing")

        self.cancel()
Esempio n. 11
0
    def prepare_pilot_units(self):
        """Build the compute and data pilot descriptions for each used pilot.

        For every comma-separated name in
        ``self.dare_conf_main['used_pilots']`` the pilot's section is read
        from ``self.dare_conf_full`` and from its pilot configuration file
        (the bundled ``daredb/pilot.cfg`` when ``pilot_config_file`` is
        missing or ``default``).  One description is stored in
        ``self.compute_pilot_repo`` and one in ``self.data_pilot_repo``, each
        under a freshly generated id.
        """
        darelogger.info("Starting to prepare pilot Units")

        for pilot in self.dare_conf_main['used_pilots'].split(','):
            pilot = pilot.strip()
            compute_pilot_uuid = "compute-pilot-%s-%s" % (pilot, str(uuid.uuid1()))
            # Data pilots get their own prefix so the two kinds of id can be
            # told apart (both used to be labelled "compute-pilot").
            data_pilot_uuid = "data-pilot-%s-%s" % (pilot, str(uuid.uuid1()))

            pilot_info_from_main_cfg = self.dare_conf_full.SectionDict(pilot)

            darelogger.info("Preparing pilot unit for  %s" % pilot)

            pilot_config_file = pilot_info_from_main_cfg.get('pilot_config_file', "undefined_pilot_file")
            if pilot_config_file.lower() in ('default', 'undefined_pilot_file'):
                pilot_config_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'daredb', 'pilot.cfg')

            pilot_config_from_db = CfgParser(pilot_config_file)
            info_pilot = pilot_config_from_db.SectionDict(pilot)

            # Description used to create the compute pilot.
            pilot_compute_description = {
                "service_url": info_pilot['service_url'],
                "working_directory": info_pilot['working_directory'],
                'affinity_datacenter_label': '%s-adl' % pilot,
                'affinity_machine_label': '%s-aml' % pilot,
                "number_of_processes": int(pilot_info_from_main_cfg['number_of_processes']),
                "walltime": int(pilot_info_from_main_cfg['walltime']),
            }
            self.compute_pilot_repo[compute_pilot_uuid] = pilot_compute_description

            # NOTE(review): the datacenter label suffix here is '-dcl' while
            # the compute pilot above uses '-adl' -- confirm whether the two
            # labels are meant to match.
            pilot_data_description = {
                "service_url": info_pilot['data_service_url'],
                "size": 100,
                "affinity_datacenter_label": pilot + '-dcl',
                "affinity_machine_label": pilot + '-aml',
            }
            self.data_pilot_repo[data_pilot_uuid] = pilot_data_description

        darelogger.info("Done preparing Pilot Units ")
Esempio n. 12
0
    def prepare_step_units(self):
        """Create a ``StepUnit`` for every step named in the main config.

        Each step's section is read from ``self.dare_conf_full``; the unit is
        stored in ``self.step_units_repo`` under ``step-<step_name>-<dare_id>``.
        Names listed in the section's ``start_after_steps`` option are turned
        into ids of the same form.

        Raises SystemExit (status 1) when a step has no readable section.
        """
        darelogger.info("Starting to prepare Step Units ")

        #TODO:: check for same names

        for step in self.dare_conf_main['steps'].split(','):
            darelogger.info("Preparing Step Units: %s" % step)

            try:
                step_info_from_main_cfg = self.dare_conf_full.SectionDict(step.strip())
            except Exception:
                # Only genuine lookup failures are handled here; a bare
                # ``except`` would also swallow KeyboardInterrupt.  Exit with
                # a non-zero status so the failure is visible to the caller.
                darelogger.info("step description section not found for step %s" % step)
                sys.exit(1)

            dependency_names = step_info_from_main_cfg.get('start_after_steps')
            start_after_steps = []
            if dependency_names:
                # Skip blank entries (e.g. from a trailing comma): they would
                # become ids that no step unit is ever registered under.
                start_after_steps = [
                    "step-%s-%s" % (name.strip(), self.dare_id)
                    for name in dependency_names.split(',')
                    if name.strip()
                ]

            step_name = step_info_from_main_cfg.get('step_name').strip()
            step_unit_uuid = "step-%s-%s" % (step_name, self.dare_id)
            info_steps = {
                "step_id": step_unit_uuid,
                "dare_web_id": self.dare_web_id,
                "name": step_name,
                "status": StepUnitStates.New,
                "pilot": step_info_from_main_cfg.get('pilot'),
                "start_after_steps": start_after_steps,
                "compute_units": [],
                "transfer_input_data_units": [],
                "transfer_output_data_units": [],
            }

            su = StepUnit()
            su.define_param(info_steps)
            self.step_units_repo[step_unit_uuid] = su

        darelogger.info("Done preparing Step Units ")
Esempio n. 13
0
    def prepare_data_units(self):
        """Create one input data-unit description for every configured step.

        The comma-separated ``input_names`` of each step section become the
        ``file_urls`` of a description stored in ``self.data_units_repo``;
        the new data unit id is then registered as an input of the owning
        step unit.

        Raises SystemExit (status 1) when a step has no readable section.
        """
        darelogger.info("Starting to prepare Data Units ")

        for step in self.dare_conf_main['steps'].split(','):
            darelogger.info("Preparing Data Units: %s" % step)

            try:
                step_info_from_main_cfg = self.dare_conf_full.SectionDict(step.strip())
            except Exception:
                # Only genuine lookup failures are handled here; a bare
                # ``except`` would also swallow KeyboardInterrupt.  Exit with
                # a non-zero status so the failure is visible to the caller.
                darelogger.info("step description section not found for step %s" % step)
                sys.exit(1)

            # Strip each name, as prepare_compute_units does, so that
            # "a, b" does not produce a URL with a leading space.
            absolute_url_list = [
                name.strip()
                for name in step_info_from_main_cfg.get('input_names', '').split(',')
            ]

            du_uuid = "du-%s" % (uuid.uuid1(),)

            # Step units are keyed by the section's step_name, so build the id
            # from it; fall back to the section name when the option is absent.
            step_name = step_info_from_main_cfg.get('step_name', step).strip()
            du_step_id = "step-%s-%s" % (step_name, self.dare_id)

            # NOTE(review): the affinity labels are hard-coded here rather
            # than derived from the pilot configuration -- confirm intent.
            data_unit_description = {
                "file_urls": absolute_url_list,
                "affinity_datacenter_label": "eu-de-south",
                "affinity_machine_label": "mymachine-1",
            }

            self.data_units_repo[du_uuid] = data_unit_description
            # Register this data unit as an input of its step.
            self.step_units_repo[du_step_id].add_input_du(du_uuid)
Esempio n. 14
0
    def run_step(self, step_id):
        """Submit all compute units of a step and wait until they finish.

        The step's data units are submitted first and their URLs are placed
        into each compute-unit description as ``input_data`` /
        ``output_data``.  Job states are then polled every 5 seconds until
        ``self.has_finished`` is true for every job, after which the step is
        marked ``StepUnitStates.Done``.  ``self.updater`` is notified when
        the step starts and when it is done.
        """
        step_unit = self.workflow.step_units_repo[step_id]
        this_su = step_unit.UnitInfo
        self.updater.update_status(
            this_su['dare_web_id'],
            "%s in step %s" % ('Running', this_su['name']))

        darelogger.info(" Started running %s " % step_id)

        jobs = []
        job_start_times = {}
        job_states = {}
        for cu_id in this_su['compute_units']:
            # NOTE(review): this is the repository's own dict, so the pops
            # and assignments below change the stored description in place.
            compute_unit_desc = self.workflow.compute_units_repo[cu_id]
            input_dus = compute_unit_desc.pop('input_data_units')
            output_dus = compute_unit_desc.pop('output_data_units')

            input_data_units = [
                self.compute_data_service.submit_data_unit(
                    self.workflow.data_units_repo[du_id])
                for du_id in input_dus
            ]
            output_data_units = [
                self.compute_data_service.submit_data_unit(
                    self.workflow.data_units_repo[du_id])
                for du_id in output_dus
            ]

            compute_unit_desc["input_data"] = [
                du.get_url() for du in input_data_units
            ]
            compute_unit_desc["output_data"] = [{
                du.get_url(): ['std*']
            } for du in output_data_units]
            compute_unit = self.compute_data_service.submit_compute_unit(
                compute_unit_desc)

            darelogger.info("Compute Unit: Description: \n%s" %
                            (str(compute_unit_desc)))
            jobs.append(compute_unit)
            job_start_times[compute_unit] = time.time()
            job_states[compute_unit] = compute_unit.get_state()

        darelogger.debug(
            "************************ All Jobs submitted ************************"
        )

        while True:
            finish_counter = 0
            result_map = {}
            for job in jobs:
                old_state = job_states[job]
                state = job.get_state()
                result_map[state] = result_map.get(state, 0) + 1

                changed = old_state != state
                if changed:
                    darelogger.debug("Job " + str(job) +
                                     " changed from: " + old_state + " to " +
                                     state)
                finished = self.has_finished(state)
                if finished:
                    finish_counter += 1
                    if changed:
                        # Report the runtime once, on the transition into a
                        # finished state.
                        darelogger.info("%s step Job: " % (this_su['name']) +
                                        str(job) + " Runtime: " +
                                        str(time.time() -
                                            job_start_times[job]) + " s.")
                job_states[job] = state

            darelogger.debug("Current states: " + str(result_map))
            # Test for completion before sleeping so a finished step is not
            # delayed by one extra polling interval.
            if finish_counter == len(jobs):
                break
            time.sleep(5)

        step_unit.set_status(StepUnitStates.Done)

        darelogger.debug(" Compute jobs for step %s complete" % step_id)

        # All jobs done: update status.
        self.updater.update_status(this_su['dare_web_id'],
                                   "%s is Done" % this_su['name'])
Esempio n. 15
0
    def run_step(self, step_id):
        """Run every compute unit of step ``step_id`` and wait for completion.

        Input and output data units are submitted to
        ``self.compute_data_service`` and referenced by URL from the
        compute-unit description before the compute unit itself is
        submitted.  The method then polls the job states every 5 seconds
        until ``self.has_finished`` holds for all of them, sets the step to
        ``StepUnitStates.Done`` and reports progress through
        ``self.updater`` at the start and at the end.
        """
        step_unit = self.workflow.step_units_repo[step_id]
        this_su = step_unit.UnitInfo
        self.updater.update_status(this_su['dare_web_id'], "%s in step %s" % ('Running', this_su['name']))

        darelogger.info(" Started running %s " % step_id)

        submit_du = self.compute_data_service.submit_data_unit
        jobs = []
        job_start_times = {}
        job_states = {}
        for cu_id in this_su['compute_units']:
            # NOTE(review): the description is the repository's own dict, so
            # it is modified in place by the pops and assignments below.
            compute_unit_desc = self.workflow.compute_units_repo[cu_id]
            input_dus = compute_unit_desc.pop('input_data_units')
            output_dus = compute_unit_desc.pop('output_data_units')

            input_data_units = [submit_du(self.workflow.data_units_repo[du_id]) for du_id in input_dus]
            output_data_units = [submit_du(self.workflow.data_units_repo[du_id]) for du_id in output_dus]

            compute_unit_desc["input_data"] = [du.get_url() for du in input_data_units]
            compute_unit_desc["output_data"] = [{du.get_url(): ['std*']} for du in output_data_units]
            compute_unit = self.compute_data_service.submit_compute_unit(compute_unit_desc)

            darelogger.info("Compute Unit: Description: \n%s" % (str(compute_unit_desc)))
            jobs.append(compute_unit)
            job_start_times[compute_unit] = time.time()
            job_states[compute_unit] = compute_unit.get_state()

        darelogger.debug("************************ All Jobs submitted ************************")

        while True:
            finish_counter = 0
            result_map = {}
            for job in jobs:
                old_state = job_states[job]
                state = job.get_state()
                result_map[state] = result_map.get(state, 0) + 1

                state_changed = old_state != state
                if state_changed:
                    darelogger.debug("Job " + str(job) + " changed from: " + old_state + " to " + state)
                job_finished = self.has_finished(state)
                if job_finished and state_changed:
                    # Report the runtime once, when the job first shows up
                    # as finished.
                    runtime = time.time() - job_start_times[job]
                    darelogger.info("%s step Job: " % (this_su['name']) + str(job) + " Runtime: " + str(runtime) + " s.")
                if job_finished:
                    finish_counter += 1
                job_states[job] = state

            darelogger.debug("Current states: " + str(result_map))
            # Check for completion first so that a finished step does not
            # wait out one more polling interval.
            if finish_counter == len(jobs):
                break
            time.sleep(5)

        step_unit.set_status(StepUnitStates.Done)

        darelogger.debug(" Compute jobs for step %s complete" % step_id)

        # All jobs done: update status.
        self.updater.update_status(this_su['dare_web_id'], "%s is Done" % this_su['name'])