def __init__(self, conf_file="/default/conf/file/"):
    """Load the job configuration file, failing fast if it does not exist."""
    self.conf_file = conf_file
    darelogger.info('Loading conf file %s' % conf_file)
    if not os.path.exists(self.conf_file):
        raise RuntimeError("Cannot find %s " % self.conf_file)
    # parse job conf file
    parser = ConfigParser.ConfigParser()
    parser.read(self.conf_file)
    self.config = parser
def check_to_start_step(self, step_id):
    """Return True when every prerequisite of *step_id* has reached Done.

    Only a step still in the New state has its dependencies inspected;
    any other state yields True (nothing left to wait for here).
    """
    unmet = []
    step_unit = self.workflow.step_units_repo[step_id]
    darelogger.info(step_unit.UnitInfo['start_after_steps'])
    if step_unit.get_status() == StepUnitStates.New:
        for dep_id in step_unit.UnitInfo['start_after_steps']:
            if self.workflow.step_units_repo[dep_id].get_status() != StepUnitStates.Done:
                unmet.append(False)
                darelogger.info(self.workflow.step_units_repo[dep_id].get_status())
    return False not in unmet
def __init__(self, conf_file="/default/conf/file/"):
    """Read the job configuration from *conf_file*; raise if the file is missing."""
    self.conf_file = conf_file
    darelogger.info('Loading conf file %s' % conf_file)
    if not os.path.exists(self.conf_file):
        raise RuntimeError("Cannot find %s " % self.conf_file)
    # parse the job configuration file
    cfg = configparser.ConfigParser()
    cfg.read(self.conf_file)
    self.config = cfg
def create_static_workflow(self):
    """Build the full static workflow: parse the config, then prepare each unit type in order."""
    self.process_config_file()
    darelogger.info("Done Reading DARE Config File")
    # order matters: steps reference pilots, compute/data units reference steps
    for prepare in (self.prepare_pilot_units,
                    self.prepare_step_units,
                    self.prepare_compute_units,
                    self.prepare_data_units):
        prepare()
def start_step(self):
    """Worker-thread entry point: poll until the step's dependencies allow it, then run it."""
    # the launching thread hands us our step id under the lock
    with self.step_start_lock:
        step_id = self.start_thread_step_id
    while True:
        darelogger.info(" Checking to start step %s " % step_id)
        if not self.check_to_start_step(step_id):
            darelogger.info(" Cannot start this step %s sleeping..." % step_id)
            time.sleep(10)
            continue
        self.run_step(step_id)
        return
def check_to_start_step(self, step_id):
    """True iff *step_id* may start: all of its 'start_after_steps' are Done.

    A step not in the New state is reported as startable unchanged.
    """
    repo = self.workflow.step_units_repo
    darelogger.info(repo[step_id].UnitInfo['start_after_steps'])
    blockers = []
    if repo[step_id].get_status() == StepUnitStates.New:
        for dep_step in repo[step_id].UnitInfo['start_after_steps']:
            if repo[dep_step].get_status() != StepUnitStates.Done:
                blockers.append(False)
                darelogger.info(repo[dep_step].get_status())
    return not blockers
def start_step(self):
    """Thread target: wait (polling every 10 s) for dependencies, then run the step."""
    self.step_start_lock.acquire()
    try:
        # id assigned by the launcher thread before this thread started
        step_id = self.start_thread_step_id
    finally:
        self.step_start_lock.release()
    started = False
    while not started:
        darelogger.info(" Checking to start step %s " % step_id)
        if self.check_to_start_step(step_id):
            self.run_step(step_id)
            started = True
        else:
            darelogger.info(" Cannot start this step %s sleeping..." % step_id)
            time.sleep(10)
def prepare_compute_units(self):
    """Build one ComputeUnit description per input file of every step.

    For each step in the main config, load the step's CU config file
    (falling back to the bundled echo_hello.cu), then register a
    compute-unit description per input and attach it to its step unit.
    Exits the process if a step's config section is missing.
    """
    darelogger.info("Starting to prepare Compute Units ")
    for step in self.dare_conf_main['steps'].split(','):
        darelogger.info("Preparing compute Units: %s" % step)
        try:
            step_info_from_main_cfg = self.dare_conf_full.SectionDict(step.strip())
        except Exception:  # was bare except: don't swallow SystemExit/KeyboardInterrupt
            darelogger.info("step description section not found for step %s" % step)
            sys.exit()

        step_cfg_file = step_info_from_main_cfg.get('step_cfg_file', 'undefined_step_file').strip()
        # fall back to the bundled default CU definition
        if step_cfg_file.lower() in ('default', 'undefined_step_file'):
            step_cfg_file = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                         '..', 'daredb', 'echo_hello.cu')

        # TODO: check if file exists
        all_cu_confs = CfgParser(step_cfg_file)
        cu_conf = all_cu_confs.SectionDict(
            step_info_from_main_cfg.get('cu_type', 'default').strip())

        # hoisted: resource defaults to the first entry of 'used_pilots'
        resource = step_info_from_main_cfg.get(
            'resource', self.dare_conf_main['used_pilots'].split(',')[0]).strip()

        for input_file in step_info_from_main_cfg.get('input_names', '').split(','):
            input_file = input_file.strip()
            cu_uuid = "cu-%s" % (uuid.uuid1(),)
            cu_step_id = "step-%s-%s" % (
                step_info_from_main_cfg.get('step_name').strip(), self.dare_id)

            compute_unit_description = {
                "executable": cu_conf["executable"],
                "arguments": self.prepare_cu_arguments(input_file, cu_conf),
                "total_core_count": 1,
                "number_of_processes": 1,
                #"working_directory": left to the pilot's default
                "output": "dare-cu-stdout-" + cu_uuid + ".txt",
                "error": "dare-cu-stderr-" + cu_uuid + ".txt",
                "affinity_datacenter_label": "%s-adl" % resource,
                "affinity_machine_label": "%s-aml" % resource,
            }
            self.compute_units_repo[cu_uuid] = compute_unit_description
            # attach this compute unit to its owning step
            self.step_units_repo[cu_step_id].add_cu(cu_uuid)
    darelogger.info("Done preparing compute Units ")
def start(self):
    """Create pilot services, launch one worker thread per step, and wait for all steps."""
    darelogger.info("Creating Compute Engine service ")
    self.pilot_compute_service = PilotComputeService(coordination_url=COORDINATION_URL)
    self.pilot_data_service = PilotDataService(coordination_url=COORDINATION_URL)

    # submit every prepared compute pilot
    for _pilot_id, desc in list(self.workflow.compute_pilot_repo.items()):
        self.pilot_compute_service.create_pilot(pilot_compute_description=desc)
    # submit every prepared data pilot, keeping the handles
    for _pilot_id, desc in list(self.workflow.data_pilot_repo.items()):
        handle = self.pilot_data_service.create_pilot(pilot_data_description=desc)
        self.data_pilot_service_repo.append(handle)

    self.compute_data_service = ComputeDataServiceDecentral()
    self.compute_data_service.add_pilot_compute_service(self.pilot_compute_service)
    self.compute_data_service.add_pilot_data_service(self.pilot_data_service)

    ### run the steps, one worker thread each
    self.step_start_lock = threading.RLock()
    self.step_run_lock = threading.RLock()
    for step_id in list(self.workflow.step_units_repo.keys()):
        darelogger.info(" Sumitted step %s " % step_id)
        # hand the step id to the new thread under the lock
        with self.step_start_lock:
            self.start_thread_step_id = step_id
        worker = threading.Thread(target=self.start_step)
        self.step_threads[step_id] = worker
        worker.start()

    # poll until every step thread has finished
    while True:
        count_step = [t.is_alive() for t in list(self.step_threads.values())]
        darelogger.info('count_step %s' % count_step)
        if count_step and True not in count_step:
            break
        time.sleep(10)

    darelogger.info(" All Steps Done processing")
    self.quit(message='quit gracefully')
def start(self):
    """Create pilot services and run all workflow steps in worker threads.

    NOTE(review): compute-only variant — the data-pilot submission and
    the data-service attachment are deliberately commented out below.
    """
    # try:
    # lazy import of the project-level 'pilot' package
    from pilot import PilotComputeService, PilotDataService, ComputeDataService, State
    darelogger.info("Create Compute Engine service ")
    self.pilot_compute_service = PilotComputeService(coordination_url=COORDINATION_URL)
    self.pilot_data_service = PilotDataService()
    for compute_pilot, desc in self.workflow.compute_pilot_repo.items():
        self.compute_pilot_service_repo.append(self.pilot_compute_service.create_pilot(pilot_compute_description=desc))
    # data pilots disabled in this variant:
    #for data_pilot, desc in self.workflow.data_pilot_repo.items():
    #    self.data_pilot_service_repo.append(self.pilot_data_service.create_pilot(pilot_data_description=desc))
    self.compute_data_service = ComputeDataService()
    self.compute_data_service.add_pilot_compute_service(self.pilot_compute_service)
    # self.compute_data_service.add_pilot_data_service(self.pilot_data_service)
    self.step_thread = {}
    ### run the steps
    self.step_start_lock = threading.RLock()
    self.step_run_lock = threading.RLock()
    for step_id in self.workflow.step_units_repo.keys():
        darelogger.info(" Sumitted step %s " % step_id)
        # hand the step id to the worker thread under the lock
        self.step_start_lock.acquire()
        self.start_thread_step_id = step_id
        self.step_start_lock.release()
        self.step_thread[step_id] = threading.Thread(target=self.start_step)
        self.step_thread[step_id].start()
    # block until every step thread has finished
    while(1):
        count_step = [v.is_alive() for k, v in self.step_thread.items()]
        darelogger.info('count_step %s' % count_step)
        if not True in count_step and len(count_step) > 0:
            break
        time.sleep(10)
    darelogger.info(" All Steps Done processing")
    self.cancel()
def prepare_pilot_units(self):
    """Build compute- and data-pilot descriptions for every pilot in 'used_pilots'.

    Resolves each pilot's config file (default: the bundled pilot.cfg),
    reads its section, and registers one compute-pilot and one data-pilot
    description keyed by a fresh uuid.
    """
    darelogger.info("Starting to prepare pilot Units")
    pilot_config_file = self.dare_conf_main.get('pilot_config_file', 'default')
    for pilot in self.dare_conf_main['used_pilots'].split(','):
        pilot = pilot.strip()
        compute_pilot_uuid = "compute-pilot-%s-%s" % (pilot, str(uuid.uuid1()))
        # BUG FIX: was copy-pasted as "compute-pilot-..."; this key names a data pilot
        data_pilot_uuid = "data-pilot-%s-%s" % (pilot, str(uuid.uuid1()))

        pilot_info_from_main_cfg = self.dare_conf_full.SectionDict(pilot)
        darelogger.info("Preparing pilot unit for %s" % pilot)
        pilot_config_file = pilot_info_from_main_cfg.get('pilot_config_file', "undefined_pilot_file")
        # fall back to the bundled default pilot config
        if pilot_config_file.lower() in ('default', 'undefined_pilot_file'):
            pilot_config_file = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                             '..', 'daredb', 'pilot.cfg')

        pilot_config_from_db = CfgParser(pilot_config_file)
        info_pilot = pilot_config_from_db.SectionDict(pilot)

        # create pilot job service and initiate a pilot job
        pilot_compute_description = {
            "service_url": info_pilot['service_url'],
            "working_directory": info_pilot['working_directory'],
            'affinity_datacenter_label': '%s-adl' % pilot,
            'affinity_machine_label': '%s-aml' % pilot,
            "number_of_processes": int(pilot_info_from_main_cfg['number_of_processes']),
            "walltime": int(pilot_info_from_main_cfg['walltime']),
        }
        self.compute_pilot_repo[compute_pilot_uuid] = pilot_compute_description

        pilot_data_description = {
            "service_url": info_pilot['data_service_url'],
            "size": 100,  # NOTE(review): hard-coded size — units unclear, confirm
            "affinity_datacenter_label": pilot + '-dcl',
            "affinity_machine_label": pilot + '-aml',
        }
        self.data_pilot_repo[data_pilot_uuid] = pilot_data_description
    darelogger.info("Done preparing Pilot Units ")
def prepare_step_units(self):
    """Create a StepUnit (status New) for every step listed in the main config.

    Dependencies named in 'start_after_steps' are rewritten into fully
    qualified step ids ("step-<name>-<dare_id>"). Exits the process if a
    step's config section is missing.
    """
    darelogger.info("Starting to prepare Step Units ")
    #TODO:: check for same names
    for step in self.dare_conf_main['steps'].split(','):
        darelogger.info("Preparing Step Units: %s" % step)
        try:
            step_info_from_main_cfg = self.dare_conf_full.SectionDict(step.strip())
        except Exception:  # was bare except: don't swallow SystemExit/KeyboardInterrupt
            darelogger.info("step description section not found for step %s" % step)
            sys.exit()

        start_after_steps = []
        if step_info_from_main_cfg.get('start_after_steps'):
            # qualify each dependency name with this workflow's dare_id
            start_after_steps = ["step-%s-%s" % (k.strip(), self.dare_id)
                                 for k in step_info_from_main_cfg.get('start_after_steps').split(',')]

        step_unit_uuid = "step-%s-%s" % (
            step_info_from_main_cfg.get('step_name').strip(), self.dare_id)
        info_steps = {
            "step_id": step_unit_uuid,
            "dare_web_id": self.dare_web_id,
            "name": step_info_from_main_cfg.get('step_name').strip(),
            "status": StepUnitStates.New,
            "pilot": step_info_from_main_cfg.get('pilot'),
            "start_after_steps": start_after_steps,
            "compute_units": [],
            "transfer_input_data_units": [],
            "transfer_output_data_units": [],
        }

        su = StepUnit()
        su.define_param(info_steps)
        self.step_units_repo[step_unit_uuid] = su
    darelogger.info("Done preparing Step Units ")
def prepare_data_units(self):
    """Create one DataUnit description per step from its 'input_names' list.

    Exits the process if a step's config section is missing.
    NOTE(review): affinity labels are hard-coded placeholders — confirm.
    """
    darelogger.info("Starting to prepare Data Units ")
    for step in self.dare_conf_main['steps'].split(','):
        darelogger.info("Preparing Data Units: %s" % step)
        try:
            step_info_from_main_cfg = self.dare_conf_full.SectionDict(step.strip())
        except Exception:  # was bare except: don't swallow SystemExit/KeyboardInterrupt
            darelogger.info("step description section not found for step %s" % step)
            sys.exit()

        # TODO: make these absolute paths
        absolute_url_list = step_info_from_main_cfg.get('input_names', '').split(',')

        du_uuid = "du-%s" % (uuid.uuid1(),)
        du_step_id = "step-%s-%s" % (step.strip(), self.dare_id)

        data_unit_description = {
            "file_urls": absolute_url_list,
            "affinity_datacenter_label": "eu-de-south",
            "affinity_machine_label": "mymachine-1",
        }

        # register the data unit and attach it to its owning step
        self.data_units_repo[du_uuid] = data_unit_description
        self.step_units_repo[du_step_id].add_input_du(du_uuid)
def run_step(self, step_id):
    """Submit all data/compute units of *step_id*, poll until every job
    finishes, then mark the step Done and report status upstream.
    """
    #self.step_run_lock.acquire()

    # job started: update status
    this_su = self.workflow.step_units_repo[step_id].UnitInfo
    self.updater.update_status(
        this_su['dare_web_id'], "%s in step %s" % ('Running', this_su['name']))

    darelogger.info(" Started running %s " % step_id)

    jobs = []
    job_start_times = {}
    job_states = {}
    cu_ids = self.workflow.step_units_repo[step_id].UnitInfo['compute_units']
    NUMBER_JOBS = len(cu_ids)
    for cu_id in cu_ids:
        compute_unit_desc = self.workflow.compute_units_repo[cu_id]
        # NOTE: pop() mutates the repo entry — each description is single-use
        input_dus = compute_unit_desc.pop('input_data_units')
        output_dus = compute_unit_desc.pop('output_data_units')

        input_data_units = [
            self.compute_data_service.submit_data_unit(self.workflow.data_units_repo[du_id])
            for du_id in input_dus]
        output_data_units = [
            self.compute_data_service.submit_data_unit(self.workflow.data_units_repo[du_id])
            for du_id in output_dus]

        compute_unit_desc["input_data"] = [du.get_url() for du in input_data_units]
        compute_unit_desc["output_data"] = [{du.get_url(): ['std*']}
                                            for du in output_data_units]
        compute_unit = self.compute_data_service.submit_compute_unit(compute_unit_desc)
        darelogger.info("Compute Unit: Description: \n%s"
                        % (str(self.workflow.compute_units_repo[cu_id])))
        jobs.append(compute_unit)
        job_start_times[compute_unit] = time.time()
        job_states[compute_unit] = compute_unit.get_state()

    darelogger.debug(
        "************************ All Jobs submitted ************************")

    # poll every 5 s until all jobs report a finished state
    while 1:
        finish_counter = 0
        result_map = {}
        for job in jobs:
            old_state = job_states[job]
            state = job.get_state()
            # BUG FIX: original `if state in result_map == False:` chained to
            # `(state in result_map) and (result_map == False)` — always False.
            # The get() default below already seeds first-seen states.
            result_map[state] = result_map.get(state, 0) + 1
            if old_state != state:
                darelogger.debug("Job " + str(job) + " changed from: "
                                 + old_state + " to " + state)
            if old_state != state and self.has_finished(state) == True:
                darelogger.info("%s step Job: " % (this_su['name'])
                                + str(job) + " Runtime: "
                                + str(time.time() - job_start_times[job]) + " s.")
            if self.has_finished(state) == True:
                finish_counter = finish_counter + 1
            job_states[job] = state
        darelogger.debug("Current states: " + str(result_map))
        time.sleep(5)
        if finish_counter == NUMBER_JOBS:
            break

    self.workflow.step_units_repo[step_id].set_status(StepUnitStates.Done)
    #self.compute_data_service.wait()
    darelogger.debug(" Compute jobs for step %s complete" % step_id)

    # all jobs done: update status
    self.updater.update_status(this_su['dare_web_id'], "%s is Done" % this_su['name'])
def run_step(self, step_id):
    """Submit the step's data and compute units, wait for completion, and
    propagate the Running/Done status to the updater.
    """
    #self.step_run_lock.acquire()

    # job started: update status
    this_su = self.workflow.step_units_repo[step_id].UnitInfo
    self.updater.update_status(this_su['dare_web_id'],
                               "%s in step %s" % ('Running', this_su['name']))

    darelogger.info(" Started running %s " % step_id)

    jobs = []
    job_start_times = {}
    job_states = {}
    compute_unit_ids = self.workflow.step_units_repo[step_id].UnitInfo['compute_units']
    NUMBER_JOBS = len(compute_unit_ids)
    for cu_id in compute_unit_ids:
        compute_unit_desc = self.workflow.compute_units_repo[cu_id]
        # NOTE: pop() mutates the stored description — it cannot be resubmitted
        input_dus = compute_unit_desc.pop('input_data_units')
        output_dus = compute_unit_desc.pop('output_data_units')

        input_data_units = []
        for du_id in input_dus:
            input_data_units.append(
                self.compute_data_service.submit_data_unit(self.workflow.data_units_repo[du_id]))
        output_data_units = []
        for du_id in output_dus:
            output_data_units.append(
                self.compute_data_service.submit_data_unit(self.workflow.data_units_repo[du_id]))

        compute_unit_desc["input_data"] = [du.get_url() for du in input_data_units]
        compute_unit_desc["output_data"] = [{du.get_url(): ['std*']}
                                            for du in output_data_units]
        compute_unit = self.compute_data_service.submit_compute_unit(compute_unit_desc)
        darelogger.info("Compute Unit: Description: \n%s"
                        % (str(self.workflow.compute_units_repo[cu_id])))
        jobs.append(compute_unit)
        job_start_times[compute_unit] = time.time()
        job_states[compute_unit] = compute_unit.get_state()

    darelogger.debug(
        "************************ All Jobs submitted ************************")

    # poll job states until every job has finished
    while 1:
        finish_counter = 0
        result_map = {}
        for job in jobs:
            old_state = job_states[job]
            state = job.get_state()
            # BUG FIX: `if state in result_map == False:` is a chained
            # comparison (`in` and `==` chain), so the seed branch never ran;
            # result_map.get(state, 0) makes explicit seeding unnecessary.
            result_map[state] = result_map.get(state, 0) + 1
            if old_state != state:
                darelogger.debug("Job " + str(job) + " changed from: "
                                 + old_state + " to " + state)
            if old_state != state and self.has_finished(state) == True:
                darelogger.info(
                    "%s step Job: " % (self.workflow.step_units_repo[step_id].UnitInfo['name'])
                    + str(job) + " Runtime: "
                    + str(time.time() - job_start_times[job]) + " s.")
            if self.has_finished(state) == True:
                finish_counter = finish_counter + 1
            job_states[job] = state
        darelogger.debug("Current states: " + str(result_map))
        time.sleep(5)
        if finish_counter == NUMBER_JOBS:
            break

    self.workflow.step_units_repo[step_id].set_status(StepUnitStates.Done)
    #self.compute_data_service.wait()
    darelogger.debug(" Compute jobs for step %s complete" % step_id)

    # all jobs done: update status
    self.updater.update_status(this_su['dare_web_id'], "%s is Done" % this_su['name'])