import os
import sys
import time
import uuid
import optparse
import traceback

import saga
import many_job_affinity
import api.base

# read_job_conf, read_conf, set_logger and calculate_nodes are assumed to be
# provided by the surrounding DARE package; they are not defined in this file.


def launch_resources_agents(resources_used, resources_used_info,
                            resources_used_app_info, resources_used_job_count):
    """Start the BigJob agents on every requested resource via the manyjob abstraction."""
    resources_service = None
    try:
        resource_list = []
        for i, resource in enumerate(resources_used):
            # size the request from the per-node core count, the number of
            # jobs targeted at this resource and the cores per application run
            num_nodes = calculate_nodes(resources_used_info[resource]["cores_per_node"],
                                        resources_used_job_count[i],
                                        resources_used_app_info[resource]["cores"])
            # note: only one node is requested for now; num_nodes is not yet
            # used -- to be changed by the bfast affinity implementation
            resource_list.append({
                "resource_url": resources_used_info[resource]["resource_url"],
                "walltime": resources_used_info[resource]["walltime"],
                "number_nodes": str(1),
                "cores_per_node": resources_used_info[resource]["cores_per_node"],
                "allocation": resources_used_info[resource]["allocation"],
                "queue": resources_used_info[resource]["queue"],
                "bigjob_agent": resources_used_info[resource]["bigjob_agent"],
                "userproxy": resources_used_info[resource]["proxy"],
                "working_directory": resources_used_info[resource]["work_dir"],
                "affinity": resources_used_info[resource]["affinity"],
            })
            logger.info("resource_url " + resources_used_info[resource]["resource_url"])
            logger.info("affinity %s" % resources_used_info[resource]["affinity"])
        # decide the type of bigjob to use here
        print "Creating manyjob service"
        resources_service = many_job_affinity.many_job_service(resource_list,
                                                               DARE_ADVERT_HOST)
    except:
        traceback.print_exc(file=sys.stdout)
        terminate_resources_agents(resources_service)
    return resources_service


def terminate_resources_agents(resources_service):
    """Cancel the manyjob service, ignoring errors during shutdown."""
    try:
        if resources_service is not None:
            resources_service.cancel()
    except:
        pass


class subjob_handler(api.base.subjob_handler):

    def __init__(self):
        # bookkeeping for the submitted sub-jobs
        self.jobs = []
        self.job_start_times = {}
        self.job_states = {}

    def has_finished(self, state):
        # "done", "failed" and "canceled" are the terminal states
        return state.lower() in ("done", "failed", "canceled")

    def create_subjob(self, job_description, jd_arguments, affinity):
        # build a SAGA job description from the application-level description
        jd = saga.job.description()
        jd.executable = str(job_description["jd_executable"])
        jd.number_of_processes = str(job_description["jd_number_of_processes"])
        jd.spmd_variation = job_description["jd_spmd_variation"]
        # choose the job arguments based on the type of job
        jd.arguments = jd_arguments
        jd.environment = ["affinity=%s" % affinity]
        jd.working_directory = job_description["jd_work_dir"]
        jd.output = job_description["jd_output"]
        jd.error = job_description["jd_error"]
        logger.info("jd.executable " + jd.executable)
        logger.info("jd.number_of_processes " + str(jd.number_of_processes))
        for item in jd.arguments:
            logger.info("jd.arguments " + item)
        logger.info("affinity %s" % affinity)
        return jd

    def submit_subjob(self, jd, handler_resource):
        # submit the sub-job through the manyjob service
        try:
            subjob = handler_resource.create_job(jd)
            subjob.run()
            print "Submitted sub-job."
            return subjob
        except:
            traceback.print_exc(file=sys.stdout)
            try:
                handler_resource.cancel()
            except:
                pass
            return None

    def add_subjob_to_list(self, subjob):
        self.jobs.append(subjob)
        self.job_start_times[subjob] = time.time()
        self.job_states[subjob] = subjob.get_state()

    def monitor_subjobs(self, number_of_jobs):
        # poll the sub-jobs every five seconds until all reach a terminal state
        logger.info("******** All jobs submitted ******** " + str(number_of_jobs))
        while 1:
            finish_counter = 0
            result_map = {}
            for i in range(0, number_of_jobs):
                old_state = self.job_states[self.jobs[i]]
                state = self.jobs[i].get_state()
                if state not in result_map:
                    result_map[state] = 0
                result_map[state] = result_map[state] + 1
                if old_state != state:
                    logger.info("Job " + str(self.jobs[i]) + " changed from: " +
                                old_state + " to " + state)
                    if self.has_finished(state):
                        logger.info("Job: " + str(self.jobs[i]) + " Runtime: " +
                                    str(time.time() - self.job_start_times[self.jobs[i]]) + " s.")
                if self.has_finished(state):
                    finish_counter = finish_counter + 1
                self.job_states[self.jobs[i]] = state
            logger.info("Current states: " + str(result_map))
            if finish_counter == number_of_jobs:
                break
            time.sleep(5)


class file_handler(api.base.file_handler):

    def file_stager(self, source_url, dest_url):
        # stage files: scp for FutureGrid Eucalyptus cloud endpoints,
        # globus-url-copy for everything else
        logger.info("Transferring files from %s to %s" % (source_url, dest_url))
        if dest_url.startswith("fgeuca"):
            cmd = "scp -r -i /path/to/smaddi2.private %s %s" % (source_url, dest_url)
        else:
            cmd = "globus-url-copy -cd %s %s" % (source_url, dest_url)
        if os.system(cmd) != 0:
            logger.error("File staging failed: from " + source_url + " to " + dest_url)
        return None


if __name__ == "__main__":
    # define app name
    DARE_APP_NAME = "BFAST"
    DARE_UUID = uuid.uuid1()
    DARE_ADVERT_HOST = "advert.cct.lsu.edu"

    # parse options
    parser = optparse.OptionParser()
    parser.add_option("-j", "--job-conf", dest="job_conf",
                      help="job configuration file")
    (options, args) = parser.parse_args()

    # read the job conf file
    job_info = read_job_conf(options.job_conf)
    job_id = job_info['job_id']
    resources_used = job_info['resources_use'].replace(' ', '').split(',')
    resources_used_job_count = job_info['resources_job_count'].replace(' ', '').split(',')
    resources_used_walltime = job_info['resources_walltime'].replace(' ', '').split(',')

    # read the resource info conf file
    resources_used_info = read_conf(job_info['resources_info_filename'], resources_used)
    # read the resource app info conf file
    resources_used_app_info = read_conf(job_info['resources_app_info_filename'], resources_used)

    # get the current working directory
    cwd = os.getcwd()

    # define the log filename (assumed value; the original definition is
    # missing from this fragment)
    LOG_FILENAME = "%s-%s.log" % (DARE_APP_NAME, DARE_UUID)
    logger = set_logger(DARE_APP_NAME, LOG_FILENAME)
    logger.info("Reading conf files is done")
    logger.info("Job id is " + str(job_id))
    logger.info("Machines used are:")
    for resource in resources_used:
        logger.info(resource)

    # launch the manyjob affinity agents on the requested resources
    resources_service = launch_resources_agents(resources_used, resources_used_info,
                                                resources_used_app_info,
                                                resources_used_job_count)

    # create a test sub-job description
    handler = subjob_handler()
    test_job_description = {
        "jd_executable": "/bin/date",
        "jd_number_of_processes": 1,
        "jd_spmd_variation": "single",
        "jd_work_dir": "/work/smaddi2/",
        "jd_output": "/work/smaddi2/stdout-1-now.out",
        "jd_error": "/work/smaddi2/stderr-1-now.out",
    }
    jd = handler.create_subjob(test_job_description, [""], "LONI")

    # submit the sub-job and wait for it to reach a terminal state
    subjob = handler.submit_subjob(jd, resources_service)
    handler.add_subjob_to_list(subjob)
    handler.monitor_subjobs(1)

    terminate_resources_agents(resources_service)
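# ---------------------------------------------------------------------------
# For reference, a minimal sketch of the job configuration file that
# read_job_conf() is expected to parse. read_job_conf() is not part of this
# fragment, so the flat key = value layout and every value below are
# assumptions; only the key names are taken from the __main__ block above.
#
#   job_id                      = bfast-test-1
#   resources_use               = poseidon, oliver
#   resources_job_count         = 4, 4
#   resources_walltime          = 60, 60
#   resources_info_filename     = resources_info.conf
#   resources_app_info_filename = resources_app_info.conf
#
# For each resource named in resources_use, the file behind
# resources_info_filename must supply the keys that launch_resources_agents()
# reads: resource_url, walltime, cores_per_node, allocation, queue,
# bigjob_agent, proxy, work_dir and affinity.
# ---------------------------------------------------------------------------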
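# A typical invocation, assuming the script is saved as dare_bfast.py (the
# file name is an assumption; the -j/--job-conf option is the one defined by
# the option parser above):
#
#   python dare_bfast.py -j bfast_job.conf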