def write_checkpoint(checkpoint_tag, checkpoint_tag_next_list, checkpoint_route_tag_next_list, output_files):
    # checkpoint_tag
    #   common, route_A (mutation route) or route_B (sv route)
    # checkpoint_tag_next_list, checkpoint_route_tag_next_list
    #   next task(s) and route(s)
    elapsed_time = time.time()
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    verbose = int(get_vge_conf("vge", "verbose", 0))
    if verbose == 0:
        logger.setLevel(CRITICAL)
    elif verbose == 1 or verbose == 2:
        logger.setLevel(INFO)
    elif verbose == 3:
        logger.setLevel(DEBUG)
    basicConfig(format='%(asctime)s:%(levelname)s:%(message)s')
    pname = "Pipeline(CHECKPOINT WRITE)"
    process_id = os.getpid()
    logger.info("%s pid[%i]: collecting files in target output directory...." % (pname, process_id))

    # exclude tasks that do not write checkpoints
    exclude_checkpoint_list = list(get_vge_conf("restart", "exclude_checkpoint_list", []))
    for exclude_checkpoint_tag in exclude_checkpoint_list:
        if checkpoint_tag == exclude_checkpoint_tag:
            logger.warning("%s pid[%i]: exclude task for %s" % (pname, process_id, exclude_checkpoint_tag))
            elapsed_time = time.time() - elapsed_time
            logger.info("%s pid[%i]: write_checkpoint elapsed_time %f[sec]" % (pname, process_id, elapsed_time))
            return 0

    target_dir = chk_target_dir
    checkpoint_dir = chk_checkpoint_dir
    # create the checkpoint data directory
    if not os.path.isdir(checkpoint_dir):
        os.mkdir(checkpoint_dir)

    target_list = []
    for route_tag_index in range(len(checkpoint_route_tag_next_list)):
        elapsed_time_mkchkpoint1 = time.time()
        # set target directory
        target_absdir = os.path.abspath(target_dir)
        target_absdir = os.path.normpath(target_absdir)
        # make a target_dir_dict
        targetdir_dict = {}
        targetdir_dict["filename_list"] = []
        targetdir_dict["size_list"] = []
        flag = 0
        for root, dirs, files in os.walk(target_absdir):
            #dirs[:] = [d for d in dirs if os.path.basename(checkpoint_dir) not in os.path.join(root, d)]
            dirs[:] = [d for d in dirs if os.path.abspath(checkpoint_dir) not in os.path.join(root, d)]
            if flag == 0:
                for d in dirs:
                    target_list.append(os.path.join(root, d))
                for filename in files:
                    target_list.append(os.path.join(root, filename))
                flag = 1
            target_dirname = root[len(target_absdir) + 1:]
            if target_dirname == "":
                target_dirname = "."
            # for filename in files:
            #     targetdir_dict["filename_list"].append(target_dirname + "/" + filename)
            #     targetdir_dict["size_list"].append(os.stat(root + "/" + filename).st_size)
            for filename in files:
                if os.path.isfile(root + "/" + filename):
                    targetdir_dict["filename_list"].append(target_dirname + "/" + filename)
                    targetdir_dict["size_list"].append(os.stat(root + "/" + filename).st_size)

        checkpointdata = {}
        checkpointdata["outputfiles"] = output_files
        checkpointdata["targetdir"] = target_dir
        checkpointdata["targetdirdict"] = targetdir_dict
        checkpoint_absdir = os.path.abspath(checkpoint_dir)
        checkpoint_absdir = os.path.normpath(checkpoint_absdir)
        # make the restartdata filename
        temp_checkpoint_tag = checkpoint_tag.replace(" ", "")
        checkpoint_filename = "checkpoint_" + temp_checkpoint_tag + ".chk"
        writefilenamepath = os.path.join(checkpoint_absdir, checkpoint_filename)
        writefilenamepath = os.path.normpath(writefilenamepath)
        err_flag = False

        # serialize the checkpoint data
        if not err_flag:
            try:
                package = pickle.dumps(checkpointdata)
            except Exception as e:
                logger.info("%s pid[%i]: error occurred when serializing... [%s]." % (pname, process_id, str(e)))
                err_flag = True
        elapsed_time_mkchkpoint2 = time.time()

        # write data
        if not err_flag:
            try:
                if os.path.isfile(writefilenamepath):
                    logger.warning("%s pid[%i]: (WRITE WARNING) checkpoint file already exists in [%s] so it will be removed and rewritten." % (pname, process_id, writefilenamepath))
                    os.remove(writefilenamepath)
                write_checkpoint = open(writefilenamepath, "wb")
                pickle.dump(package, write_checkpoint)
                write_checkpoint.close()
            except Exception as e:
                logger.info("%s pid[%i]: checkpoint file write error occurred [%s]." % (pname, process_id, str(e)))
                err_flag = True
                return 1
        elapsed_time_mkchkpoint3 = time.time()

        # target_dir backup
        restartdata_dir = os.path.join(checkpoint_dir, "backupdata")
        restartdata_dir = os.path.abspath(restartdata_dir)
        restartdata_dir = os.path.normpath(restartdata_dir) + "_" + checkpoint_route_tag_next_list[route_tag_index]
        restartdata_dir = os.path.normpath(restartdata_dir)
        if not os.path.isdir(restartdata_dir):
            os.mkdir(restartdata_dir)
        elapsed_time_mkchkpoint4 = time.time()

        # back up to restartdata_dir by hardlink, except for the checkpoint directory
        backuplist = target_list
        for backupdir in backuplist:
            if backupdir == os.path.basename(checkpoint_dir):
                continue
            else:
                #src_target_dir = target_dir + "/" + backupdir
                src_target_dir = backupdir
                #if not os.path.isdir(src_target_dir):
                #    continue
                args = ["cp", "-lr", src_target_dir, restartdata_dir]
                ret = subprocess.call(args)
                logger.info("%s pid[%i]: (WRITE WARNING) target_dir backup %s to %s" % (pname, process_id, src_target_dir, restartdata_dir))
                logger.warning("%s pid[%i]: (WRITE WARNING) target_dir backup subprocess.call returncode %d" % (pname, process_id, ret))

        if not err_flag:
            tag_next_len = len(checkpoint_tag_next_list)
            route_tag_next_len = len(checkpoint_route_tag_next_list)
            if tag_next_len != route_tag_next_len:
                logger.warning("%s pid[%i]: (WRITE WARNING) difference in the length of checkpoint_tag_next_list(%d) and checkpoint_route_tag_next_list(%d)" % (pname, process_id, tag_next_len, route_tag_next_len))
            elif tag_next_len > 2 or route_tag_next_len > 2:
                logger.warning("%s pid[%i]: (WRITE WARNING) checkpoint_tag_next_list(%d) and checkpoint_route_tag_next_list(%d) greater than 2" % (pname, process_id, tag_next_len, route_tag_next_len))
            route_tag_next_str = str(checkpoint_route_tag_next_list[route_tag_index])
            tag_next_str = str(checkpoint_tag_next_list[route_tag_index])
            # make the restart master data filename
            checkpointmaster_filename = "checkpoint_" + route_tag_next_str + ".mst"
            restartinfo_filename = os.path.join(checkpoint_absdir, checkpointmaster_filename)
            restartinfo_filename = os.path.normpath(restartinfo_filename)
            # append or write restart info
            restart_info_list = [tag_next_str, writefilenamepath]
            restart_info_string = ",".join(restart_info_list)
            restart_info_string += "\n"
            try:
                if os.path.isfile(restartinfo_filename):
                    logger.debug("%s pid[%i]: restart master file was already found in [%s]." % (pname, process_id, restartinfo_filename))
                    write_masterfile = open(restartinfo_filename, "a")
                else:
                    logger.info("%s pid[%i]: restart master file was newly created in [%s]." % (pname, process_id, restartinfo_filename))
                    write_masterfile = open(restartinfo_filename, "w")
                write_masterfile.write(restart_info_string)
                write_masterfile.close()
            except (IOError, OSError) as e:
                logger.error("%s pid[%i]: ERROR! ... write error occurred [%s]." % (pname, process_id, str(e)))
                err_flag = True
                return 1

        elapsed_time_mkchkpoint5 = time.time()
        logger.info("%s pid[%i]: write_checkpoint tag %s elapsed_time_mkchkpoint %f[sec]" % (pname, process_id, route_tag_index, elapsed_time_mkchkpoint2 - elapsed_time_mkchkpoint1))
        logger.info("%s pid[%i]: write_checkpoint tag %s elapsed_time_write_chkpoint %f[sec]" % (pname, process_id, route_tag_index, elapsed_time_mkchkpoint3 - elapsed_time_mkchkpoint2))
        logger.info("%s pid[%i]: write_checkpoint tag %s elapsed_time_mk_targetdir %f[sec]" % (pname, process_id, route_tag_index, elapsed_time_mkchkpoint4 - elapsed_time_mkchkpoint3))
        logger.info("%s pid[%i]: write_checkpoint tag %s elapsed_time_hardlink %f[sec]" % (pname, process_id, route_tag_index, elapsed_time_mkchkpoint5 - elapsed_time_mkchkpoint4))

    elapsed_time = time.time() - elapsed_time
    logger.info("%s pid[%i]: write_checkpoint elapsed_time %f[sec]" % (pname, process_id, elapsed_time))
    return 0
def checkpoint_restart(target_dir): elapsed_time = time.time() # init_restart_conf() logger = getLogger(__name__) logger.setLevel(INFO) verbose = int(get_vge_conf("vge", "verbose", 0)) if verbose == 0: logger.setLevel(CRITICAL) elif verbose == 1 or verbose == 2: logger.setLevel(INFO) elif verbose == 3: logger.setLevel(DEBUG) basicConfig(format='%(asctime)s:%(levelname)s:%(message)s') pname = "Pipeline(CHECKPOINT RESTART)" process_id = os.getpid() logger.info("%s pid[%i]: checkpoint judgment and reading start task " % (pname, process_id)) chk_file = int(get_vge_conf("restart", "chk_file", 2)) # common parameters global chk_work_dir global chk_target_dir global chk_checkpoint_dir work_dir_path = os.path.dirname(target_dir) work_dir_path = os.path.abspath(work_dir_path) work_dir_path = os.path.normpath(work_dir_path) checkpoint_dir_path = os.path.join(target_dir, "checkpoint") checkpoint_dir_path = os.path.abspath(checkpoint_dir_path) checkpoint_dir_path = os.path.normpath(checkpoint_dir_path) target_dir_path = os.path.abspath(target_dir) target_dir_path = os.path.normpath(target_dir_path) chk_work_dir = work_dir_path chk_target_dir = target_dir_path chk_checkpoint_dir = checkpoint_dir_path restartdata_dir = os.path.join(chk_checkpoint_dir, "backupdata") restartdata_dir = os.path.abspath(restartdata_dir) restartdata_dir = os.path.normpath(restartdata_dir) # reconst_checkpoint flag_skiptask = {} flag_starttask = {} if not os.path.isdir(chk_checkpoint_dir): logger.info("%s pid[%i]: normal start" % (pname, process_id)) elapsed_time = time.time() - elapsed_time logger.info("%s pid[%i]: checkpoint_restart elapsed_time %f[sec]" % (pname, process_id, elapsed_time)) return flag_skiptask, flag_starttask, {} # checkpoint mstart file masterfile_list = [] # create a list of checkpoint master for filename in os.listdir(chk_checkpoint_dir): basename, ext = os.path.splitext(filename) if ".mst" == ext: masterfile_list.append(filename) # not restart if no checkpoint master file if not masterfile_list: logger.info("%s pid[%i]: not restart, checkpoint start" % (pname, process_id)) elapsed_time = time.time() - elapsed_time logger.info("%s pid[%i]: checkpoint_restart elapsed_time %f[sec]" % (pname, process_id, elapsed_time)) return flag_skiptask, flag_starttask, {} set_task_list = [] set_checkpointfiles = {} dict_route = {} for mstfile in masterfile_list: chk_masterfile = os.path.join(chk_checkpoint_dir, mstfile) logger.debug("%s pid[%i]: checkpoint master file path: %s" % (pname, process_id, chk_masterfile)) if os.path.isfile(chk_masterfile): try: checkpoint_master = open(chk_masterfile, "rb") logger.info( "%s pid[%i]: restart start, open checkpoint master" % (pname, process_id)) tmplist = checkpoint_master.readlines() checkpoint_master.close() except IOError as e: logger.info("%s pid[%i]: error was masterfile open [%s]. 
" % (pname, process_id, str(e))) exit() else: logger.info("%s pid[%i]: not restart, checkpoint start" % (pname, process_id)) exit() # check the contents of checkpoint master file # the first column of master file is a task that created the checkpoint # the second column is the path of the checkpoint # do nothing if the first column is "final" for line in tmplist: line_list = line.split(",") task = line_list[0] set_task_list.append(task) if len(set_task_list) == 1: set_checkpointfiles[task] = [line_list[1].strip()] # compare the previous task name with the current task name elif set_task_list[-2] == set_task_list[-1]: set_checkpointfiles[task].append(line_list[1].strip()) else: set_checkpointfiles[task] = [line_list[1].strip()] if set_task_list[-1] != "final": logger.debug("%s pid[%i]: set_task_list: %s" % (pname, process_id, set_task_list)) flag_starttask[set_task_list[-1]] = True # create a dictionary that links task names with root tags # dict_route[task] contins the root tag of master file # basename[0:10] contains "checkpoint_" basename, ext = os.path.splitext(mstfile) dict_route[task] = basename[11:] # set flag from master file for task in set_task_list: # skip if the checkpoint is already written # checkpointtag contains the task name of checkpoint file # basename[0:10] contains "checkpoint_" for checkpointpath in set_checkpointfiles[task]: basename, ext = os.path.splitext(os.path.basename(checkpointpath)) checkpointtag = basename[11:] flag_skiptask[checkpointtag] = True # read checkpointdata # verify_checkpoint restart_files_list_dict = {} for task, startflag in flag_starttask.items(): new_outputfiles_list = [] for checkpointpath in set_checkpointfiles[task]: # modify checkpoint data path chkpoint_filename = os.path.basename(checkpointpath) checkpoint_absdir = os.path.abspath(chk_checkpoint_dir) checkpoint_absdir = os.path.normpath(checkpoint_absdir) new_checkpoint_filepath = os.path.join(checkpoint_absdir, chkpoint_filename) new_checkpoint_filepath = os.path.normpath(new_checkpoint_filepath) readfilename = new_checkpoint_filepath checkpointdata = None binalyfile = None # open checkpoint data try: binalyfile = open(readfilename, "rb") loaded_package = pickle.load(binalyfile) binalyfile.close() checkpointdata = pickle.loads(loaded_package) except IOError: exit() # check file integrity if chk_file == 0: pass elif chk_file == 1: restart_input_files_set = set() for root, dirs, files in os.walk(restartdata_dir + "_" + dict_route[task]): target_dirname = root[len(restartdata_dir + "_" + dict_route[task]) + 1:] if target_dirname == "": target_dirname = "." for filename in files: restart_input_files_set.add(target_dirname + "/" + filename) if not (set(checkpointdata["targetdirdict"]["filename_list"]) <= restart_input_files_set): logger.warning( "%s pid[%i]: (VERIFY WARNING) can not restart because file size mismatch...." 
% (pname, process_id)) logger.debug( "%s pid[%i]: (VERIFY WARNING) restart_input_files_set: %s" % (pname, process_id, restart_input_files_set)) logger.debug( "%s pid[%i]: (VERIFY WARNING) set(checkpointdata[\"targetdirdict\"][\"filename_list\"]): %s" % (pname, process_id, set(checkpointdata["targetdirdict"]["filename_list"])) ) exit() elif chk_file == 2: for i in range( len(checkpointdata["targetdirdict"]["filename_list"])): restart_data_file = os.path.join( restartdata_dir + "_" + dict_route[task], checkpointdata["targetdirdict"]["filename_list"][i]) if os.path.islink( restart_data_file ) and not os.path.isfile(restart_data_file): # modify a link in a symbolic link file current_dir = os.getcwd() current_dir_len = len(current_dir.split("/")) realpath = os.path.realpath(restart_data_file) # replace the name of the current directory # because Kei computer uses stagin system realpath_split = realpath.split("/") dircount = 0 new_path = "" for dirname in realpath_split: dircount = dircount + 1 if dircount > current_dir_len: new_path = new_path + "/" + dirname new_src_file = os.path.normpath(current_dir + new_path) os.unlink(restart_data_file) os.symlink(new_src_file, restart_data_file) logger.debug( "%s pid[%i]: re symbolic link %s to %s" % (pname, process_id, restartdata_dir, new_src_file)) if not os.path.isfile(restart_data_file): logger.warning( "%s pid[%i]: (VERIFY WARNING) can not restart because 'restart_data_file' does not exist...." % (pname, process_id)) logger.debug( "%s pid[%i]: (VERIFY WARNING) restart_data_file: %s" % (pname, process_id, restart_data_file)) exit() if os.stat(restart_data_file).st_size != checkpointdata[ "targetdirdict"]["size_list"][i]: logger.warning( "%s pid[%i]: (VERIFY WARNING) can not restart because file size does not match" % (pname, process_id)) logger.debug( "%s pid[%i]: (VERIFY WARNING) restart_data_file: %s" % (pname, process_id, restart_data_file)) exit() # # create outputfiles to pass to the input of the next task # (modify checkpoint data path) # ret = 0 chkpointd_outputfiles_list = checkpointdata["outputfiles"] chkpointd_target_dir = checkpointdata["targetdir"] # get each element of the output from the last run. 
# element is list (multiple files) or string (single file) for x in chkpointd_outputfiles_list: if type(x) is list: new_outputfiles_list2 = [] # combine x elements to create a file path and add it to the list for output_file_path in x: new_file_name = os.path.join( chk_target_dir, output_file_path[len(chkpointd_target_dir) + 1:]) new_file_name = os.path.abspath(new_file_name) new_file_name = os.path.normpath(new_file_name) new_outputfiles_list2.append(new_file_name) new_outputfiles_list.append(new_outputfiles_list2) else: # x is a file path and add it to the list new_file_name = os.path.join( chk_target_dir, x[len(chkpointd_target_dir) + 1:]) new_file_name = os.path.abspath(new_file_name) new_file_name = os.path.normpath(new_file_name) new_outputfiles_list.append(new_file_name) restart_files_list_dict[dict_route[task]] = new_outputfiles_list logger.debug("%s pid[%i]: new_outputfiles_list: %s" % (pname, process_id, new_outputfiles_list)) logger.debug("%s pid[%i]: task: %s" % (pname, process_id, task)) logger.debug("%s pid[%i]: dict_route: %s" % (pname, process_id, dict_route[task])) # relocate files for restart backup_dirs = os.listdir(restartdata_dir + "_" + dict_route[task]) for backup_dir in backup_dirs: src_backup_dir = os.path.join( restartdata_dir + "_" + dict_route[task], backup_dir) args = ["cp", "-lr", src_backup_dir, target_dir] ret = subprocess.call(args) logger.warning( "%s pid[%i]: (WRITE WARNING) restored backup file, subprocess.call retcode %d" % (pname, process_id, ret)) logger.debug("%s pid[%i]: src_backup_dir: %s" % (pname, process_id, src_backup_dir)) elapsed_time = time.time() - elapsed_time logger.info("%s pid[%i]: checkpoint_restart elapsed_time %f[sec]" % (pname, process_id, elapsed_time)) return flag_skiptask, flag_starttask, restart_files_list_dict
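# ---------------------------------------------------------------------------
# Illustrative usage sketch (not in the original source): how a pipeline
# driver might consume the three values returned by checkpoint_restart().
# Task and route names are hypothetical; only the structure of the return
# values is taken from the function above (flag_skiptask[tag] is True for
# tasks whose checkpoint already exists, flag_starttask[tag] is True for the
# task to resume from, and restart_files_list_dict[route] holds the restored
# output file list to feed into the next task).
def _example_checkpoint_restart_usage(target_dir):
    flag_skiptask, flag_starttask, restart_files = checkpoint_restart(target_dir)
    for task_tag in ["common", "mutation_call", "sv_call"]:   # hypothetical task order
        if flag_skiptask.get(task_tag):
            continue                                          # checkpoint exists: skip this task
        if flag_starttask.get(task_tag):
            previous_outputs = restart_files.get("route_A", [])   # hypothetical route tag
            # ... resume the pipeline here, passing previous_outputs as inputs ...
        # ... otherwise run the task from scratch ...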
def vge_task(arg1,arg2,arg3,arg4,arg5=None): # arg1: command to be performed # arg2: max_task # arg3: basefilename # arg4: parameter dict # involved keys: arguments # verbose --> verbose # host --> host # port --> port # bufsize --> bufsize # nsleep_request --> socket_interval_after # nsleep_interval=0.5 --> socket_interval_request # nsleep_connection_refused --> socket_interval_error # nsleep_aftersend --> socket_interval_send # nsleep_updatelist --> socket_interval_update # nsleep_controller=0.01 --> socket_interval_close # timeout_client=600.0 --> socket_timeout1 # timeout_ask=600.0 --> socket_timeout2 # # arg5: for future use # "multi" --> multi-sample mode # Pipeline sample or process will be considered in vge_task. # this needs more CPU resouces than single-sample mode. # "single" or None --> single-sample mode # # total_time = time.time() # # # return value # # it will be the lastest return code of jobs for new job # and be a response message from VGE for the others. result = None # # socket # host = socket.gethostbyname(socket.gethostname()) port = 8000 bufsize = 16384 # # get current process id # process_id = os.getpid() # # # assign a header name # temp_processname = get_process_name(process_id) temp_processnamelist= temp_processname.split(" ") # pname="VGETASK(Client)" # for temp0 in temp_processnamelist: temp1=os.path.basename(temp0) if "pipeline" in temp1: pname="VGETASK(Pipeline)" break elif temp1 == "vge_connect" : pname="VGETASK(CONNECT)" break elif temp1 == "vge" : pname="VGETASK(VGE)" break # del temp_processname del temp_processnamelist # #pname="VGETASK(Client)" #if "pipeline" in temp_processname: # pname="VGETASK(Pipeline)" #elif "vge_connect" in temp_processnamelist: # pname="VGETASK(CONNECT)" #elif "vge" in temp_processnamelist and not "vge_connect" in temp_processnamelist: # pname="VGETASK(VGE)" #print temp_processname, temp_processnamelist, pname # # # wait time # nsleep_connection_refused=0.0 nsleep_aftersend =0.00 # pause after send nsleep_updatelist=0.00 # nsleep_request=20.0 # pause after asked job status to VGE nsleep_interval=20.0 # pause after job has been submitted. 
nsleep_controller=1.00 # wait time after connection timed out timeout_client=600.0 timeout_ask=600.0 # # Config # time_config = time.time() current_dir=os.getcwd() port = int(get_vge_conf("socket","port",port)) bufsize = int(get_vge_conf("socket","bufsize",bufsize)) nsleep_interval = get_vge_conf("pipeline","socket_interval_after",nsleep_interval) nsleep_request = get_vge_conf("pipeline","socket_interval_request", nsleep_request) nsleep_connection_refused = get_vge_conf("pipeline","socket_interval_error", nsleep_connection_refused) nsleep_aftersend = get_vge_conf("pipeline","socket_interval_send", nsleep_aftersend) nsleep_updatelist = get_vge_conf("pipeline","socket_interval_update",nsleep_updatelist) nsleep_controller = get_vge_conf("pipeline","socket_interval_close",nsleep_controller) timeout_client = get_vge_conf("pipeline","socket_timeout1", timeout_client) timeout_ask = get_vge_conf("pipeline","socket_timeout2", timeout_ask) time_config = time.time() - time_config # if isinstance(arg4, argparse.Namespace): if arg4.socket_interval_after > 0: nsleep_request = arg4.socket_interval_after if arg4.socket_interval_request > 0: nsleep_interval = arg4.socket_interval_request if arg4.socket_interval_error > 0: nsleep_connection_refused = arg4.socket_interval_error if arg4.socket_interval_send > 0: nsleep_aftersend = arg4.socket_interval_send if arg4.socket_interval_update > 0: nsleep_updatelist = arg4.socket_interval_update if arg4.socket_interval_close > 0: nsleep_controller = arg4.socket_interval_close if arg4.socket_timeout1 > 0: timeout_client = arg4.socket_timeout1 if arg4.socket_timeout2 > 0: timeout_ask = arg4.socket_timeout2 # # verbose level # verbose=0 verbose = int(get_vge_conf("pipeline","verbose",verbose)) from logging import getLogger,StreamHandler,basicConfig,DEBUG,INFO,WARNING,ERROR,CRITICAL logger=getLogger(__name__) if verbose == 0: logger.setLevel(CRITICAL) elif verbose == 1: logger.setLevel(INFO) elif verbose == 3: logger.setLevel(DEBUG) basicConfig(format='%(asctime)s:%(levelname)s:%(message)s') logger.debug("%s pid[%i]: socket host=%s" %(pname,process_id,host)) logger.debug("%s pid[%i]: socket port=%i" %(pname,process_id,port)) logger.debug("%s pid[%i]: verbose level=%i" %(pname,process_id,verbose)) logger.debug("%s pid[%i]: socket buffer size=%i" %(pname,process_id,bufsize)) logger.debug("%s pid[%i]: socket timeout1 (job sending)=%12.5e" %(pname,process_id,timeout_client)) logger.debug("%s pid[%i]: socket timeout2 (job waiting)=%12.5e" %(pname,process_id,timeout_ask)) logger.debug("%s pid[%i]: socket_interval_after=%12.5e" %(pname,process_id, nsleep_interval)) logger.debug("%s pid[%i]: socket_interval_send=%12.5e" %(pname,process_id,nsleep_aftersend)) logger.debug("%s pid[%i]: socket_interval_request=%12.5e" %(pname,process_id, nsleep_request)) logger.debug("%s pid[%i]: socket_interval_error=%12.5e" %(pname,process_id,nsleep_connection_refused)) logger.debug("%s pid[%i]: socket_interval_update=%12.5e" %(pname,process_id, nsleep_updatelist)) logger.debug("%s pid[%i]: socket_interval_close=%12.5e" %(pname,process_id, nsleep_controller)) logger.debug("%s pid[%i]: config: elapsed time=%12.5e" %(pname,process_id, time_config)) # # # set a sampling mode # multi_sampling=True if arg5 is "single" : multi_sampling=False #logger.debug( "Pipeline pid[%i]: Pipeline sampling mode .... single" %(process_id)) elif arg5 is "multi": multi_sampling=True #logger.debug( "Pipeline pid[%i]: Pipeline sampling mode .... 
multiple" %(process_id)) elif arg5 is "response": multi_sampling=True logger.debug( "%s pid[%i]: Pipeline sampling mode .... none (response)" %(pname,process_id)) # # get a pipeline_parent_pid for multi-sampling or response mode # pipeline_parent_pid=-1 if multi_sampling: temp_pipeline_parent_pid=os.getppid() logger.debug( "%s pid[%i]: parent pid from os.getppid [%i]" %(pname,process_id,temp_pipeline_parent_pid)) try: # from VGE.pipeline_parent_pid import pipeline_parent_pid if temp_pipeline_parent_pid == pipeline_parent_pid: logger.debug("%s pid[%i]: VGE.pipeline_parent_pid [%i] is the same as that [%i] of os.getppid() in this process." %(pname,process_id,pipeline_parent_pid,temp_pipeline_parent_pid)) if process_id == pipeline_parent_pid: logger.info("%s pid[%i]: this is a parent process [id %i]" %(pname,process_id, pipeline_parent_pid)) pass elif pipeline_parent_pid == -1: # parent pid info was not used. pass elif process_id != pipeline_parent_pid: logger.info( "%s pid[%i]: parent process id [%i] (current process is a child)" %(pname,process_id, pipeline_parent_pid)) else: logger.debug( "%s pid[%i]: VGE.pipeline_parent_pid [%i] is not the same as that [%i] of os.getppid() in this process." %(pname,process_id,pipeline_parent_pid,temp_pipeline_parent_pid)) logger.info( "%s pid[%i]: current process [%i] is a main (parent) " %(pname,process_id,pipeline_parent_pid)) except: logger.debug( "%s pid[%i]: VGE.pipeline_parent_pid module was not be loaded beforehand." %(pname,process_id)) #logger.debug( "Pipeline pid[%i]: parent pid has been taken from os.getppid() in the current process." %(process_id)) logger.info( "%s pid[%i]: try to get a Pipeline parent id ...." %(pname,process_id)) logger.info( "%s pid[%i]: get a parent process id from os.getppid()..... [%i] " %(pname,process_id,temp_pipeline_parent_pid)) logger.info( "%s pid[%i]: search Pipeline related process in the master rank..." %(pname,process_id)) # parent_process_name = "" current_process_name = "" parent_process_name = get_process_name(temp_pipeline_parent_pid) current_process_name = get_process_name(process_id) # logger.debug( "%s pid[%i]: check os.getppid() ..... [%i] " %(pname,process_id,temp_pipeline_parent_pid)) logger.debug( "%s pid[%i]: check.... process_name from os.getppid() [%s] " %(pname,process_id,parent_process_name)) logger.debug( "%s pid[%i]: check.... process_name from os.getpid() [%s] " %(pname,process_id,current_process_name)) # pipeline_process_list=[] pipeline_process_list=get_pipeline_process_name_list("master") logger.info("%s pid[%i]: check .... pipeline_process_list [%s]." %(pname,process_id,pipeline_process_list)) # flag_parent=False flag_current=False for pipeline_process_name in pipeline_process_list: #logger.info( "Pipeline pid[%i]: check ...parent_process_name[%s], pipeline_process_name[%s]" %(process_id,parent_process_name,pipeline_process_name)) if pipeline_process_name in parent_process_name: logger.debug("%s pid[%i]: found Pipeline-related name [%s] in a parent process [%s]." %(pname,process_id,pipeline_process_name,parent_process_name)) flag_parent=True break for pipeline_process_name in pipeline_process_list: #logger.info( "Pipeline pid[%i]: check ...currnet_process_name[%s], pipeline_process_name[%s]" %(process_id,current_process_name,pipeline_process_name)) if pipeline_process_name in current_process_name: logger.debug("%s pid[%i]: found Pipeline-related name [%s] in a current process [%s]." 
%(pname,process_id,pipeline_process_name,current_process_name)) flag_current=True break if flag_parent and flag_current: logger.info( "%s pid[%i]: current process is a child" %(pname,process_id)) logger.info( "%s pid[%i]: parent Pipeline process id is set to be [%i] from os.getppid()." %(pname,process_id,temp_pipeline_parent_pid)) pipeline_parent_pid = temp_pipeline_parent_pid elif not flag_parent and flag_current: logger.info( "%s pid[%i]: this is on a main process." %(pname,process_id)) logger.info( "%s pid[%i]: Pipeline parent process id is set to be [%i] of the current pid." %(pname,process_id,process_id)) pipeline_parent_pid = process_id else: logger.info( "%s pid[%i]: this state should be unknown." %(pname,process_id)) logger.info( "%s pid[%i]: Pipeline process id will not be considered." %(pname,process_id)) pipeline_parent_pid = -1 # # initialize parameters # jobid =0 messeage="" flag_pipeline_task_loop=True finished="False" current_unique_jobid=0 current_vge_jobid = 0 current_max_task = 0 joblist = dict() # current_jobid=None Flag=True retry=1 # current_unique_jobid = -1 current_vge_jobid = -1 current_max_task = -1 # time_jobsend = 0.0 time_jobrecv = 0.0 time_waitsend = 0.0 time_waitrecv = 0.0 time_jobdispatch=0.0 wait_time = 0.0 # # create a new job # joblist[jobid]= dict() joblist[jobid]=make_pipeline_jobstatuslist(0) joblist[jobid]["pipeline_pid"] = process_id joblist[jobid]["pipeline_parent_pid"] = pipeline_parent_pid # # args # # get command command = "" if isinstance(arg1,str): if arg1 != "": joblist[jobid]["command"] = arg1 # get max_task max_task=0 if not isinstance(arg2, bool) and isinstance(arg2, int): if arg2 > -1: joblist[jobid]["max_task"] = int(arg2) # get basefilename if isinstance(arg3,str): if arg3 != "": joblist[jobid]["basefilename"] = arg3 # # check a request... # flag_newjob=True # # # # shutdown VGE # if arg1 == "finished": logger.info("%s pid[%i]: shutdown VGE..." %(pname,process_id)) joblist = "finished" flag_newjob=False joblist_shipped = pickle.dumps(joblist) # # shutdown VGE forcibly # elif arg1 == "forcedstop": logger.info("%s pid[%i]: forced to shutdown VGE..." %(pname,process_id)) joblist = "forcedstop" flag_newjob=False joblist_shipped = pickle.dumps(joblist) # # wake VGE up # elif arg1 == "hello": logger.info("%s pid[%i]: start up VGE..." %(pname,process_id)) joblist = "hello" flag_newjob=False # # request a list of Pipeline paretent process id which VGE knows. # elif arg1 == "pipeline_ppid_list": logger.info("%s pid[%i]: request Pipeline parent process id list..." %(pname,process_id)) joblist = "pipeline_ppid_list" flag_newjob=False # # request the rest of jobs to be performed by VGE # elif arg1 == "restofjob": logger.info("%s pid[%i]: request the number of queued jobs which VGE has..." %(pname,process_id)) joblist = "restofjob" flag_newjob=False # # send a wakeup messeage to VGE. # elif arg1 == "hello_from_pipeline": logger.info("%s pid[%i]: send hello from a Pipeline to VGE..." %(pname,process_id)) joblist = dict() joblist["hello_from_pipeline"] = True joblist["pipeline_parent_pid"] = pipeline_parent_pid flag_newjob=False # # mark a pipeline byebye flag bacause a Pipeline was finished. # elif arg1 == "byebye_from_pipeline": logger.info("%s pid[%i]: send byebye from a Pipeline to VGE..." 
%(pname,process_id)) joblist = dict() joblist["byebye_from_pipeline"] = True joblist["pipeline_parent_pid"] = pipeline_parent_pid flag_newjob=False # # package a joblist # if not flag_newjob: joblist_shipped = pickle.dumps(joblist) # # send the job to VGE # time_connection = time.time() while Flag: try: time_jobdispatch=time.time() try: sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) except Exception,error: logger.info("%s pid[%i]: socket error [%s]." %(pname,process_id,error)) if arg1 == "finished" or retry > 1: sock.settimeout(timeout_client) # # mesg = "" messeage="" # # connect with VGE # if retry==1: pass #logger.info("Pipeline pid(%i): try to connect with VGE." %process_id) # sock.connect((host, port)) logger.info("%s pid[%i]: connection with VGE was established. [access times:%i] " %(pname,process_id,retry)) time_connection = time.time() -time_connection if isinstance(joblist, dict) and flag_newjob: joblist_shipped="" joblist[jobid]["sendvgetime"] = datetime.datetime.now() # delete keys which are not used in VGE temp=joblist[jobid].copy() del temp["vge_jobid"] # joblist_shipped = pickle.dumps(temp) # # send the messeage(order) to VGE # #logger.info("Pipeline pid[%i]: new job has been submitted to VGE. [%s]" %(process_id, pickle.loads(joblist_shipped))) try: time_jobsend = time.time() #logger.info("check...") #logger.info(joblist_shipped) sock.sendall(joblist_shipped) time_jobsend = time.time() - time_jobsend time.sleep(nsleep_aftersend) except socket.error: logger.critical("%s Error!! socket sending was failed. [data size->(%i) byte :(%s)]" %(pname,len(joblist_shipped), pickle.loads(joblist_shipped))) pass #sys.exit() except Exception,error: logger.debug("%s pid[%i]: data send error (%s)" %(pname,process_id, error)) pass # # receive an incoming messeage(the jobid for this order) from VGE # #mesg = sock.recv(bufsize) temp=None mesg="" icount=0 time_jobrecv = time.time() while True: temp = sock.recv(bufsize) icount += 1 mesg += temp if temp == "" or temp is None: break elif temp[-2:] == "\n." or temp[-3:] == "\ns.": break elif temp[-1:] == ".": if mesg[-2:] == "\n." or mesg[-3:] == "\ns." : break time_jobrecv = time.time() - time_jobrecv if isinstance(mesg,str): messeage=None if mesg > 0: logger.info("%s pid[%i]: recieved mesg size=%i [%i recv times]" %(pname,process_id,len(mesg),icount)) logger.debug("%s pid[%i]: recieved mesg %s]" %(pname,process_id,mesg)) try: messeage=pickle.loads(mesg) logger.info("%s pid[%i]: response from VGE: %s" %(pname,process_id, messeage)) except EOFError: pass except ValueError: logger.error("%s pid[%i]: Value error. mesg is not recieved correctly. [bufsize:%i] [mesg:%s]" %(pname,process_id,bufsize,mesg)) except Exception,error: logger.error("%s pid[%i]: exception error.... check [%s]" %(pname,process_id,error)) #try: # messeage=pickle.loads(mesg) # logger.info("Pipeline pid[%i]: response from VGE: %s" %(process_id, messeage)) #except EOFError: # pass # # # package a message from VGE as a return value if not flag_newjob. 
# if not flag_newjob: if isinstance(messeage, dict) and joblist is "pipeline_ppid_list": result=messeage elif isinstance(messeage, str) and joblist is "restofjob": result=messeage # # check the response # if isinstance(messeage, dict) and flag_newjob: current_unique_jobid = int(messeage["unique_jobid"]) current_vge_jobid = int(messeage["vge_jobid"]) current_max_task = int(joblist[jobid]["max_task"]) check_pipeline_pid = int(messeage["pipeline_pid"]) joblist[jobid]["unique_jobid"] = current_unique_jobid joblist[jobid]["vge_jobid"] = current_vge_jobid # logger.info("%s pid[%i]: assigned: jobid [%i] --> VGE jobid [%i] " \ %(pname,process_id, current_unique_jobid, current_vge_jobid)) logger.debug("Pipeline: job contents: %s" %joblist[jobid]) logger.debug("Pipeline: CHECK : pid for jobid[%i] : pid (%i) <--> pid(%i) replied from VGE" \ %(process_id, current_unique_jobid, check_pipeline_pid)) Flag=False
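# ---------------------------------------------------------------------------
# Illustrative usage sketch (not in the original source): typical vge_task()
# calls as implied by the argument comments at the top of the function.  The
# shell command, task count and base file name are hypothetical; arg4 is
# passed as None here on the assumption that the socket intervals then fall
# back to the vge_conf defaults, and arg5 selects the sampling mode.
def _example_vge_task_usage():
    # submit a bulk job of 4 tasks; each task instance gets SGE_TASK_ID set by the worker
    result = vge_task("echo task $SGE_TASK_ID", 4, "example_job", None, "single")
    # reserved command strings handled inside vge_task()
    vge_task("hello", 0, "", None, "response")       # wake VGE up
    vge_task("restofjob", 0, "", None, "response")   # ask how many jobs remain queued
    vge_task("finished", 0, "", None, "response")    # shut VGE down
    return result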
def pipeline_jobcontroll(cl_args, job_args, total_joblist, new_joblist, task_check, pipeline_parent_pid_list, command_list): # from logging import getLogger, StreamHandler, basicConfig, DEBUG, INFO, WARNING, ERROR, CRITICAL logger = getLogger(__name__) logger.setLevel(INFO) # # set verbose level # verbose = 0 verbose = int(get_vge_conf("vge", "verbose", verbose)) if verbose == 0: logger.setLevel(CRITICAL) elif verbose == 1 or verbose == 2: logger.setLevel(INFO) elif verbose == 3: logger.setLevel(DEBUG) basicConfig(format='%(asctime)s:%(levelname)s:%(message)s') # # initialize... # flag_byebye = False flag_forcedbyebye = False task_check["exit"] = False task_check["forced"] = False task_check["restofjob"] = 0 server_connected = True # command_id = 0 jobid = 0 # is an indivitual job id used in VGE unique_jobid = 0 # is an unique job id for both VGE an Pipeline pipeline_pid_for_job = -1 # in case of the job that is not listed yet. # # mypid = os.getpid() logger.info("VGE(JOB): Pipeline job controller [pid:%i] is running. " % mypid) # #pipeline_parent_pid_list={} # pipeline_parent_pid_list[25434] = True if VGE got a final signal from Pipeline) # num_of_pipeline_sample = 0 unique_pipeline_parent_id = [] # unique_pipeline_parent_id[1] = 25434 # # # communicate with Pipeline(s) # host = socket.gethostbyname(socket.gethostname()) port = 8000 backlog = 128 bufsize = 16384 default_bufsize = bufsize flag_socket_error = False count_socket_error = 0 time_onesocket = 0.0 pipeline_connect_established = 1 flag_dispatch = False # # Config # server_timeout = 600.0 # sec nsleep_aftersend = 0.0 nsleep_updatelist = 0.0 nsleep_afterconnectionerror = 1.0 nsleep_controller = 1.0 # # read from conf # if job_args["flag_vge_conf"]: port = int(get_vge_conf("socket", "port", port)) bufsize = int(get_vge_conf("socket", "bufsize", bufsize)) server_timeout = get_vge_conf("vge", "socket_timeout", server_timeout) nsleep_aftersend = get_vge_conf("vge", "socket_interval_send", nsleep_aftersend) nsleep_updatelist = get_vge_conf("vge", "socket_interval_update", nsleep_updatelist) nsleep_afterconnectionerror = get_vge_conf( "vge", "socket_interval_error", nsleep_afterconnectionerror) nsleep_controller = get_vge_conf("vge", "socket_interval_close", nsleep_controller) # logger.info("VGE(JOB): socket host = %s" % host) logger.info("VGE(JOB): socket port = %i" % port) logger.info("VGE(JOB): socket buffer size = %i" % bufsize) logger.info("VGE(JOB): socket_interval_send=%12.5e" % nsleep_aftersend) logger.info("VGE(JOB): socket_interval_update=%12.5e" % nsleep_updatelist) logger.info("VGE(JOB): socket_interval_close=%12.5e" % nsleep_afterconnectionerror) logger.info("VGE(JOB): socket_timeout=%12.5e" % server_timeout) logger.info("VGE(JOB): socket_interval_close=%12.5e" % nsleep_controller) # while True: server_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) server_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) inputs = set([server_sock]) if not server_connected: break try: server_sock.bind((host, port)) server_sock.listen(backlog) # if flag_socket_error == True: logger.info("VGE(JOB): ....recovered (%i)" % count_socket_error) flag_socket_error = False count_socket_error = 0 # while True: # # wait for connection with Pipeline(s) # #logger.info("VGE(JOB): try to connect with Pipeline") try: inputready, outputready, exceptready = select.select( inputs, [], [], server_timeout) except Exception, error: logger.debug("VGE(JOB): check. connection error. 
[%s] " % error) if not inputready: logger.info("VGE(JOB): connection timed out. [%i sec] " % server_timeout) else: pass sum_send_time = 0.0 sum_recv_time = 0.0 socket_count = 0 for sock in inputready: time_onesocket = time.time() flag_dispatch = False if sock is server_sock: client, address = server_sock.accept() inputs.add(client) else: # # get a messeage from Pipeline # socket_count += 1 messeage = "" mesg = "" recv_time = time.time() temp = None # #mesg=sock.recv(bufsize) icount = 0 while True: temp = sock.recv(bufsize) icount += 1 mesg += temp if temp == "" or temp is None: break elif temp[-2:] == "\n." or temp[-3:] == "\ns.": break elif temp[-1:] == ".": if mesg[-2:] == "\n." or mesg[-3:] == "\ns.": break # #logger.info("VGE(JOB): recv mesg size = %i [%s] " %(len(mesg),mesg)) #logger.info("VGE(JOB): recv mesg size=%i" %(len(mesg))) recv_time = time.time() - recv_time sum_recv_time += recv_time if mesg > 0: logger.info( "VGE(JOB): recieved mesg size=%i [%i recv times]" % (len(mesg), icount)) try: messeage = pickle.loads(mesg) except EOFError: pass except ValueError: logger.error( "VGE(JOB): Value error. mesg is not recieved correctly. [bufsize:%i] [mesg:%s]" % (bufsize, mesg)) except Exception, error: logger.error( "VGE(JOB): exception error.... check [%s]" % (error)) # logger.info( "VGE(JOB): connection with Pipeline was established. [%i] " % pipeline_connect_established) pipeline_connect_established += 1 if isinstance(messeage, str): if "finished" in messeage: # the VGE controller will stop. logger.info( "VGE(JOB): got a shutdown signal from Pipeline/VGE_CONNECT." ) mesg = "byebye" flag_byebye = True send_time = time.time() sock.sendall(pickle.dumps(mesg)) send_time = time.time() - send_time sum_send_time += send_time time.sleep(nsleep_aftersend) server_connected = False #server_sock.close() elif "forcedstop" in messeage: # the VGE controller will be force to stop. logger.info( "VGE(JOB): got a forced shutdown signal from Pipeline/VGE_CONNECT." ) mesg = "byebye!!!" flag_byebye = True flag_forcedbyebye = True # task_check[ "forced"] = True # this is a forced shutdown signal for the MPI job controller # send_time = time.time() sock.sendall(pickle.dumps(mesg)) send_time = time.time() - send_time sum_send_time += send_time time.sleep(nsleep_aftersend) server_connected = False #server_sock.close() elif "hello" in messeage: # echo hello logger.info( "VGE(JOB): got a start signal from Pipeline." ) mesg = "hello" send_time = time.time() sock.sendall(pickle.dumps(mesg)) send_time = time.time() - send_time sum_send_time += send_time time.sleep(nsleep_aftersend) elif "pipeline_ppid_list" in messeage: # request Pipeline parent process id list logger.info( "VGE(JOB): request Pipeline parent process id list from vge_connect." ) mesg = "" temp = pipeline_parent_pid_list.copy( ) # this copy is a must because pipeline_parent_pid_list is a multiprocessing module object. 
send_time = time.time() sock.sendall(pickle.dumps(temp)) send_time = time.time() - send_time del temp sum_send_time += send_time time.sleep(nsleep_aftersend) elif "restofjob" in messeage: # request Pipeline parent process id list logger.info( "VGE(JOB): request the number of rest of jobs to be carried out from VGE_CONNECT" ) mesg = "" send_time = time.time() sock.sendall( pickle.dumps(task_check["restofjob"])) send_time = time.time() - send_time sum_send_time += send_time time.sleep(nsleep_aftersend) elif isinstance(messeage, dict): logger.debug( "VGE(JOB): messeage from Pipeline (%s)" % messeage) # # mark pipeline byebye flag # if messeage.has_key("byebye_from_pipeline"): temp = int(messeage["pipeline_parent_pid"]) if pipeline_parent_pid_list.has_key(temp): logger.info( "VGE(JOB): Pipeline [pid: %i] was completed." % temp) pipeline_parent_pid_list[temp] = True else: logger.info( "VGE(JOB): Pipeline [pid: %i] was not found in pipeline_parent_pid_list [%s]." % (temp, pipeline_parent_pid_list)) # # mark a new pipeline # elif messeage.has_key("hello_from_pipeline"): temp = int(messeage["pipeline_parent_pid"]) if pipeline_parent_pid_list.has_key(temp): logger.info( "VGE(JOB): we know this Pipeline [pid: %i].... hello again?" % (temp)) logger.debug( "VGE(JOB): pipeline_parent_pid_list ...%s" % pipeline_parent_pid_list) else: logger.info( "VGE(JOB): got a new Pipeline parent pid (got a hello pipeline) [%i]." % temp) # # update Pipeline parent list pipeline_parent_pid_list[temp] = False num_of_pipeline_sample += 1 unique_pipeline_parent_id.append(temp) logger.info( "VGE(JOB): [update] current num of parent pids Pipelines are running is %i ." % len(pipeline_parent_pid_list.keys())) # # job status inquiry from Pipeline # elif messeage.has_key("request"): pipeline_pid_for_job = -1 # in case of the job that is not listed yet. check_unique_jobid = int( messeage["request"] ) # this should be unique_jobid check_vge_jobid = int(messeage["vge_jobid"]) check_max_task = int(messeage["max_task"]) check_pipeline_pid = int( messeage["pipeline_pid"]) check_status = "" output = "" output += "VGE(JOB): jobid (%i) max_task(%i) job_check: " \ %(check_unique_jobid, check_max_task) logger.info( "VGE(JOB): request job status [id:%i] from Pipeline [pid:%i]" % (check_unique_jobid, check_pipeline_pid)) # # check whether the job(s) including bulk jobs for this unique jobid were done or not # check_flag = True #output_running ={} #check_return_code = 0 # # for bulk job, the code is the largest one. check_return_code = 0 # for bulk job, returned the last non-zero value if it was found. # check_jobid = int(check_vge_jobid) if check_max_task == 0: check_count = -1 if total_joblist.has_key(check_jobid): flag = True while flag: try: check_job = total_joblist[ check_jobid] flag = False except KeyError: logger.debug( "VGE(JOB): (Key) retry to read data." ) # a little bit too early to check time.sleep(nsleep_updatelist) pass except EOFError: #logger.debug("VGE(JOB): (EOF) retry to read data.") # a little bit too early to check time.sleep(nsleep_updatelist) pass # # check Pipeline process id # pipeline_pid_for_job = check_job[ "pipeline_pid"] logger.debug("VGE(JOB): pid(%i) for job(%i) <--> pid(%i) requested" \ %(pipeline_pid_for_job, check_jobid,check_pipeline_pid) ) #if pipeline_pid_for_job != check_pipeline_pid: # logger.critical("VGE(JOB): Error!!! Error!!! mismatch pid!!! 
something wrong!!!!") # exit() if check_job[ "status"] == "done" or check_job[ "status"] == "aborted": check_count = 0 temp_check_return_code = int( check_job["return_code"]) if temp_check_return_code != 0: check_return_code = temp_check_return_code logger.warning( "VGE(JOB): job id[%i] has a non-zero return code[%i] " % (check_unique_jobid, check_return_code)) del temp_check_return_code # else: #output_running[check_jobid] = check_job pass # the job is not carried out yet. else: check_count = 0 count = 0 if total_joblist.has_key(check_jobid): while check_max_task > count: check_job = dict() flag = True while flag: try: check_job = total_joblist[ check_jobid] flag = False except KeyError: logger.debug( "VGE(JOB): (Key) retry to read data." ) time.sleep( nsleep_updatelist) pass except EOFError: #logger.debug("VGE(JOB): (EOF) retry to read data.") time.sleep( nsleep_updatelist) pass bulkjob_id = int( check_job["bulkjob_id"]) # # check Pipeline process id # if count == 0: pipeline_pid_for_job = check_job[ "pipeline_pid"] logger.debug("VGE(JOB): pid(%i) for job(%i) <--> pid(%i) requested" \ %(pipeline_pid_for_job, check_jobid,check_pipeline_pid) ) #if pipeline_pid_for_job != check_pipeline_pid: # logger.critical("VGE(JOB): Error!!! mismatch pid 2!!!") # exit() if check_job[ "status"] == "done" or check_job[ "status"] == "aborted": if check_job[ "unique_jobid"] == check_unique_jobid: # check_count += 1 output += "%03i " % bulkjob_id # temp_return_code = int( check_job[ "return_code"]) if temp_return_code != 0: logger.warning( "VGE(JOB): job id[%i] (bulkjob id[%i]) has a non-zero return code[%s] " % (check_unique_jobid, bulkjob_id, temp_return_code)) check_return_code = temp_return_code # else: # this statement should not be passed. logger.critical( "VGE(JOB) something wrong!! check_jobid (%i)" % check_jobid) logger.critical( "VGE(JOB) something wrong!! check_maxtask (%i)" % check_max_task) logger.critical( "VGE(JOB) something wrong!! check_job (%s)" % check_job) for okey, ojob in total_joblist.items( ): print "total_list: jobid(%04i) unique_jobid(%04i) bulk_id(%04i)" \ %(okey, ojob["unique_jobid"], ojob["bulkjob_id"]) time.sleep(30) else: # some jobs are still running. pass # check_jobid += 1 count += 1 if check_count == check_max_task: # all jobs for this unique id output += "\n" logger.debug(output) output = "VGE(JOB): CHECK: job [id:%i] has been completed. [total %i]" % ( check_unique_jobid, check_max_task) logger.info(output) check_status = "done," + str( check_return_code ) # done,0 for normally finished job else: output += "\n" logger.debug(output) output = "VGE(JOB): CHECK: job [id:%i] is not done yet. 
[total:%i completed:%i queued:%i]" \ %(check_unique_jobid, check_max_task, check_count, (check_max_task - check_count)) logger.info(output) check_status = "wait,0" # # add Pipeline PID for this job check_status += "," + str(pipeline_pid_for_job) # # send jobstatus to Pipeline # logger.info( "VGE(JOB): send messeage to Pipeline (%s)" % check_status) send_time = time.time() sock.sendall(pickle.dumps(check_status)) send_time = time.time() - send_time sum_send_time += send_time flag_dispatch = True time.sleep(nsleep_aftersend) # # new job(s) from Pipeline # else: logger.info( "VGE(JOB): incoming messeage from Pipeline: %s" % messeage) if messeage is not None or messeage != "": messeage[ "unique_jobid"] = unique_jobid # VGE will reply to Pipeline via unique_jobid messeage.update(make_vge_jobstatuslist()) # # update command list # current_unique_jobid = unique_jobid unique_jobid += 1 command_list[command_id] = dict() temp = dict() temp["command"] = messeage["command"] temp["basefilename"] = messeage[ "basefilename"] temp["max_task"] = messeage["max_task"] temp["unique_jobid"] = current_unique_jobid temp["pipeline_pid"] = messeage[ "pipeline_pid"] temp["pipeline_parent_pid"] = messeage[ "pipeline_parent_pid"] # basefilename = messeage["basefilename"] # command_list[command_id] = temp messeage["command_id"] = command_id command_id += 1 del temp del messeage["command"] del messeage["basefilename"] #logger.debug("VGE(JOB): update command_list: %s" %command_list) # # check if pipeline parent pid is new # temp = int(messeage["pipeline_parent_pid"]) if not pipeline_parent_pid_list.has_key( temp): # # update Pipeline parent list # logger.info( "VGE(JOB): a job from new Pipeline [pid %i] has come." % temp) pipeline_parent_pid_list[temp] = False num_of_pipeline_sample += 1 unique_pipeline_parent_id.append(temp) logger.info( "VGE(JOB): [update] num of parent pids Pipelines are running is %i ." % len(pipeline_parent_pid_list. keys())) else: logger.debug( "VGE(JOB): we already know this Pipeline process id [%i]." 
% temp) # # update new_joblist # start_jobid = jobid if messeage["max_task"] == 0: filename = basefilename + ".sh.0" messeage["filename"] = filename messeage["bulkjob_id"] = 0 # new_joblist[jobid] = dict() new_joblist[jobid] = messeage # total_joblist[jobid] = dict() total_joblist[jobid] = messeage # #time.sleep(nsleep_updatelist) jobid += 1 else: # bulk job number_of_bulkjob = int( messeage["max_task"]) for bulkjob_id in range( number_of_bulkjob): filename = basefilename + ".sh.%i" % bulkjob_id messeage["filename"] = filename messeage["bulkjob_id"] = bulkjob_id new_joblist[jobid] = dict() new_joblist[jobid] = messeage # total_joblist[jobid] = dict() total_joblist[jobid] = messeage # jobid += 1 # answer = dict() answer[ "unique_jobid"] = current_unique_jobid answer["pipeline_pid"] = messeage[ "pipeline_pid"] answer["pipeline_parent_pid"] = messeage[ "pipeline_parent_pid"] answer[ "vge_jobid"] = start_jobid # this is vge_jobid which Pipeline wants to know # # send the unique jobid to the worker # send_time = time.time() sock.sendall(pickle.dumps(answer)) send_time = time.time() - send_time flag_dispatch = True sum_send_time += send_time time.sleep(nsleep_aftersend) logger.info( "VGE(JOB): reply to Pipeline: [%s] " % answer) # # else: # # send the unique jobid to Pipeline # send_time = time.time() sock.sendall(pickle.dumps(unique_jobid)) send_time = time.time() - send_time sum_send_time += send_time time.sleep(nsleep_aftersend) # # close connection with Pipeline # sock.close() inputs.remove(sock) if flag_byebye: break # # time_onesocket = time.time() - time_onesocket if flag_dispatch: if socket_count == 1: logger.info("VGE(JOB): communication with Pipeline: elapsed times -> mesg-recv[%8.3e]/send[%8.3e] 1-socket cycle[%8.3e] sec" \ %(recv_time, send_time, time_onesocket)) # elif socket_count > 1: logger.info("VGE(JOB): communication with %i Pipeline(s) : averaged elapsed times -> mesg-recv[%8.3e]/send[%8.3e] 1-socket cycle[%8.3e] sec" \ %(socket_count, sum_recv_time/socket_count, sum_send_time/socket_count, time_onesocket)) # if flag_byebye: break
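# ---------------------------------------------------------------------------
# Illustrative sketch (not in the original source): the two Pipeline-side
# message shapes that pipeline_jobcontroll() above distinguishes when a dict
# arrives over the socket.  The field values are hypothetical; the key names
# are the ones the controller actually reads.  The controller answers a job
# submission with a dict containing "unique_jobid"/"vge_jobid", and answers a
# status request with a string such as "done,0,<pipeline_pid>" or
# "wait,0,<pipeline_pid>".
def _example_pipeline_messages():
    new_job_mesg = {                      # handled by the "new job(s) from Pipeline" branch
        "command": "echo hello",          # hypothetical shell command
        "basefilename": "example_job",
        "max_task": 0,                    # 0 = single job, >0 = bulk job
        "pipeline_pid": 12345,
        "pipeline_parent_pid": 12300,
    }
    status_request_mesg = {               # handled by the "request" branch
        "request": 7,                     # unique_jobid assigned at submission
        "vge_jobid": 42,
        "max_task": 0,
        "pipeline_pid": 12345,
    }
    return new_job_mesg, status_request_mesg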
def mpi_jobcontroll_worker(cl_args, mpi_args, comm): # # master rank # master_rank = 0 # # MPI message tag # message_tag = 10 # job message from the master forcedstop_tag = 99 # forced stop message from the master # flag_forcedstop = False # # set verbose level # # logger = getLogger(__name__) logger.setLevel(INFO) # verbose = 0 verbose = int(get_vge_conf("vge", "verbose", verbose)) if verbose == 0: logger.setLevel(CRITICAL) elif verbose == 1: logger.setLevel(INFO) elif verbose == 2: logger.setLevel(CRITICAL) elif verbose == 3: logger.setLevel(DEBUG) basicConfig(format='%(asctime)s:%(levelname)s:%(message)s') # myrank = comm.Get_rank() nproc = comm.Get_size() number_of_nproc_digit = int(math.log10(nproc) + 1) myrank_str = str(myrank).zfill(number_of_nproc_digit) status = MPI.Status() # max_sizecommand = 131071 # for generic linux. for Kei, the max size is 262143 bye (less than 256KB). max_sizecommand = int( get_vge_conf("vge", "mpi_command_size", max_sizecommand)) logger.info("VGE(MPI): worker[%s]: max. command size = %i" % (myrank_str, max_sizecommand)) # job status jobstatus = None logger.info("VGE(MPI): MPI worker[%s] is running." % myrank_str) command = "" ret = 0 # return code from subprocess # probe parameters ; not used in workers #number_of_probe = 1000 #nsleep_probe=0.01 # monitoring interval of mpi_test() on worker for killing a job. this is available if monitor_worker_runjob option is on. time_interval_irecv_test = 30.0 time_interval_irecv_test = float( get_vge_conf("vge", "worker_interval_irecv_test", time_interval_irecv_test)) # pause after checking a running job [sec]. this is available if monitor_worker_runjob option is on. time_interval_runjobcheck = 10.0 time_interval_runjobcheck = float( get_vge_conf("vge", "worker_interval_runjobcheck", time_interval_runjobcheck)) # monitoring running job on worker for emergency call from master ? flag_monitor_worker_runjob = False flag_killthejob = False if cl_args.monitor_workerjob: flag_monitor_worker_runjob = True logger.info( "VGE(MPI): MPI worker[%s] will monitor a job when it is running." % myrank_str) logger.info( "VGE(MPI): MPI worker[%s] [job monitoring] worker_interval_irecv_test : [%0.2f sec]." % (myrank_str, time_interval_irecv_test)) logger.info( "VGE(MPI): MPI worker[%s] [job monitoring] worker_interval_runjobcheck: [%0.2f sec]." % (myrank_str, time_interval_runjobcheck)) # # monitor a forced-stop signal of a running job ... 
# emergency_signal_recv = comm.irecv(source=master_rank, tag=forcedstop_tag) # #////////////////////////// # job loop for worker nodes #////////////////////////// while True: # # request a job and send the previous job status to the master # start_job = time.time() start_time = time.time() #comm.send(jobstatus, dest=0, tag=myrank) comm.send(jobstatus, dest=0, tag=message_tag) end_time = time.time() mpi_send_time = end_time - start_time if jobstatus is not None: logger.info( "VGE(MPI): worker[%s]: send job status to the master: [%s]" % (myrank_str, jobstatus)) # # init # flag_forcedstop = False # # get an order from the master # start_time = time.time() #package = comm.recv(source=0, tag=MPI.ANY_TAG, status=status) package = comm.recv(source=0, tag=message_tag, status=status) end_time = time.time() mpi_recv_time = end_time - start_time command = None filename = None jobid = None bulkjob_id = None if isinstance(package, dict): command = package["command"] filename = package["filename"] jobid = package["jobid"] bulkjob_id = package["bulkjob_id"] if command is not None: logger.info( "VGE(MPI): worker[%s]: got job[id %i] from the master: [%s]" % (myrank_str, jobid, package)) elif isinstance(package, str): if package == "terminate_all_workers": break # terminate worker loop # # execute job command # ret = 0 start_time = None finish_time = None jobstatus = None if command is not None: logger.info("VGE(MPI): worker[%s]: job [id %i] has been started." % (myrank_str, jobid)) start_time = datetime.datetime.now() time_command = time.time() # check command size sizecommand = len(command) flag_execfromscript = False # # check command size # if sizecommand > max_sizecommand: flag_execfromscript = True logger.info("VGE(MPI): worker[%s]: command size [%i bytes] of job [id %i] is large. [thresh=%i]" \ %(myrank_str,sizecommand, jobid, max_sizecommand)) logger.info( "VGE(MPI): worker[%s]: job [id %i] will be executed from script file directly." % (myrank_str, jobid)) logger.info("VGE(MPI): worker[%s]: .... write file -> [%s]" % (myrank_str, filename)) # if cl_args.nowrite_jobscript: logger.info( "VGE(MPI): worker[%s]: but the file will be deleted later because '--nowrite_jobscript' option turns on " % myrank_str) # vge_script_dir = mpi_args["vge_script_dir"] filename_script = vge_script_dir + "/" + filename # time_write2 = time.time() # writefile = open(filename_script, "w") writefile.write(command) writefile.flush() writefile.close() time_write2 = time.time() - time_write2 while True: if os.path.isfile(filename_script): statinfo = os.stat(filename_script) if int(statinfo.st_size) > sizecommand / 2: break else: time.sleep(0.00005) # 50 us wait filename_stdout = vge_script_dir + "/" + filename + ".o" + str( jobid) filename_stderr = vge_script_dir + "/" + filename + ".e" + str( jobid) command = "bash %s > %s 2> %s" % ( filename_script, filename_stdout, filename_stderr) env = os.environ.copy() if bulkjob_id is not None: env["SGE_TASK_ID"] = str(bulkjob_id + 1) # # popen # time_popen = time.time() p = subprocess.Popen(command, shell=True, bufsize=-1, stdin=None, stdout=subprocess.PIPE, \ stderr=subprocess.PIPE, env=env) time_popen = time.time() - time_popen # jobpid = p.pid logger.info("VGE(MPI): worker[%s]: pid of job [id %i] ==> [%i]." % (myrank_str, jobid, jobpid)) # # wait until the job is completed. 
# wait_count = 0 stdout = "" stderr = "" time_pcomm = time.time() # if flag_monitor_worker_runjob: # time_check = time.time() # flag_running_job = True while flag_running_job: current_interval = time.time() - time_check if current_interval > time_interval_irecv_test: time_check = time.time() # check_recv = emergency_signal_recv.test( ) # req.test() gives a tuple with a mesg but req.Test() logical. logger.debug( "VGE(MPI): worker[%s]: check_recv ... %s " % (myrank_str, check_recv)) # if check_recv[ 0] is True: # master is calling me! kill the job! # mesg = check_recv[1] # get a message logger.debug( "VGE(MPI): worker[%s]: got a forced stop message [%s] from master when job [%i] is running" % (myrank_str, mesg, jobid)) logger.info( "VGE(MPI): worker[%s]: job [%i] (pid [%i]) was aborted. " % (myrank_str, jobid, jobpid)) # try: os.kill( jobpid, signal.SIGTERM ) # note :OS dependent. Should be checked on K. except Exception, error: logger.error( "VGE(MPI): worker[%s]: os.kill error was occured... [%s] " % (myrank_str, error)) pass # flag_forcedstop = True flag_running_job = False break # escape from poll() check loop # if p.poll() is not None: flag_running_job = False break # escape from poll() check loop else: time.sleep(time_interval_runjobcheck) wait_count += 1 # # job was finished. # #(stdout, stderr) = p.communicate() #ret = p.returncode if flag_forcedstop: ret = -99 # stop by emergency call from master stdout = "forced stop \n" else: (stdout, stderr) = p.communicate() ret = p.returncode # # logger.debug( "VGE(MPI): worker[%s]: result of job [%i] ... stdout:[%s] stderr:[%s] ret:[%s] " % (myrank_str, jobid, stdout, stderr, ret)) # logger.info( "VGE(MPI): worker[%s]: wait count of job [id %i] ==> [%i]." % (myrank_str, jobid, wait_count)) # # else: # # get standard and error outputs of the job # (stdout, stderr) = p.communicate() # # get a return code # ret = p.returncode # logger.debug( "VGE(MPI): worker[%s]: result of job [%i] ... stdout:[%s] stderr:[%s] ret:[%s] " % (myrank_str, jobid, stdout, stderr, ret)) # time_pcomm = time.time() - time_pcomm # if stdout is None: stdout = "" if stderr is None: stderr = "" # the overhead for os module was the same as that for subprocess... #os.system("sh ./%s" %filename) time_command = time.time() - time_command finish_time = datetime.datetime.now() logger.info( "VGE(MPI): worker[%s]: job [id %i] is completed with code [%i]." % (myrank_str, jobid, ret)) # jobstatus = {} jobstatus["jobid"] = jobid jobstatus["start_time"] = start_time jobstatus["finish_time"] = finish_time jobstatus["elapsed_time"] = (finish_time - start_time).total_seconds() # # return code (shell) # -15: aborted # 000: normally finished # 002: No such file or directory # 127: command not found # 139: segmentation fault (in shirokane) jobstatus["return_code"] = ret # # if ret < 0: jobstatus["status"] = "aborted" else: jobstatus["status"] = "done" # # write script, standard output, and standard error # time_write = time.time() if cl_args.nowrite_jobscript or flag_execfromscript: pass # skip to write files... # # delte files... 
                #
                if cl_args.nowrite_jobscript and flag_execfromscript:
                    os.remove(filename_script)
                    os.remove(filename_stdout)
                    os.remove(filename_stderr)
            else:
                logger.info("VGE(MPI): worker[%s]: write files -> [%s]"
                            % (myrank_str, filename))
                vge_script_dir = mpi_args["vge_script_dir"]
                #
                filename_script = vge_script_dir + "/" + filename
                try:
                    writefile = open(filename_script, "w")
                    writefile.write(command)
                    writefile.close()
                except Exception as error:
                    logger.error(
                        "VGE(MPI): worker[%s]: write error [script] [%s] [%s] [%s]"
                        % (myrank_str, filename, error, command))
                    pass
                #
                filename_stdout = vge_script_dir + "/" + filename + ".o" + str(jobid)
                try:
                    writefile = open(filename_stdout, "w")
                    writefile.write(stdout)
                    writefile.close()
                except Exception as error:
                    logger.error(
                        "VGE(MPI): worker[%s]: write error [stdout] [%s] [%s] [%s]"
                        % (myrank_str, filename, error, stdout))
                    pass
                #
                filename_stderr = vge_script_dir + "/" + filename + ".e" + str(jobid)
                try:
                    writefile = open(filename_stderr, "w")
                    writefile.write(stderr)
                    writefile.close()
                except Exception as error:
                    logger.error(
                        "VGE(MPI): worker[%s]: write error [stderr] [%s] [%s] [%s]"
                        % (myrank_str, filename, error, stderr))
                    pass

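# ---------------------------------------------------------------------------
# Illustrative sketch (not called by the pipeline): the run-and-monitor
# pattern used in the worker loop above, reduced to its essentials. It starts
# a shell command with subprocess.Popen and, while polling the process,
# tests a non-blocking MPI receive for a forced-stop message from the master.
# The communicator, source rank, tag, and poll interval are passed in as
# hypothetical arguments here; the real loop uses comm, master_rank, and
# forcedstop_tag set up elsewhere in this file, and the module-level imports
# of os, signal, subprocess, and time are assumed.
# ---------------------------------------------------------------------------
def _run_job_with_forcedstop_sketch(command, comm, master_rank, forcedstop_tag,
                                    poll_interval=0.1):
    # post the non-blocking receive before launching the job
    stop_request = comm.irecv(source=master_rank, tag=forcedstop_tag)
    proc = subprocess.Popen(command, shell=True,
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    while proc.poll() is None:
        # Request.test() returns (flag, message); flag becomes True once the
        # master has posted a matching send.
        arrived, _mesg = stop_request.test()
        if arrived:
            os.kill(proc.pid, signal.SIGTERM)  # OS dependent, as noted above
            return -99, "forced stop \n", ""
        time.sleep(poll_interval)
    stdout, stderr = proc.communicate()
    return proc.returncode, stdout, stderr
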
def checkpoint_restart(target_dir):

    elapsed_time = time.time()

    # init_restart_conf()
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # verbose = int(get_restart_conf("verbose", 0))
    verbose = int(get_vge_conf("vge", "verbose", 0))
    if verbose == 0:
        logger.setLevel(CRITICAL)
    elif verbose == 1 or verbose == 2:
        logger.setLevel(INFO)
    elif verbose == 3:
        logger.setLevel(DEBUG)
    basicConfig(format='%(asctime)s:%(levelname)s:%(message)s')

    pname = "Pipeline(CHECKPOINT RESTART)"
    process_id = os.getpid()

    logger.info("%s pid[%i]: checkpoint judgment and reading the start task "
                % (pname, process_id))

    # chk_file = int(get_restart_conf("chk_file", 2))
    chk_file = int(get_vge_conf("restart", "chk_file", 2))

    # common parameters
    global chk_work_dir
    global chk_target_dir
    global chk_checkpoint_dir

    work_dir_path = os.path.dirname(target_dir)
    work_dir_path = os.path.abspath(work_dir_path)
    work_dir_path = os.path.normpath(work_dir_path)
    checkpoint_dir_path = os.path.join(target_dir, "checkpoint")
    checkpoint_dir_path = os.path.abspath(checkpoint_dir_path)
    checkpoint_dir_path = os.path.normpath(checkpoint_dir_path)
    target_dir_path = os.path.abspath(target_dir)
    target_dir_path = os.path.normpath(target_dir_path)

    chk_work_dir = work_dir_path
    chk_target_dir = target_dir_path
    chk_checkpoint_dir = checkpoint_dir_path

    restartdata_dir = os.path.join(chk_checkpoint_dir, "backupdata")
    restartdata_dir = os.path.abspath(restartdata_dir)
    restartdata_dir = os.path.normpath(restartdata_dir)

    # reconst_checkpoint
    flag_skiptask = {}
    flag_starttask = {}
    if not os.path.isdir(chk_checkpoint_dir):
        logger.info("%s pid[%i]: normal start" % (pname, process_id))
        elapsed_time = time.time() - elapsed_time
        logger.info("%s pid[%i]: checkpoint_restart elapsed_time %f[sec]"
                    % (pname, process_id, elapsed_time))
        return flag_skiptask, flag_starttask, {}

    # checkpoint master files
    masterfile_list = []
    # build the list of checkpoint master files
    for filename in os.listdir(chk_checkpoint_dir):
        basename, ext = os.path.splitext(filename)
        if ".mst" == ext:
            masterfile_list.append(filename)

    # if there is no checkpoint master file at all, do not restart
    if not masterfile_list:
        logger.info("%s pid[%i]: not restart, checkpoint start"
                    % (pname, process_id))
        elapsed_time = time.time() - elapsed_time
        logger.info("%s pid[%i]: checkpoint_restart elapsed_time %f[sec]"
                    % (pname, process_id, elapsed_time))
        return flag_skiptask, flag_starttask, {}

    set_task_list = []
    set_checkpointfiles = {}
    dict_route = {}
    for mstfile in masterfile_list:
        chk_masterfile = os.path.join(chk_checkpoint_dir, mstfile)
        logger.debug("%s pid[%i]: checkpoint master file path: %s"
                     % (pname, process_id, chk_masterfile))
        if os.path.isfile(chk_masterfile):
            try:
                checkpoint_master = open(chk_masterfile, "rb")
                logger.info("%s pid[%i]: restart start, open checkpoint master"
                            % (pname, process_id))
                tmplist = checkpoint_master.readlines()
                checkpoint_master.close()
            except IOError as e:
                logger.info(
                    "%s pid[%i]: error occurred while opening the master file [%s]. "
                    % (pname, process_id, str(e)))
                exit()
        else:
            logger.info("%s pid[%i]: not restart, checkpoint start"
                        % (pname, process_id))
            exit()

        # check the contents of the checkpoint master data:
        # the first column of the master data is the task that wrote the checkpoint,
        # the second column is the path to the checkpoint data,
        # and if the first column is "final" nothing is done
        for line in tmplist:
            line_list = line.split(",")
            task = line_list[0]
            set_task_list.append(task)
            # the list has length 1 at first, so without this "if" the "elif" below would raise an error
            if (len(set_task_list) == 1):
                set_checkpointfiles[task] = [line_list[1].strip()]
            # true if the previous task name is the same
            elif (set_task_list[-2] == set_task_list[-1]):
                set_checkpointfiles[task].append(line_list[1].strip())
            else:
                set_checkpointfiles[task] = [line_list[1].strip()]

        if set_task_list[-1] != "final":
            logger.debug("%s pid[%i]: set_task_list: %s"
                         % (pname, process_id, set_task_list))
            flag_starttask[set_task_list[-1]] = True

        # build a dictionary that links the task name to its route tag
        # dict_route[task] = the route-tag part of the master filename (characters 0-10 of the basename are "checkpoint_")
        basename, ext = os.path.splitext(mstfile)
        dict_route[task] = basename[11:]

    # set the flags based on the master file information
    for task in set_task_list:
        # skip tasks whose checkpoints were already written
        # checkpointtag = the task-name part of the checkpoint filename (characters 0-10 of the basename are "checkpoint_")
        for checkpointpath in set_checkpointfiles[task]:
            basename, ext = os.path.splitext(os.path.basename(checkpointpath))
            checkpointtag = basename[11:]
            flag_skiptask[checkpointtag] = True

    # read checkpointdata
    # verify_checkpoint
    restart_files_list_dict = {}
    for task, startflag in flag_starttask.items():
        new_outputfiles_list = []
        for checkpointpath in set_checkpointfiles[task]:

            # fix the path of the checkpoint data
            chkpoint_filename = os.path.basename(checkpointpath)
            checkpoint_absdir = os.path.abspath(chk_checkpoint_dir)
            checkpoint_absdir = os.path.normpath(checkpoint_absdir)
            new_checkpoint_filepath = os.path.join(checkpoint_absdir, chkpoint_filename)
            new_checkpoint_filepath = os.path.normpath(new_checkpoint_filepath)
            readfilename = new_checkpoint_filepath

            checkpointdata = None
            binalyfile = None
            # open the checkpoint data
            try:
                binalyfile = open(readfilename, "rb")
                loaded_package = pickle.load(binalyfile)
                binalyfile.close()
                checkpointdata = pickle.loads(loaded_package)
            except IOError as e:
                exit()

            # file consistency check
            if chk_file == 0:
                pass
            elif chk_file == 1:
                restart_input_files_set = set()
                for root, dirs, files in os.walk(restartdata_dir + "_" + dict_route[task]):
                    target_dirname = root[len(restartdata_dir + "_" + dict_route[task]) + 1:]
                    if target_dirname == "":
                        target_dirname = "."
                    for filename in files:
                        restart_input_files_set.add(target_dirname + "/" + filename)
                if not (set(checkpointdata["targetdirdict"]["filename_list"]) <= restart_input_files_set):
                    logger.warning(
                        "%s pid[%i]: (VERIFY WARNING) cannot restart because the backed-up files do not cover the checkpointed file list...."
                        % (pname, process_id))
                    logger.debug(
                        "%s pid[%i]: (VERIFY WARNING) restart_input_files_set: %s"
                        % (pname, process_id, restart_input_files_set))
                    logger.debug(
                        "%s pid[%i]: (VERIFY WARNING) set(checkpointdata[\"targetdirdict\"][\"filename_list\"]): %s"
                        % (pname, process_id,
                           set(checkpointdata["targetdirdict"]["filename_list"])))
                    exit()
            elif chk_file == 2:
                for i in range(len(checkpointdata["targetdirdict"]["filename_list"])):
                    restart_data_file = os.path.join(
                        restartdata_dir + "_" + dict_route[task],
                        checkpointdata["targetdirdict"]["filename_list"][i])
                    if os.path.islink(restart_data_file) and not os.path.isfile(restart_data_file):
                        # repair the broken symbolic link
                        current_dir = os.getcwd()
                        current_dir_len = len(current_dir.split("/"))
                        realpath = os.path.realpath(restart_data_file)
                        # replace the current-directory part of the path (for the K computer)
                        realpath_split = realpath.split("/")
                        dircount = 0
                        new_path = ""
                        for dirname in realpath_split:
                            dircount = dircount + 1
                            if dircount > current_dir_len:
                                new_path = new_path + "/" + dirname
                        new_src_file = os.path.normpath(current_dir + new_path)
                        os.unlink(restart_data_file)
                        os.symlink(new_src_file, restart_data_file)
                        logger.debug("%s pid[%i]: re-created symbolic link %s to %s"
                                     % (pname, process_id, restart_data_file, new_src_file))
                    if not os.path.isfile(restart_data_file):
                        logger.warning(
                            "%s pid[%i]: (VERIFY WARNING) cannot restart because 'restart_data_file' does not exist...."
                            % (pname, process_id))
                        logger.debug(
                            "%s pid[%i]: (VERIFY WARNING) restart_data_file: %s"
                            % (pname, process_id, restart_data_file))
                        exit()
                    if os.stat(restart_data_file).st_size != checkpointdata["targetdirdict"]["size_list"][i]:
                        logger.warning(
                            "%s pid[%i]: (VERIFY WARNING) cannot restart because the file size does not match"
                            % (pname, process_id))
                        logger.debug(
                            "%s pid[%i]: (VERIFY WARNING) restart_data_file: %s"
                            % (pname, process_id, restart_data_file))
                        exit()

            # build the outputfiles passed to the input of the next task (fix the paths stored in the checkpoint data)
            ret = 0
            chkpointd_outputfiles_list = checkpointdata["outputfiles"]
            chkpointd_target_dir = checkpointdata["targetdir"]
            # inspect each element of the previous run's outputs;
            # an element is either a list (multiple files) or a string (a single file)
            for x in chkpointd_outputfiles_list:
                if type(x) is list:
                    new_outputfiles_list2 = []
                    # each element of x is a single file path; rewrite it to the new path and append it
                    for output_file_path in x:
                        new_file_name = os.path.join(
                            chk_target_dir,
                            output_file_path[len(chkpointd_target_dir) + 1:])
                        new_file_name = os.path.abspath(new_file_name)
                        new_file_name = os.path.normpath(new_file_name)
                        new_outputfiles_list2.append(new_file_name)
                    new_outputfiles_list.append(new_outputfiles_list2)
                else:
                    # x is a single file path; rewrite it to the new path and append it
                    new_file_name = os.path.join(
                        chk_target_dir, x[len(chkpointd_target_dir) + 1:])
                    new_file_name = os.path.abspath(new_file_name)
                    new_file_name = os.path.normpath(new_file_name)
                    new_outputfiles_list.append(new_file_name)

        restart_files_list_dict[dict_route[task]] = new_outputfiles_list
        logger.debug("%s pid[%i]: new_outputfiles_list: %s"
                     % (pname, process_id, new_outputfiles_list))
        logger.debug("%s pid[%i]: task: %s" % (pname, process_id, task))
        logger.debug("%s pid[%i]: dict_route: %s"
                     % (pname, process_id, dict_route[task]))

        # restore the backed-up files for restart
        backup_dirs = os.listdir(restartdata_dir + "_" + dict_route[task])
        for backup_dir in backup_dirs:
            src_backup_dir = os.path.join(
                restartdata_dir + "_" + dict_route[task], backup_dir)
            args = ["cp", "-lr", src_backup_dir, target_dir]
            ret = subprocess.call(args)
            logger.warning(
                "%s pid[%i]: (WRITE WARNING) restored backup file, subprocess.call retcode %d"
                % (pname, process_id, ret))
            logger.debug("%s pid[%i]: src_backup_dir: %s"
                         % (pname, process_id, src_backup_dir))

    elapsed_time = time.time() - elapsed_time
    logger.info("%s pid[%i]: checkpoint_restart elapsed_time %f[sec]"
                % (pname, process_id, elapsed_time))

    return flag_skiptask, flag_starttask, restart_files_list_dict
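
# ---------------------------------------------------------------------------
# Illustrative usage sketch (not called by the pipeline): how a pipeline
# driver might consume the three values returned by checkpoint_restart().
# The task names and the run order below are hypothetical; the only
# assumption taken from this module is the return signature
# (flag_skiptask, flag_starttask, restart_files_list_dict).
# ---------------------------------------------------------------------------
def _restart_usage_sketch(target_dir):
    flag_skiptask, flag_starttask, restart_files = checkpoint_restart(target_dir)

    for task in ["task_A", "task_B", "final"]:  # hypothetical task order
        if flag_skiptask.get(task):
            # this task already wrote its checkpoint in the previous run
            continue
        if flag_starttask.get(task):
            # restart point: feed the task the output files recovered from
            # the checkpoint data instead of recomputing its inputs
            for route_tag, outputfiles in restart_files.items():
                print("restart %s on route %s with %d recovered outputs"
                      % (task, route_tag, len(outputfiles)))
        # ... otherwise run the task normally (omitted) ...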