def create_apg(self):
    request = CreateAutoProvisioningGroupRequest()
    request.set_accept_format('json')
    request.set_TotalTargetCapacity(str(self.nchunks_limit))
    request.set_LaunchTemplateId(self.cloud_resources["template_id"])
    request.set_AutoProvisioningGroupName(
        self.cloud_resources["instance_name"]
        + ''.join(random.choice(string.ascii_uppercase) for _ in range(20)))
    request.set_AutoProvisioningGroupType("maintain")
    request.set_SpotAllocationStrategy("lowest-price")
    request.set_SpotInstanceInterruptionBehavior("terminate")
    request.set_SpotInstancePoolsToUseCount(1)
    request.set_ExcessCapacityTerminationPolicy("termination")
    request.set_TerminateInstances(True)
    request.set_PayAsYouGoTargetCapacity("0")
    request.set_SpotTargetCapacity(str(self.nchunks_limit))
    config = self.generate_config()
    request.set_LaunchTemplateConfigs(config)
    try:
        response = self.client.do_action_with_exception(request)
        response = json.loads(response)
        with open('apg_id.json', 'w') as fp:
            json.dump({'apg_id': response["AutoProvisioningGroupId"]}, fp, indent=4)
        return response["AutoProvisioningGroupId"]
    except ServerException as e:
        dlog.info("create apg failed, err msg: %s" % e)
        sys.exit()
    except ClientException as e:
        dlog.info("create apg failed, err msg: %s" % e)
        sys.exit()
def create_template(self, image_id, sg_id, vpc_id):
    request = CreateLaunchTemplateRequest()
    request.set_accept_format('json')
    request.set_LaunchTemplateName(
        ''.join(random.choice(string.ascii_uppercase) for _ in range(20)))
    request.set_ImageId(image_id)
    request.set_ImageOwnerAlias("self")
    request.set_PasswordInherit(True)
    if "address" in self.cloud_resources and self.cloud_resources['address'] == "public":
        request.set_InternetMaxBandwidthIn(100)
        request.set_InternetMaxBandwidthOut(100)
    request.set_InstanceType("ecs.c6.large")
    request.set_InstanceName(self.cloud_resources["instance_name"])
    request.set_SecurityGroupId(sg_id)
    request.set_VpcId(vpc_id)
    request.set_SystemDiskCategory("cloud_efficiency")
    request.set_SystemDiskSize(70)
    request.set_IoOptimized("optimized")
    request.set_InstanceChargeType("PostPaid")
    request.set_NetworkType("vpc")
    request.set_SpotStrategy("SpotWithPriceLimit")
    request.set_SpotPriceLimit(100)
    try:
        response = self.client.do_action_with_exception(request)
        response = json.loads(response)
        return response["LaunchTemplateId"]
    except ServerException as e:
        dlog.info(e)
        sys.exit()
    except ClientException as e:
        dlog.info(e)
        sys.exit()
def check_status(self, job):
    job_id = job.job_id
    if job_id == "":
        return JobStatus.unsubmitted
    ret, stdin, stdout, stderr \
        = self.context.block_call("qstat -x " + job_id)
    err_str = stderr.read().decode('utf-8')
    if ret != 0:
        if "qstat: Unknown Job Id" in err_str or "Job has finished" in err_str:
            if self.check_finish_tag(job=job):
                return JobStatus.finished
            else:
                return JobStatus.terminated
        else:
            raise RuntimeError(
                "status command qstat fails to execute. error info: %s return code %d"
                % (err_str, ret))
    status_line = stdout.read().decode('utf-8').split('\n')[-2]
    status_word = status_line.split()[-2]
    # dlog.info(status_word)
    if status_word in ["Q", "H"]:
        return JobStatus.waiting
    elif status_word in ["R"]:
        return JobStatus.running
    elif status_word in ["C", "E", "K", "F"]:
        if self.check_finish_tag(job):
            dlog.info(f"job: {job.job_hash} {job.job_id} finished")
            return JobStatus.finished
        else:
            return JobStatus.terminated
    else:
        return JobStatus.unknown
def describe_apg_instances(self):
    request = DescribeAutoProvisioningGroupInstancesRequest()
    request.set_accept_format('json')
    request.set_AutoProvisioningGroupId(self.cloud_resources["apg_id"])
    request.set_PageSize(100)
    iteration = self.nchunks // 100
    instance_list = []
    for i in range(iteration + 1):
        request.set_PageNumber(i + 1)
        count = 0
        flag = 0
        err_msg = 0
        while count < 10:
            try:
                response = self.client.do_action_with_exception(request)
                response = json.loads(response)
                for ins in response["Instances"]["Instance"]:
                    instance_list.append(ins["InstanceId"])
                flag = 1
                break
            except ServerException as e:
                # dlog.info(e)
                err_msg = e
                count += 1
            except ClientException as e:
                # dlog.info(e)
                err_msg = e
                count += 1
        if not flag:
            dlog.info("describe_apg_instances failed, err msg: %s" % err_msg)
            sys.exit()
    return instance_list
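# --- Hedged illustration (not from the source) ---
# The ALI helpers above and below share a retry pattern: each SDK call is
# attempted up to 10 times (usually with a 10 s pause between attempts) and
# the process exits if every attempt fails.  A minimal generic sketch of that
# pattern, with `do_action` standing in for the real
# client.do_action_with_exception call, looks like this:
import time


def call_with_retry(do_action, max_retry=10, wait=10):
    """Retry a callable up to max_retry times, pausing between attempts."""
    last_err = None
    for _ in range(max_retry):
        try:
            return do_action()
        except Exception as err:  # ServerException / ClientException in the source
            last_err = err
            time.sleep(wait)
    raise RuntimeError("call failed after %d retries: %s" % (max_retry, last_err))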
def get_image_id(self, img_name):
    request = DescribeImagesRequest()
    request.set_accept_format('json')
    request.set_ImageOwnerAlias("self")
    request.set_PageSize(20)
    response = self.client.do_action_with_exception(request)
    response = json.loads(response)
    totalcount = response["TotalCount"]
    iteration = totalcount // 20
    if iteration * 20 < totalcount:
        iteration += 1
    for ii in range(1, iteration + 1):
        count = 0
        flag = 0
        request.set_PageNumber(ii)
        while count < 10:
            try:
                response = self.client.do_action_with_exception(request)
                response = json.loads(response)
                for img in response["Images"]["Image"]:
                    if img["ImageName"] == img_name:
                        return img["ImageId"]
                flag = 1
                break
            except:
                count += 1
                time.sleep(10)
        if not flag:
            dlog.info("get image failed, exit")
            sys.exit()
def delete(self, ii):
    '''delete one machine'''
    request = DeleteInstancesRequest()
    request.set_accept_format('json')
    request.set_InstanceIds([self.dispatcher_list[ii]["entity"].instance_id])
    request.set_Force(True)
    count = 0
    flag = 0
    while count < 10:
        try:
            response = self.client.do_action_with_exception(request)
            flag = 1
            break
        except ServerException as e:
            time.sleep(10)
            count += 1
    if flag:
        status_list = [item["dispatcher_status"] for item in self.dispatcher_list]
        running_num = status_list.count("running")
        running_num += status_list.count("unsubmitted")
        self.change_apg_capasity(running_num)
    else:
        dlog.info("delete failed, exit")
        sys.exit()
def all_finished(self, job_handler, mark_failure, clean=True):
    task_chunks = job_handler['task_chunks']
    task_chunks_str = ['+'.join(ii) for ii in task_chunks]
    task_hashes = [sha1(ii.encode('utf-8')).hexdigest() for ii in task_chunks_str]
    job_list = job_handler['job_list']
    job_record = job_handler['job_record']
    command = job_handler['command']
    tag_failure_list = ['tag_failure_%d' % ii for ii in range(len(command))]
    resources = job_handler['resources']
    outlog = job_handler['outlog']
    errlog = job_handler['errlog']
    backward_task_files = job_handler['backward_task_files']
    dlog.debug('checking jobs')
    nchunks = len(task_chunks)
    for idx in range(nchunks):
        cur_hash = task_hashes[idx]
        rjob = job_list[idx]
        if not job_record.check_finished(cur_hash):
            # chunk not finished according to record
            status = rjob['batch'].check_status()
            job_uuid = rjob['context'].job_uuid
            dlog.debug('checked job %s' % job_uuid)
            if status == JobStatus.terminated:
                job_record.increase_nfail(cur_hash)
                if job_record.check_nfail(cur_hash) > 3:
                    raise RuntimeError('Job %s failed for more than 3 times' % job_uuid)
                dlog.info('job %s terminated, submit again' % job_uuid)
                dlog.debug('try %s times for %s'
                           % (job_record.check_nfail(cur_hash), job_uuid))
                rjob['batch'].submit(task_chunks[idx], command, res=resources,
                                     outlog=outlog, errlog=errlog, restart=True)
            elif status == JobStatus.finished:
                dlog.info('job %s finished' % job_uuid)
                if mark_failure:
                    rjob['context'].download(task_chunks[idx], tag_failure_list,
                                             check_exists=True, mark_failure=False)
                    rjob['context'].download(task_chunks[idx], backward_task_files,
                                             check_exists=True)
                else:
                    rjob['context'].download(task_chunks[idx], backward_task_files)
                if clean:
                    rjob['context'].clean()
                job_record.record_finish(cur_hash)
                job_record.dump()
    job_record.dump()
    return job_record.check_all_finished()
def download_(self, job_dirs, remote_down_files, check_exists=False,
              mark_failure=True, back_error=False):
    cwd = os.getcwd()
    for ii in job_dirs:
        local_job = os.path.join(self.local_root, ii)
        remote_job = os.path.join(self.remote_root, ii)
        # copy the list so that appending error files does not mutate the
        # caller's remote_down_files across iterations
        flist = list(remote_down_files)
        if back_error:
            os.chdir(remote_job)
            flist += glob('error*')
            os.chdir(cwd)
        for jj in flist:
            rfile = os.path.join(remote_job, jj)
            lfile = os.path.join(local_job, jj)
            if not os.path.realpath(rfile) == os.path.realpath(lfile):
                if (not os.path.exists(rfile)) and (not os.path.exists(lfile)):
                    if check_exists:
                        if mark_failure:
                            with open(os.path.join(self.local_root, ii,
                                                   'tag_failure_download_%s' % jj),
                                      'w') as fp:
                                pass
                        else:
                            pass
                    else:
                        raise RuntimeError('do not find download file ' + rfile)
                elif (not os.path.exists(rfile)) and (os.path.exists(lfile)):
                    # already downloaded
                    pass
                elif (os.path.exists(rfile)) and (not os.path.exists(lfile)):
                    # trivial case, download happily
                    shutil.move(rfile, lfile)
                elif (os.path.exists(rfile)) and (os.path.exists(lfile)):
                    # both exist, replace!
                    dlog.info('find existing %s, replacing by %s' % (lfile, rfile))
                    if os.path.isdir(lfile):
                        shutil.rmtree(lfile, ignore_errors=True)
                    elif os.path.isfile(lfile) or os.path.islink(lfile):
                        os.remove(lfile)
                    shutil.move(rfile, lfile)
                else:
                    raise RuntimeError('should not reach here!')
            else:
                # do nothing in the case of linked files
                pass
    os.chdir(cwd)
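# --- Hedged illustration (not from the source) ---
# download_() applies a four-way decision to every (remote, local) file pair.
# The helper below only summarizes that decision for readability; `resolve`
# is an illustrative name, not dpdispatcher API.
import os


def resolve(rfile, lfile, check_exists=False):
    """Describe what download_() does for one remote/local file pair."""
    if os.path.realpath(rfile) == os.path.realpath(lfile):
        return "same file (link): nothing to do"
    r_exists, l_exists = os.path.exists(rfile), os.path.exists(lfile)
    if not r_exists and not l_exists:
        return "write tag_failure / skip" if check_exists else "raise: file missing"
    if not r_exists and l_exists:
        return "already downloaded"
    if r_exists and not l_exists:
        return "move remote -> local"
    return "replace local with remote"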
def _rmtree(self, sftp, remotepath, level=0, verbose=False):
    for f in sftp.listdir_attr(remotepath):
        rpath = os.path.join(remotepath, f.filename)
        if stat.S_ISDIR(f.st_mode):
            self._rmtree(sftp, rpath, level=(level + 1))
        else:
            rpath = os.path.join(remotepath, f.filename)
            if verbose:
                dlog.info('removing %s%s' % (' ' * level, rpath))
            sftp.remove(rpath)
    if verbose:
        dlog.info('removing %s%s' % (' ' * level, remotepath))
    sftp.rmdir(remotepath)
def ensure_alive(self, max_check=10, sleep_time=10):
    count = 1
    while not self._check_alive():
        if count == max_check:
            raise RuntimeError('cannot connect ssh after %d failures at interval %d s'
                               % (max_check, sleep_time))
        dlog.info('connection check failed, try to reconnect to ' + self.remote_root)
        self._setup_ssh()
        count += 1
        time.sleep(sleep_time)
def _rmtree(self, remotepath, verbose=False):
    """Remove the remote path."""
    # The original implementation removed files one by one over sftp.
    # If the latency of the remote server is high, that is very slow.
    # It is therefore better to use the system's `rm` to remove a directory,
    # which may save a lot of time.
    if verbose:
        dlog.info('removing %s' % remotepath)
    # On some supercomputers it is very slow to remove large numbers of files
    # (e.g. a directory containing trajectories) due to bad I/O performance,
    # so an asynchronous option is provided.
    self.block_checkcall('rm -rf %s' % remotepath,
                         asynchronously=self.clean_asynchronously)
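# --- Hedged illustration (not from the source) ---
# A local analogue of the design choice described in the comments above:
# delegate removal of a large tree to the system `rm -rf`, optionally without
# waiting for it to finish.  `remove_tree` is a hypothetical helper, not part
# of dpdispatcher.
import shlex
import subprocess


def remove_tree(path, asynchronously=False):
    cmd = "rm -rf %s" % shlex.quote(path)
    if asynchronously:
        # fire and forget: the call returns immediately while `rm` keeps running
        subprocess.Popen(cmd, shell=True)
    else:
        subprocess.check_call(cmd, shell=True)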
def check_status(self, job, retry=0, max_retry=3):
    job_id = job.job_id
    if job_id == '':
        return JobStatus.unsubmitted
    ret, stdin, stdout, stderr \
        = self.context.block_call('squeue -o "%.18i %.2t" -j ' + job_id)
    if ret != 0:
        err_str = stderr.read().decode('utf-8')
        if "Invalid job id specified" in err_str:
            if self.check_finish_tag(job):
                return JobStatus.finished
            else:
                return JobStatus.terminated
        elif "Socket timed out on send/recv operation" in err_str:
            # retry up to max_retry times
            if retry < max_retry:
                dlog.warning("Get error code %d in checking status through ssh "
                             "with job: %s . message: %s"
                             % (ret, job.job_hash, err_str))
                dlog.warning("Sleep 60 s and retry checking...")
                # rest 60 s
                time.sleep(60)
                # pass the job object (not its id) on the recursive retry
                return self.check_status(job, retry=retry + 1, max_retry=max_retry)
        else:
            raise RuntimeError(
                "status command squeue fails to execute."
                "job_id:%s \n error message:%s\n return code %d\n"
                % (job_id, err_str, ret))
    status_line = stdout.read().decode('utf-8').split('\n')[-2]
    status_word = status_line.split()[-1]
    if not (len(status_line.split()) == 2 and status_word.isupper()):
        raise RuntimeError("Error in getting job status, "
                           + f"status_line = {status_line}, "
                           + f"parsed status_word = {status_word}")
    if status_word in ["PD", "CF", "S"]:
        return JobStatus.waiting
    elif status_word in ["R"]:
        return JobStatus.running
    elif status_word in ["CG"]:
        return JobStatus.completing
    elif status_word in ["C", "E", "K", "BF", "CA", "CD", "F", "NF", "PR", "SE", "ST", "TO"]:
        if self.check_finish_tag(job):
            dlog.info(f"job: {job.job_hash} {job.job_id} finished")
            return JobStatus.finished
        else:
            return JobStatus.terminated
    else:
        return JobStatus.unknown
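# --- Hedged illustration (not from the source) ---
# check_status() above relies on `squeue -o "%.18i %.2t" -j <job_id>` printing
# a header line followed by one "<job_id> <state>" line.  The standalone
# sketch below (sample text and helper name are assumptions) shows how the
# second-to-last line is reduced to the two-letter state code.
def _parse_squeue_state(squeue_output):
    status_line = squeue_output.split('\n')[-2]
    status_word = status_line.split()[-1]
    return status_word


_sample = "             JOBID ST\n           1234567  R\n"
assert _parse_squeue_state(_sample) == "R"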
def prepare(self):
    restart = False
    if os.path.exists('apg_id.json'):
        with open('apg_id.json') as fp:
            apg = json.load(fp)
            self.cloud_resources["apg_id"] = apg["apg_id"]
        task_chunks_str = ['+'.join(ii) for ii in self.task_chunks]
        task_hashes = [sha1(ii.encode('utf-8')).hexdigest() for ii in task_chunks_str]
        for ii in range(self.nchunks):
            fn = 'jr.%.06d.json' % ii
            if os.path.exists(os.path.join(os.path.abspath(self.work_path), fn)):
                cur_hash = task_hashes[ii]
                job_record = JobRecord(self.work_path, self.task_chunks[ii], fn)
                if not job_record.check_finished(cur_hash):
                    if not self.check_spot_callback(
                            job_record.record[cur_hash]['context']['instance_id']):
                        self.dispatcher_list[ii]["entity"] = Entity(
                            job_record.record[cur_hash]['context']['ip'],
                            job_record.record[cur_hash]['context']['instance_id'],
                            job_record)
                        self.make_dispatcher(ii)
                        self.dispatcher_list[ii]["dispatcher_status"] = "unsubmitted"
                    else:
                        os.remove(os.path.join(os.path.abspath(self.work_path), fn))
                else:
                    self.dispatcher_list[ii]["dispatcher_status"] = "finished"
        self.server_pool = self.get_server_pool()
        self.ip_pool = self.get_ip(self.server_pool)
        restart = True
    img_id = self.get_image_id(self.cloud_resources["img_name"])
    sg_id, vpc_id = self.get_sg_vpc_id()
    self.cloud_resources["template_id"] = self.create_template(img_id, sg_id, vpc_id)
    self.cloud_resources["vsw_id"] = self.get_vsw_id(vpc_id)
    if not restart:
        dlog.info("begin to create apg")
        self.cloud_resources["apg_id"] = self.create_apg()
        time.sleep(120)
        self.server_pool = self.get_server_pool()
        self.ip_pool = self.get_ip(self.server_pool)
    else:
        dlog.info("restart dispatcher")
def check_status(self, job):
    job_id = job.job_id
    # print('shell.check_status.job_id', job_id)
    # job_state = JobStatus.unknown
    if job_id == "":
        return JobStatus.unsubmitted
    if_job_exists = psutil.pid_exists(pid=job_id)
    if self.check_finish_tag(job=job):
        dlog.info(f"job: {job.job_hash} {job.job_id} finished")
        return JobStatus.finished
    if if_job_exists:
        return JobStatus.running
    else:
        return JobStatus.terminated
def delete_apg(self):
    request = DeleteAutoProvisioningGroupRequest()
    request.set_accept_format('json')
    request.set_AutoProvisioningGroupId(self.cloud_resources["apg_id"])
    request.set_TerminateInstances(True)
    count = 0
    flag = 0
    while count < 10:
        try:
            response = self.client.do_action_with_exception(request)
            flag = 1
            break
        except ServerException as e:
            time.sleep(10)
            count += 1
    if not flag:
        dlog.info("delete apg failed, exit")
        sys.exit()
def catch_dispatcher_exception(self, ii):
    '''everything is okay: return 0
    ssh not active: return 1
    machine callback: return 2'''
    if self.check_spot_callback(self.dispatcher_list[ii]["entity"].instance_id):
        dlog.info("machine %s callback, ip: %s"
                  % (self.dispatcher_list[ii]["entity"].instance_id,
                     self.dispatcher_list[ii]["entity"].ip))
        return 2
    elif not self.dispatcher_list[ii]["dispatcher"].session._check_alive():
        try:
            self.dispatcher_list[ii]["dispatcher"].session.ensure_alive()
            return 0
        except RuntimeError:
            return 1
    else:
        return 0
def ensure_alive(self, max_check=10, sleep_time=10):
    count = 1
    while not self._check_alive():
        if count == max_check:
            raise RuntimeError('cannot connect ssh after %d failures at interval %d s'
                               % (max_check, sleep_time))
        dlog.info('connection check failed, try to reconnect to ' + self.remote_host)
        self._setup_ssh(hostname=self.remote_host,
                        port=self.remote_port,
                        username=self.remote_uname,
                        password=self.remote_password,
                        key_filename=self.local_key_filename,
                        timeout=self.remote_timeout,
                        passphrase=self.local_key_passphrase)
        count += 1
        time.sleep(sleep_time)
def change_apg_capasity(self, capasity):
    request = ModifyAutoProvisioningGroupRequest()
    request.set_accept_format('json')
    request.set_AutoProvisioningGroupId(self.cloud_resources["apg_id"])
    request.set_TotalTargetCapacity(str(capasity))
    request.set_SpotTargetCapacity(str(capasity))
    request.set_PayAsYouGoTargetCapacity("0")
    count = 0
    flag = 0
    while count < 10:
        try:
            response = self.client.do_action_with_exception(request)
            flag = 1
            break
        except:
            count += 1
            time.sleep(10)
    if not flag:
        dlog.info("change_apg_capasity failed, exit")
        sys.exit()
def try_recover_from_json(self):
    submission_file_name = "{submission_hash}.json".format(
        submission_hash=self.submission_hash)
    if_recover = self.machine.context.check_file_exists(submission_file_name)
    submission = None
    submission_dict = {}
    if if_recover:
        submission_dict_str = self.machine.context.read_file(fname=submission_file_name)
        submission_dict = json.loads(submission_dict_str)
        submission = Submission.deserialize(submission_dict=submission_dict)
        if self == submission:
            self.belonging_jobs = submission.belonging_jobs
            self.bind_machine(machine=self.machine)
            dlog.info(f"Find old submission; recover from json; "
                      f"submission.submission_hash:{submission.submission_hash}; "
                      f"machine.context.remote_root:{self.machine.context.remote_root}; "
                      f"submission.work_base:{submission.work_base};")
            # self = submission.bind_machine(machine=self.machine)
        else:
            print(self.serialize())
            print(submission.serialize())
            raise RuntimeError("Recover failed.")
def handle_unexpected_job_state(self):
    job_state = self.job_state
    if job_state == JobStatus.unknown:
        raise RuntimeError("job_state for job {job} is unknown".format(job=self))
    if job_state == JobStatus.terminated:
        dlog.info(f"job: {self.job_hash} {self.job_id} terminated; restarting job")
        if self.fail_count > 3:
            raise RuntimeError("job {job} failed 3 times".format(job=self))
        self.fail_count += 1
        self.submit_job()
        self.get_job_state()
    if job_state == JobStatus.unsubmitted:
        dlog.info(f"job: {self.job_hash} unsubmitted; submit it")
        if self.fail_count > 3:
            raise RuntimeError("job {job} failed 3 times".format(job=self))
        # self.fail_count += 1
        self.submit_job()
        dlog.info("job: {job_hash} submitted; job_id is {job_id}".format(
            job_hash=self.job_hash, job_id=self.job_id))
def check_status(self, job):
    try:
        job_id = job.job_id
    except AttributeError:
        return JobStatus.terminated
    if job_id == "":
        return JobStatus.unsubmitted
    ret, stdin, stdout, stderr \
        = self.context.block_call("bjobs " + job_id)
    err_str = stderr.read().decode('utf-8')
    if ("Job <%s> is not found" % job_id) in err_str:
        if self.check_finish_tag(job):
            return JobStatus.finished
        else:
            return JobStatus.terminated
    elif ret != 0:
        raise RuntimeError(
            "status command bjobs fails to execute.\n error info: %s \nreturn code %d\n"
            % (err_str, ret))
    status_out = stdout.read().decode('utf-8').split('\n')
    if len(status_out) < 2:
        return JobStatus.unknown
    else:
        status_line = status_out[1]
        status_word = status_line.split()[2]
        # ref: https://www.ibm.com/support/knowledgecenter/en/SSETD4_9.1.2/lsf_command_ref/bjobs.1.html
        if status_word in ["PEND", "WAIT", "PSUSP"]:
            return JobStatus.waiting
        elif status_word in ["RUN", "USUSP"]:
            return JobStatus.running
        elif status_word in ["DONE", "EXIT"]:
            if self.check_finish_tag(job):
                dlog.info(f"job: {job.job_hash} {job.job_id} finished")
                return JobStatus.finished
            else:
                return JobStatus.terminated
        else:
            return JobStatus.unknown
def make_dispatcher(mdata, mdata_resource=None, work_path=None, run_tasks=None,
                    group_size=None):
    if 'cloud_resources' in mdata:
        if mdata['cloud_resources']['cloud_platform'] == 'ali':
            from dpdispatcher.ALI import ALI
            dispatcher = ALI(mdata, mdata_resource, work_path, run_tasks,
                             group_size, mdata['cloud_resources'])
            dispatcher.init()
            return dispatcher
        elif mdata['cloud_resources']['cloud_platform'] == 'ucloud':
            pass
    else:
        hostname = mdata.get('hostname', None)
        # use_uuid = mdata.get('use_uuid', False)
        if hostname:
            context_type = 'ssh'
        else:
            context_type = 'local'
        try:
            batch_type = mdata['batch']
        except KeyError:
            dlog.info('cannot find key "batch" in machine file, '
                      'try to use deprecated key "machine_type"')
            batch_type = mdata['machine_type']
        lazy_local = (mdata.get('lazy-local', False)) or (mdata.get('lazy_local', False))
        if lazy_local and context_type == 'local':
            dlog.info('Dispatcher switches to the lazy local mode')
            context_type = 'lazy-local'
        disp = Dispatcher(mdata, context_type=context_type, batch_type=batch_type)
        return disp
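# --- Hedged usage sketch (assumed machine-file structure, not from the source) ---
# make_dispatcher() chooses the context from the presence of "hostname" and the
# batch backend from "batch" (falling back to the deprecated "machine_type").
example_mdata = {
    "hostname": "login.hpc.example.org",  # omit this key for a local context
    "batch": "slurm",                     # e.g. "slurm", "pbs", "lsf", "shell"
    "lazy_local": False,
}
# dispatcher = make_dispatcher(example_mdata)
# -> Dispatcher(example_mdata, context_type='ssh', batch_type='slurm')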
def submit_jobs(self, resources, command, work_path, tasks, group_size,
                forward_common_files, forward_task_files, backward_task_files,
                forward_task_deference=True, outlog='log', errlog='err'):
    self.backward_task_files = backward_task_files
    # task_chunks = [
    #     [os.path.basename(j) for j in tasks[i:i + group_size]] \
    #     for i in range(0, len(tasks), group_size)
    # ]
    task_chunks = _split_tasks(tasks, group_size)
    task_chunks_str = ['+'.join(ii) for ii in task_chunks]
    task_hashes = [sha1(ii.encode('utf-8')).hexdigest() for ii in task_chunks_str]
    job_record = JobRecord(work_path, task_chunks, fname=self.jrname)
    job_record.dump()
    nchunks = len(task_chunks)
    job_list = []
    for ii in range(nchunks):
        cur_chunk = task_chunks[ii]
        cur_hash = task_hashes[ii]
        if not job_record.check_finished(cur_hash):
            # chunk is not finished
            # check if chunk is submitted
            submitted = job_record.check_submitted(cur_hash)
            if not submitted:
                job_uuid = None
            else:
                job_uuid = job_record.get_uuid(cur_hash)
                dlog.debug("load uuid %s for chunk %s" % (job_uuid, cur_hash))
            # communication context, batch system
            context = self.context(work_path, self.session, job_uuid)
            batch = self.batch(context, uuid_names=self.uuid_names)
            rjob = {'context': context, 'batch': batch}
            # upload files
            if not rjob['context'].check_file_exists(rjob['batch'].upload_tag_name):
                rjob['context'].upload('.', forward_common_files)
                rjob['context'].upload(cur_chunk, forward_task_files,
                                       dereference=forward_task_deference)
                rjob['context'].write_file(rjob['batch'].upload_tag_name, '')
                dlog.debug('uploaded files for %s' % task_chunks_str[ii])
            # submit new or recover old submission
            if not submitted:
                rjob['batch'].submit(cur_chunk, command, res=resources,
                                     outlog=outlog, errlog=errlog)
                job_uuid = rjob['context'].job_uuid
                dlog.debug('assigned uuid %s for %s ' % (job_uuid, task_chunks_str[ii]))
                dlog.info('new submission of %s for chunk %s' % (job_uuid, cur_hash))
            else:
                rjob['batch'].submit(cur_chunk, command, res=resources,
                                     outlog=outlog, errlog=errlog, restart=True)
                dlog.info('restart from old submission %s for chunk %s'
                          % (job_uuid, cur_hash))
            # record job and its remote context
            job_list.append(rjob)
            ip = None
            instance_id = None
            if 'cloud_resources' in self.remote_profile:
                ip = self.remote_profile['hostname']
                instance_id = self.remote_profile['instance_id']
            job_record.record_remote_context(cur_hash, context.local_root,
                                             context.remote_root, job_uuid,
                                             ip, instance_id)
            job_record.dump()
        else:
            # finished job, append a None to list
            job_list.append(None)
    assert len(job_list) == nchunks
    job_handler = {
        'task_chunks': task_chunks,
        'job_list': job_list,
        'job_record': job_record,
        'command': command,
        'resources': resources,
        'outlog': outlog,
        'errlog': errlog,
        'backward_task_files': backward_task_files
    }
    return job_handler
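# --- Hedged illustration (not from the source) ---
# submit_jobs() delegates chunking to _split_tasks(); the commented-out list
# comprehension above suggests the intended grouping, sketched here as a
# standalone function (illustrative only).
import os


def _split_tasks_sketch(tasks, group_size):
    return [
        [os.path.basename(jj) for jj in tasks[ii:ii + group_size]]
        for ii in range(0, len(tasks), group_size)
    ]


# _split_tasks_sketch(["work/task0", "work/task1", "work/task2"], 2)
# -> [["task0", "task1"], ["task2"]]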
def run_submission(self, *, exit_on_submit=False, clean=True):
    """Main method to execute the submission.
    First, check whether an old Submission exists on the remote machine,
    and try to recover from it.
    Second, upload the local files to the remote machine where the tasks
    are to be executed.
    Third, run the submission defined previously.
    Fourth, wait until the tasks in the submission have finished and
    download the result files to the local directory.
    If exit_on_submit is True, the submission will exit right after submitting.
    """
    if not self.belonging_jobs:
        self.generate_jobs()
    self.try_recover_from_json()
    if self.check_all_finished():
        dlog.info('info:check_all_finished: True')
    else:
        dlog.info('info:check_all_finished: False')
        self.upload_jobs()
        self.handle_unexpected_submission_state()
        self.submission_to_json()
    time.sleep(1)
    while not self.check_all_finished():
        if exit_on_submit is True:
            print('<<<<<<dpdispatcher<<<<<<SuccessSubmit<<<<<<exit 0<<<<<<')
            print(f"submission succeeded: {self.submission_hash}")
            print(f"at {self.machine.context.remote_root}")
            print("exit_on_submit")
            print('>>>>>>dpdispatcher>>>>>>SuccessSubmit>>>>>>exit 0>>>>>>')
            return self.serialize()
        try:
            time.sleep(40)
        except KeyboardInterrupt as e:
            self.submission_to_json()
            print('<<<<<<dpdispatcher<<<<<<KeyboardInterrupt<<<<<<exit 1<<<<<<')
            print('submission: ', self.submission_hash)
            print(self.serialize())
            print('>>>>>>dpdispatcher>>>>>>KeyboardInterrupt>>>>>>exit 1>>>>>>')
            exit(1)
        except SystemExit as e:
            self.submission_to_json()
            print('<<<<<<dpdispatcher<<<<<<SystemExit<<<<<<exit 2<<<<<<')
            print('submission: ', self.submission_hash)
            print(self.serialize())
            print('>>>>>>dpdispatcher>>>>>>SystemExit>>>>>>exit 2>>>>>>')
            exit(2)
        except Exception as e:
            self.submission_to_json()
            print('<<<<<<dpdispatcher<<<<<<{e}<<<<<<exit 3<<<<<<'.format(e=e))
            print('submission: ', self.submission_hash)
            print(self.serialize())
            print('>>>>>>dpdispatcher>>>>>>{e}>>>>>>exit 3>>>>>>'.format(e=e))
            exit(3)
        else:
            self.handle_unexpected_submission_state()
        finally:
            pass
    self.handle_unexpected_submission_state()
    self.submission_to_json()
    self.download_jobs()
    if clean:
        self.clean_jobs()
    return self.serialize()
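# --- Hedged illustration (not from the source) ---
# The waiting loop inside run_submission() reduces to the structure below:
# poll, sleep, reconcile job states, and reconcile once more after the loop.
# The function and its parameters are illustrative stand-ins for the real
# Submission methods used above.
import time


def poll_until_finished(check_all_finished, handle_unexpected_state, interval=40):
    while not check_all_finished():
        time.sleep(interval)
        handle_unexpected_state()
    handle_unexpected_state()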
def run_jobs(self, resources, command, work_path, tasks, group_size,
             forward_common_files, forward_task_files, backward_task_files,
             forward_task_deference=True, mark_failure=False,
             outlog='log', errlog='err'):
    ratio_failure = self.mdata_resources.get("ratio_failue", 0)
    while True:
        if self.check_all_dispatchers_finished(ratio_failure):
            self.clean()
            break
        self.exception_handling(ratio_failure)
        jj = self.nchunks - 1
        for ii in range(self.nchunks):
            dispatcher_status = self.check_dispatcher_status(ii)
            if dispatcher_status == "unsubmitted":
                dlog.info(self.dispatcher_list[ii]["entity"].ip)
                self.dispatcher_list[ii]["entity"].job_handler = \
                    self.dispatcher_list[ii]["dispatcher"].submit_jobs(
                        resources, command, work_path, self.task_chunks[ii],
                        group_size, forward_common_files, forward_task_files,
                        backward_task_files, forward_task_deference,
                        outlog, errlog)
                self.dispatcher_list[ii]["entity"].job_record = \
                    self.dispatcher_list[ii]["entity"].job_handler["job_record"]
                self.dispatcher_list[ii]["dispatcher_status"] = "running"
            elif dispatcher_status == "finished" and self.dispatcher_list[ii]["entity"]:
                # no jobs in queue, delete current machine
                # else add current machine to server_pool
                entity = self.dispatcher_list[ii]["entity"]
                status_list = [item["dispatcher_status"]
                               for item in self.dispatcher_list]
                flag = "unallocated" in status_list
                if not flag:
                    self.delete(ii)
                    self.dispatcher_list[ii]["entity"] = None
                else:
                    self.dispatcher_list[ii]["entity"] = None
                    self.server_pool.append(entity.instance_id)
                    self.ip_pool.append(entity.ip)
                    while jj >= ii:
                        if self.dispatcher_list[jj]["dispatcher_status"] == "unallocated":
                            self.create(jj)
                            if self.dispatcher_list[jj]["dispatcher_status"] == "unsubmitted":
                                dlog.info(self.dispatcher_list[jj]["entity"].ip)
                                self.dispatcher_list[jj]["entity"].job_handler = \
                                    self.dispatcher_list[jj]["dispatcher"].submit_jobs(
                                        resources, command, work_path,
                                        self.task_chunks[jj], group_size,
                                        forward_common_files, forward_task_files,
                                        backward_task_files, forward_task_deference,
                                        outlog, errlog)
                                self.dispatcher_list[jj]["entity"].job_record = \
                                    self.dispatcher_list[jj]["entity"].job_handler["job_record"]
                                self.dispatcher_list[jj]["dispatcher_status"] = "running"
                            break
                        jj -= 1
            elif dispatcher_status == "running":
                pass
            elif dispatcher_status == "unallocated":
                # if len(server_pool) > 0: make_dispatcher
                # else: pass
                self.create(ii)
                if self.dispatcher_list[ii]["dispatcher_status"] == "unsubmitted":
                    dlog.info(self.dispatcher_list[ii]["entity"].ip)
                    self.dispatcher_list[ii]["entity"].job_handler = \
                        self.dispatcher_list[ii]["dispatcher"].submit_jobs(
                            resources, command, work_path, self.task_chunks[ii],
                            group_size, forward_common_files, forward_task_files,
                            backward_task_files, forward_task_deference,
                            outlog, errlog)
                    self.dispatcher_list[ii]["entity"].job_record = \
                        self.dispatcher_list[ii]["entity"].job_handler["job_record"]
                    self.dispatcher_list[ii]["dispatcher_status"] = "running"
            elif dispatcher_status == "terminated":
                pass
        self.update()
        time.sleep(10)
def check_finish_tag(self, job):
    job_tag_finished = job.job_hash + '_job_tag_finished'
    dlog.info('check if job finished: %s %s' % (job.job_id, job_tag_finished))
    return self.context.check_file_exists(job_tag_finished)
def download(self, submission, check_exists=False, mark_failure=True, back_error=False):
    cwd = os.getcwd()
    for ii in submission.belonging_tasks:
        # for ii in job_dirs:
        local_job = os.path.join(self.local_root, ii.task_work_path)
        remote_job = os.path.join(self.remote_root, ii.task_work_path)
        # flist = remote_down_files
        # copy the task's backward_files so that appending error files does
        # not mutate the task object
        flist = list(ii.backward_files)
        if back_error:
            os.chdir(remote_job)
            flist += glob('error*')
            os.chdir(cwd)
        for jj in flist:
            rfile = os.path.join(remote_job, jj)
            lfile = os.path.join(local_job, jj)
            if not os.path.realpath(rfile) == os.path.realpath(lfile):
                if (not os.path.exists(rfile)) and (not os.path.exists(lfile)):
                    if check_exists:
                        if mark_failure:
                            tag_file_path = os.path.join(
                                self.local_root, ii.task_work_path,
                                'tag_failure_download_%s' % jj)
                            with open(tag_file_path, 'w') as fp:
                                pass
                        else:
                            pass
                    else:
                        raise RuntimeError('do not find download file ' + rfile)
                elif (not os.path.exists(rfile)) and (os.path.exists(lfile)):
                    # already downloaded
                    pass
                elif (os.path.exists(rfile)) and (not os.path.exists(lfile)):
                    # trivial case, download happily
                    shutil.move(rfile, lfile)
                elif (os.path.exists(rfile)) and (os.path.exists(lfile)):
                    # both exist, replace!
                    dlog.info('find existing %s, replacing by %s' % (lfile, rfile))
                    if os.path.isdir(lfile):
                        shutil.rmtree(lfile, ignore_errors=True)
                    elif os.path.isfile(lfile) or os.path.islink(lfile):
                        os.remove(lfile)
                    shutil.copyfile(rfile, lfile)
                    # shutil.move(rfile, lfile)
                else:
                    raise RuntimeError('should not reach here!')
            else:
                # do nothing in the case of linked files
                pass
    os.chdir(cwd)
    # the same logic is applied to the submission-level common files
    local_job = self.local_root
    remote_job = self.remote_root
    flist = list(submission.backward_common_files)
    if back_error:
        os.chdir(remote_job)
        flist += glob('error*')
        os.chdir(cwd)
    for jj in flist:
        rfile = os.path.join(remote_job, jj)
        lfile = os.path.join(local_job, jj)
        if not os.path.realpath(rfile) == os.path.realpath(lfile):
            if (not os.path.exists(rfile)) and (not os.path.exists(lfile)):
                if check_exists:
                    if mark_failure:
                        with open(os.path.join(self.local_root,
                                               'tag_failure_download_%s' % jj),
                                  'w') as fp:
                            pass
                    else:
                        pass
                else:
                    raise RuntimeError('do not find download file ' + rfile)
            elif (not os.path.exists(rfile)) and (os.path.exists(lfile)):
                # already downloaded
                pass
            elif (os.path.exists(rfile)) and (not os.path.exists(lfile)):
                # trivial case, download happily
                shutil.move(rfile, lfile)
            elif (os.path.exists(rfile)) and (os.path.exists(lfile)):
                dlog.info(f"both exist rfile:{rfile}; lfile:{lfile}")
                # both exist, replace!
                dlog.info('find existing %s, replacing by %s' % (lfile, rfile))
                if os.path.isdir(lfile):
                    shutil.rmtree(lfile, ignore_errors=True)
                elif os.path.isfile(lfile) or os.path.islink(lfile):
                    os.remove(lfile)
                shutil.copyfile(rfile, lfile)
                # shutil.move(rfile, lfile)
            else:
                raise RuntimeError('should not reach here!')
        else:
            # do nothing in the case of linked files
            pass
    os.chdir(cwd)