def login(username, password):
    global token
    ret = post(
        '/account/login',
        {"username": username, "password": password}
    )
    dlog.debug(f"debug: login ret:{ret}")
    token = ret['token']

def get_job_state(self):
    """Get the state of the job.

    Usually, this method queries the database of the Slurm or PBS job
    scheduler system and records the result.

    Notes
    -----
    This method will not submit or resubmit the job if the job is
    unsubmitted.
    """
    dlog.debug(f"debug:query database; self.job_hash:{self.job_hash}; self.job_id:{self.job_id}")
    job_state = self.machine.check_status(self)
    self.job_state = job_state

def get_submission_state(self):
    """Check the state of all the jobs in the submission.

    Notes
    -----
    This method will not handle unexpected job states (for example,
    resubmitting terminated jobs) in the submission.
    """
    for job in self.belonging_jobs:
        job.get_job_state()
        dlog.debug(f"debug:get_submission_state: job: {job.job_hash}, {job.job_id}, {repr(job.job_state)}")

def sub_script(self, job_dirs, cmd, args, res, outlog, errlog):
    if args is None:
        args = []
    multi_command = ""
    for job_dir in job_dirs:
        for idx, t in enumerate(zip_longest(cmd, args, fillvalue='')):
            c_str = f"cd {self.context.remote_root}/{job_dir} && ( test -f tag_{idx}_finished || ( ({t[0]} {t[1]} && touch tag_{idx}_finished 2>>{errlog} || exit 52 ) | tee -a {outlog}) ) || exit 51;"
            multi_command += c_str
    multi_command += "exit 0;"
    dlog.debug("10000, %s" % multi_command)
    return multi_command

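# Illustration (not part of the dispatcher code): a minimal sketch of the
# per-task shell fragment that sub_script above builds, reproduced for a single
# hypothetical task directory. The remote root, command, and log names below
# are placeholders.
from itertools import zip_longest

def _example_task_fragment():
    remote_root = "/home/user/remote"      # hypothetical remote working directory
    job_dir, outlog, errlog = "task.000", "log", "err"
    cmd, args = ["lmp -i in.lammps"], []   # one command, no extra argument
    fragment = ""
    for idx, t in enumerate(zip_longest(cmd, args, fillvalue='')):
        # tag_<idx>_finished lets a restarted job skip commands that already
        # completed; exit codes 52/51 distinguish command failure from wrapper failure
        fragment += (
            f"cd {remote_root}/{job_dir} && ( test -f tag_{idx}_finished || "
            f"( ({t[0]} {t[1]} && touch tag_{idx}_finished 2>>{errlog} || exit 52 ) "
            f"| tee -a {outlog}) ) || exit 51;"
        )
    return fragment + "exit 0;"
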
def _get_oss_bucket(endpoint, bucket_name):
    # res = get("/tools/sts_token", {})
    res = get("/data/get_sts_token", {})
    dlog.debug(f"debug: _get_oss_bucket: res:{res}")
    auth = oss2.StsAuth(
        res['AccessKeyId'],
        res['AccessKeySecret'],
        res['SecurityToken']
    )
    return oss2.Bucket(auth, endpoint, bucket_name)

def job_id(self, values):
    response, jobQueue = values
    self._job_id = response['jobId']
    self._job_name = response['jobName']
    self.__class__._jobQueue = jobQueue
    self.__class__._job_id_map_status[self._job_id] = self.map_aws_status_to_dpdisp_status(
        response.get('status', 'SUBMITTED'))
    self.context.write_file(self.job_id_name, self._job_id)
    dlog.debug("15000, _job_id:%s, _job_name:%s, _map:%s, _Queue:%s" % (
        self._job_id, self._job_name,
        self.__class__._job_id_map_status, self.__class__._jobQueue))

def check_status(self, job):
    if job.job_id == '':
        return JobStatus.unsubmitted
    dlog.debug(
        f"debug: check_status; job.job_id:{job.job_id}; job.job_hash:{job.job_hash}"
    )
    try:
        dp_job_status = api.get_tasks(job.job_id)[0]["status"]
    except IndexError as e:
        raise RuntimeError(
            f"cannot find job information in dpcloudserver's database for job {job.job_id}"
        ) from e
    job_state = self.map_dp_job_state(dp_job_status)
    return job_state

def __init__(self, local_root, work_profile, job_uuid=None):
    """
    work_profile:
    local_root:
    """
    assert (type(local_root) == str)
    self.temp_local_root = os.path.abspath(local_root)
    self.temp_remote_root = os.path.abspath(work_profile.get_work_root())
    self.work_profile = work_profile
    self.job_uuid = job_uuid
    self.submission = None
    # if job_uuid:
    #     self.job_uuid = job_uuid
    # else:
    #     self.job_uuid = str(uuid.uuid4())
    # self.remote_root = os.path.join(work_profile.get_work_root(), self.job_uuid)
    dlog.debug("local_root is %s" % local_root)

def __init__(self, local_root, work_profile, job_uuid=None):
    """
    work_profile:
    local_root:
    """
    assert (type(local_root) == str)
    self.local_root = os.path.abspath(local_root)
    if job_uuid:
        self.job_uuid = job_uuid
    else:
        self.job_uuid = str(uuid.uuid4())
    self.remote_root = os.path.join(work_profile.get_work_root(), self.job_uuid)
    dlog.debug("local_root is %s" % local_root)
    dlog.debug("remote_root is %s" % self.remote_root)
    os.makedirs(self.remote_root, exist_ok=True)

def upload(oss_task_zip, zip_task_file, endpoint, bucket_name):
    dlog.debug(f"debug: upload: oss_task_zip:{oss_task_zip}; zip_task_file:{zip_task_file}")
    bucket = _get_oss_bucket(endpoint, bucket_name)
    total_size = os.path.getsize(zip_task_file)
    part_size = determine_part_size(total_size, preferred_size=1000 * 1024)
    upload_id = bucket.init_multipart_upload(oss_task_zip).upload_id
    parts = []
    with open(zip_task_file, 'rb') as fileobj:
        part_number = 1
        offset = 0
        while offset < total_size:
            num_to_upload = min(part_size, total_size - offset)
            result = bucket.upload_part(oss_task_zip, upload_id, part_number,
                                        SizedFileAdapter(fileobj, num_to_upload))
            parts.append(PartInfo(part_number, result.etag))
            offset += num_to_upload
            part_number += 1
    result = bucket.complete_multipart_upload(oss_task_zip, upload_id, parts)
    # print('debug:upload_result:', result, dir())
    return result

def job_id(self):
    try:
        self._job_id
    except AttributeError:
        if self.context.check_file_exists(self.job_id_name):
            self._job_id = self.context.read_file(self.job_id_name)
            response_list = batch_client.describe_jobs(
                jobs=[self._job_id]).get('jobs')
            try:
                response = response_list[0]
                jobQueue = response['jobQueue']
            except IndexError:
                pass
            else:
                self.job_id = (response, jobQueue)
            return self._job_id
        # _job_id may still be unset here (the job id file does not exist),
        # so avoid referencing self._job_id directly in the debug message
        dlog.debug("50000, self._job_id:%s,_Queue:%s,_map:%s," % (
            getattr(self, '_job_id', ''), self.__class__._jobQueue,
            self.__class__._job_id_map_status))
        return ""
    return self._job_id

def AWS_check_status(cls, job_id=""):
    """
    To avoid querying jobStatus too often, set a time interval.

    query_dict example:
        {job_id: JobStatus}
        {'40fb24b2-d0ca-4443-8e3a-c0906ea03622': <JobStatus.running: 3>,
         '41bda50c-0a23-4372-806c-87d16a680d85': <JobStatus.waiting: 2>}
    """
    query_dict = {}
    if datetime.now().timestamp() > cls._query_next_allow_time:
        cls._query_next_allow_time = datetime.now().timestamp() + cls._query_time_interval
        for status in ['SUBMITTED', 'PENDING', 'RUNNABLE', 'STARTING',
                       'RUNNING', 'SUCCEEDED', 'FAILED']:
            nextToken = ''
            while nextToken is not None:
                status_response = batch_client.list_jobs(
                    jobQueue=cls._jobQueue,
                    jobStatus=status,
                    maxResults=100,
                    nextToken=nextToken)
                status_list = status_response.get('jobSummaryList')
                nextToken = status_response.get('nextToken', None)
                for job_dict in status_list:
                    cls._job_id_map_status.update({
                        job_dict['jobId']: cls.map_aws_status_to_dpdisp_status(
                            job_dict['status'])
                    })
        dlog.debug('20000:_map: %s' % (cls._job_id_map_status))
    dlog.debug('62000:job_id:%s, _query: %s, _map: %s' % (
        job_id, query_dict, cls._job_id_map_status))
    if job_id:
        return cls._job_id_map_status.get(job_id)
    return cls._job_id_map_status

def all_finished(self, job_handler, mark_failure, clean=True):
    task_chunks = job_handler['task_chunks']
    task_chunks_str = ['+'.join(ii) for ii in task_chunks]
    task_hashes = [sha1(ii.encode('utf-8')).hexdigest() for ii in task_chunks_str]
    job_list = job_handler['job_list']
    job_record = job_handler['job_record']
    command = job_handler['command']
    tag_failure_list = ['tag_failure_%d' % ii for ii in range(len(command))]
    resources = job_handler['resources']
    outlog = job_handler['outlog']
    errlog = job_handler['errlog']
    backward_task_files = job_handler['backward_task_files']
    dlog.debug('checking jobs')
    nchunks = len(task_chunks)
    for idx in range(nchunks):
        cur_hash = task_hashes[idx]
        rjob = job_list[idx]
        if not job_record.check_finished(cur_hash):
            # chunk not finished according to record
            status = rjob['batch'].check_status()
            job_uuid = rjob['context'].job_uuid
            dlog.debug('checked job %s' % job_uuid)
            if status == JobStatus.terminated:
                job_record.increase_nfail(cur_hash)
                if job_record.check_nfail(cur_hash) > 3:
                    raise RuntimeError('Job %s failed for more than 3 times' % job_uuid)
                dlog.info('job %s terminated, submit again' % job_uuid)
                dlog.debug('try %s times for %s' % (job_record.check_nfail(cur_hash), job_uuid))
                rjob['batch'].submit(task_chunks[idx], command, res=resources,
                                     outlog=outlog, errlog=errlog, restart=True)
            elif status == JobStatus.finished:
                dlog.info('job %s finished' % job_uuid)
                if mark_failure:
                    rjob['context'].download(task_chunks[idx], tag_failure_list,
                                             check_exists=True, mark_failure=False)
                    rjob['context'].download(task_chunks[idx], backward_task_files,
                                             check_exists=True)
                else:
                    rjob['context'].download(task_chunks[idx], backward_task_files)
                if clean:
                    rjob['context'].clean()
                job_record.record_finish(cur_hash)
                job_record.dump()
    job_record.dump()
    return job_record.check_all_finished()

def do_submit(self, job_dirs, cmd, args=None, res=None, outlog='log', errlog='err'):
    res = self.default_resources(res)
    dlog.debug("2000, params=(%s, %s, %s, %s, %s, %s, )" % (
        job_dirs, cmd, args, res, outlog, errlog))
    dlog.debug('2200, self.context.remote_root: %s , self.context.local_root: %s' % (
        self.context.remote_root, self.context.local_root))
    # concreate_command =
    script_str = self.sub_script(job_dirs, cmd, args=args, res=res, outlog=outlog, errlog=errlog)
    dlog.debug('2300, script_str: %s, self.sub_script_name: %s' % (
        script_str, self.sub_script_name))
    # jobName example:
    # home-ec2-user-Ag_init-run_gen-iter_000000-01_model_devi-task_000_000048
    jobName = os.path.join(self.context.remote_root,
                           job_dirs.pop())[1:].replace('/', '-').replace('.', '_')
    jobName += ("_" + str(self.context.job_uuid))
    response = batch_client.submit_job(
        jobName=jobName,
        jobQueue=res['jobQueue'],
        jobDefinition=res['jobDefinition'],
        parameters={'task_command': script_str},
        containerOverrides={
            'vcpus': res['cpu_num'],
            'memory': res['memory_size']
        })
    dlog.debug('4000, response:%s' % response)
    self.job_id = (response, res['jobQueue'])

def submit_jobs(self, resources, command, work_path, tasks, group_size,
                forward_common_files, forward_task_files, backward_task_files,
                forward_task_deference=True, outlog='log', errlog='err'):
    self.backward_task_files = backward_task_files
    # task_chunks = [
    #     [os.path.basename(j) for j in tasks[i:i + group_size]]
    #     for i in range(0, len(tasks), group_size)
    # ]
    task_chunks = _split_tasks(tasks, group_size)
    task_chunks_str = ['+'.join(ii) for ii in task_chunks]
    task_hashes = [sha1(ii.encode('utf-8')).hexdigest() for ii in task_chunks_str]
    job_record = JobRecord(work_path, task_chunks, fname=self.jrname)
    job_record.dump()
    nchunks = len(task_chunks)
    job_list = []
    for ii in range(nchunks):
        cur_chunk = task_chunks[ii]
        cur_hash = task_hashes[ii]
        if not job_record.check_finished(cur_hash):
            # chunk is not finished
            # check if chunk is submitted
            submitted = job_record.check_submitted(cur_hash)
            if not submitted:
                job_uuid = None
            else:
                job_uuid = job_record.get_uuid(cur_hash)
                dlog.debug("load uuid %s for chunk %s" % (job_uuid, cur_hash))
            # communication context, batch system
            context = self.context(work_path, self.session, job_uuid)
            batch = self.batch(context, uuid_names=self.uuid_names)
            rjob = {'context': context, 'batch': batch}
            # upload files
            if not rjob['context'].check_file_exists(rjob['batch'].upload_tag_name):
                rjob['context'].upload('.', forward_common_files)
                rjob['context'].upload(cur_chunk, forward_task_files,
                                       dereference=forward_task_deference)
                rjob['context'].write_file(rjob['batch'].upload_tag_name, '')
                dlog.debug('uploaded files for %s' % task_chunks_str[ii])
            # submit new or recover old submission
            if not submitted:
                rjob['batch'].submit(cur_chunk, command, res=resources,
                                     outlog=outlog, errlog=errlog)
                job_uuid = rjob['context'].job_uuid
                dlog.debug('assigned uuid %s for %s ' % (job_uuid, task_chunks_str[ii]))
                dlog.info('new submission of %s for chunk %s' % (job_uuid, cur_hash))
            else:
                rjob['batch'].submit(cur_chunk, command, res=resources,
                                     outlog=outlog, errlog=errlog, restart=True)
                dlog.info('restart from old submission %s for chunk %s' % (job_uuid, cur_hash))
            # record job and its remote context
            job_list.append(rjob)
            ip = None
            instance_id = None
            if 'cloud_resources' in self.remote_profile:
                ip = self.remote_profile['hostname']
                instance_id = self.remote_profile['instance_id']
            job_record.record_remote_context(cur_hash,
                                             context.local_root,
                                             context.remote_root,
                                             job_uuid,
                                             ip,
                                             instance_id)
            job_record.dump()
        else:
            # finished job, append a None to list
            job_list.append(None)
    assert (len(job_list) == nchunks)
    job_handler = {
        'task_chunks': task_chunks,
        'job_list': job_list,
        'job_record': job_record,
        'command': command,
        'resources': resources,
        'outlog': outlog,
        'errlog': errlog,
        'backward_task_files': backward_task_files
    }
    return job_handler

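# A minimal driver sketch, assuming a dispatcher-like object that exposes the
# submit_jobs/all_finished methods shown above (the argument names mirror their
# signatures; the 60-second polling interval is an arbitrary choice).
import time

def run_jobs_sketch(dispatcher, resources, command, work_path, tasks, group_size,
                    forward_common_files, forward_task_files, backward_task_files):
    # submit (or recover) one job per task chunk and keep the bookkeeping handle
    job_handler = dispatcher.submit_jobs(resources, command, work_path, tasks,
                                         group_size, forward_common_files,
                                         forward_task_files, backward_task_files)
    # poll until every chunk has been downloaded and recorded as finished
    while not dispatcher.all_finished(job_handler, mark_failure=False):
        time.sleep(60)
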
def submit(self, job_dirs, cmd, args=None, res=None, restart=False, outlog='log', errlog='err'):
    if restart:
        dlog.debug('restart task')
        status = self.check_status()
        if status in [JobStatus.unsubmitted, JobStatus.unknown, JobStatus.terminated]:
            dlog.debug('task restart point !!!')
            self.do_submit(job_dirs, cmd, args, res, outlog=outlog, errlog=errlog)
        elif status == JobStatus.waiting:
            dlog.debug('task is waiting')
        elif status == JobStatus.running:
            dlog.debug('task is running')
        elif status == JobStatus.finished:
            dlog.debug('task is finished')
        else:
            raise RuntimeError('unknown job status, must be wrong')
    else:
        dlog.debug('new task')
        self.do_submit(job_dirs, cmd, args, res, outlog=outlog, errlog=errlog)
    if res is None:
        sleep = 0
    else:
        sleep = res.get('submit_wait_time', 0)
    time.sleep(sleep)  # prevent tasks from crashing while submitting

def download(oss_file, save_file, endpoint, bucket_name):
    bucket = _get_oss_bucket(endpoint, bucket_name)
    dlog.debug(f"debug: download: oss_file:{oss_file}; save_file:{save_file}")
    bucket.get_object_to_file(oss_file, save_file)
    return save_file

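# A minimal usage sketch for the helpers above (login, upload, download). The
# credentials, endpoint, bucket name, object keys, and local paths are
# placeholders, not real values.
if __name__ == "__main__":
    login("my_username", "my_password")                    # caches the API token used by post()/get()
    upload(
        oss_task_zip="dpcloudserver/example/task_0.zip",   # object key on OSS (hypothetical)
        zip_task_file="/tmp/task_0.zip",                   # local archive to upload in parts
        endpoint="https://oss-cn-shenzhen.aliyuncs.com",   # example region endpoint
        bucket_name="my-bucket",
    )
    download(
        oss_file="dpcloudserver/example/result_0.zip",     # object key to fetch (hypothetical)
        save_file="/tmp/result_0.zip",
        endpoint="https://oss-cn-shenzhen.aliyuncs.com",
        bucket_name="my-bucket",
    )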