def resource_for_task(cls, task_info, operation_type):
    """Apply or return one task's share of resources on its parent Job row.

    :param task_info: dict with at least job_id/role/party_id/task_id/task_version.
    :param operation_type: ResourceOperation value (apply or return).
    :return: True on success (including the no-op case where the task needs
             neither cores nor memory), False if the conditional UPDATE
             matched no row.
    """
    cores_per_task, memory_per_task = cls.calculate_task_resource(
        task_info=task_info)

    if not cores_per_task and not memory_per_task:
        # Task requests no resources at all: nothing to record, report success.
        operate_status = True
    else:
        filters, updates = cls.update_resource_sql(
            resource_model=Job,
            cores=cores_per_task,
            memory=memory_per_task,
            operation_type=operation_type,
        )
        # Narrow the UPDATE to this exact job party, and only while the job
        # still holds its resources (f_resource_in_use is set).
        filters.extend([
            Job.f_job_id == task_info["job_id"],
            Job.f_role == task_info["role"],
            Job.f_party_id == task_info["party_id"],
            Job.f_resource_in_use == True,
        ])
        # A row count of zero means the guarded update did not apply.
        operate_status = Job.update(updates).where(*filters).execute() > 0

    logger = schedule_logger(job_id=task_info["job_id"])
    if operate_status:
        logger.info(
            "task {} {} {} resource successfully".format(
                task_info["task_id"], task_info["task_version"], operation_type))
    else:
        logger.warning(
            "task {} {} {} resource failed".format(
                task_info["task_id"], task_info["task_version"], operation_type))
    return operate_status
def resource_for_job(cls, job_id, role, party_id, operation_type):
    """Atomically apply or return a job's computing resources.

    Performs two guarded UPDATEs inside one DB transaction:
      1. record the operation on the Job row (with f_resource_in_use acting
         as the apply/return guard), then
      2. adjust the engine-wide counters on EngineRegistry.
    If either UPDATE matches no row, the transaction is rolled back via the
    raised RuntimeError and the method reports failure.

    :param job_id: job identifier.
    :param role: party role of this job.
    :param party_id: party identifier.
    :param operation_type: ResourceOperation.APPLY or ResourceOperation.RETURN.
    :return: True if both updates succeeded, False otherwise.
    """
    operate_status = False
    engine_name, cores, memory = cls.calculate_job_resource(job_id=job_id, role=role, party_id=party_id)
    try:
        # Both updates must land together; DB.atomic() rolls back on raise.
        with DB.atomic():
            updates = {
                Job.f_engine_type: EngineType.COMPUTING,
                Job.f_engine_name: engine_name,
                Job.f_cores: cores,
                Job.f_memory: memory,
            }
            filters = [
                Job.f_job_id == job_id,
                Job.f_role == role,
                Job.f_party_id == party_id,
            ]
            if operation_type == ResourceOperation.APPLY:
                # Apply: initialize remaining counters and flip the in-use
                # guard; only succeeds if resources are not already held.
                updates[Job.f_remaining_cores] = cores
                updates[Job.f_remaining_memory] = memory
                updates[Job.f_resource_in_use] = True
                updates[Job.f_apply_resource_time] = base_utils.current_timestamp()
                filters.append(Job.f_resource_in_use == False)
            elif operation_type == ResourceOperation.RETURN:
                # Return: clear the guard; only succeeds if currently held.
                updates[Job.f_resource_in_use] = False
                updates[Job.f_return_resource_time] = base_utils.current_timestamp()
                filters.append(Job.f_resource_in_use == True)
            # NOTE(review): for any other operation_type neither timestamp nor
            # guard filter is added and the base updates still run — confirm
            # callers only ever pass APPLY/RETURN.
            operate = Job.update(updates).where(*filters)
            record_status = operate.execute() > 0
            if not record_status:
                raise RuntimeError(f"record job {job_id} resource {operation_type} failed on {role} {party_id}")
            # Second leg: adjust the shared engine counters.
            filters, updates = cls.update_resource_sql(resource_model=EngineRegistry,
                                                       cores=cores,
                                                       memory=memory,
                                                       operation_type=operation_type,
                                                       )
            filters.append(EngineRegistry.f_engine_type == EngineType.COMPUTING)
            filters.append(EngineRegistry.f_engine_name == engine_name)
            operate = EngineRegistry.update(updates).where(*filters)
            apply_status = operate.execute() > 0
            if not apply_status:
                raise RuntimeError(
                    f"{operation_type} resource from engine {engine_name} for job {job_id} resource {operation_type} failed on {role} {party_id}")
            operate_status = True
    except Exception as e:
        # Failure (including the deliberate RuntimeErrors above) is logged,
        # not propagated; the method reports via its boolean return.
        schedule_logger(job_id=job_id).warning(e)
        schedule_logger(job_id=job_id).warning(
            f"{operation_type} job {job_id} resource(cores {cores} memory {memory}) on {role} {party_id} failed")
        operate_status = False
    finally:
        # NOTE(review): `return` inside `finally` suppresses any in-flight
        # exception (e.g. KeyboardInterrupt) — intentional here per the
        # boolean-status contract, but worth confirming.
        remaining_cores, remaining_memory = cls.get_remaining_resource(EngineRegistry, [
            EngineRegistry.f_engine_type == EngineType.COMPUTING, EngineRegistry.f_engine_name == engine_name])
        operate_msg = "successfully" if operate_status else "failed"
        schedule_logger(job_id=job_id).info(
            f"{operation_type} job {job_id} resource(cores {cores} memory {memory}) on {role} {party_id} {operate_msg}, remaining cores: {remaining_cores} remaining memory: {remaining_memory}")
        return operate_status
def end_scheduling_updates(cls, job_id):
    """Increment the job's end-status scheduling counter, capped at
    END_STATUS_JOB_SCHEDULING_UPDATES.

    The `<` filter makes the increment a no-op once the cap is reached, so
    the boolean return doubles as "was another scheduling update allowed".

    NOTE(review): a second definition of this same name appears later in
    this file (reading the cap from JobDefaultConfig instead of this
    module-level constant). If both live in the same scope the later one
    shadows this one — confirm which version is intended and remove the
    other.

    :param job_id: job identifier.
    :return: True if a row was updated (counter was below the cap).
    """
    operate = Job.update({
        Job.f_end_scheduling_updates: Job.f_end_scheduling_updates + 1
    }).where(
        Job.f_job_id == job_id,
        Job.f_end_scheduling_updates < END_STATUS_JOB_SCHEDULING_UPDATES)
    update_status = operate.execute() > 0
    return update_status
def end_scheduling_updates(cls, job_id):
    """Bump the job's end-status scheduling counter, up to the configured cap.

    The WHERE guard (`< cap`) means the UPDATE matches no row once the cap
    has been reached, so the return value indicates whether another
    scheduling update was still permitted.

    :param job_id: job identifier.
    :return: True if the counter was incremented, False if already at cap.
    """
    cap = JobDefaultConfig.end_status_job_scheduling_updates
    query = Job.update(
        {Job.f_end_scheduling_updates: Job.f_end_scheduling_updates + 1}
    ).where(
        Job.f_job_id == job_id,
        Job.f_end_scheduling_updates < cap,
    )
    return query.execute() > 0
def rerun_signal(cls, job_id, set_or_reset: bool):
    """Set or clear the rerun signal on a job.

    Setting the signal also clears any pending cancel signal and resets the
    end-status scheduling counter so the job becomes schedulable again;
    clearing only drops the rerun flag.

    :param job_id: job identifier.
    :param set_or_reset: True to set the signal, False to clear it.
    :raises RuntimeError: if set_or_reset is not exactly a bool.
    :return: True if the job row was updated.
    """
    if set_or_reset is True:
        update_fields = {Job.f_rerun_signal: True,
                         Job.f_cancel_signal: False,
                         Job.f_end_scheduling_updates: 0}
    elif set_or_reset is False:
        update_fields = {Job.f_rerun_signal: False}
    else:
        # Identity checks above deliberately reject truthy/falsy non-bools.
        # Fixed typo in the message: "rereun" -> "rerun".
        raise RuntimeError(f"can not support rerun signal {set_or_reset}")
    update_status = Job.update(update_fields).where(Job.f_job_id == job_id).execute() > 0
    return update_status
def ready_signal(cls, job_id, set_or_reset: bool, ready_timeout_ttl=None):
    """Flip the job's ready signal with a compare-and-set style UPDATE.

    Setting requires the signal to currently be unset (and vice versa), so
    concurrent schedulers cannot both claim the transition. When
    ready_timeout_ttl is given, the update additionally requires the
    existing ready time to be older than the ttl.

    :param job_id: job identifier.
    :param set_or_reset: True to set the ready signal, False to clear it.
    :param ready_timeout_ttl: optional ttl (ms) guard on f_ready_time.
    :return: True if the guarded update matched a row.
    """
    conditions = [Job.f_job_id == job_id]
    if set_or_reset:
        changes = {Job.f_ready_signal: True,
                   Job.f_ready_time: current_timestamp()}
        conditions.append(Job.f_ready_signal == False)
    else:
        changes = {Job.f_ready_signal: False,
                   Job.f_ready_time: None}
        conditions.append(Job.f_ready_signal == True)
    if ready_timeout_ttl:
        conditions.append(
            current_timestamp() - Job.f_ready_time > ready_timeout_ttl)
    affected = Job.update(changes).where(*conditions).execute()
    return affected > 0
def cancel_signal(cls, job_id, set_or_reset: bool):
    """Set or clear the cancel signal on a job.

    NOTE(review): f_cancel_time is stamped on both set and reset — confirm
    that overwriting the timestamp on reset is intended.

    :param job_id: job identifier.
    :param set_or_reset: True to raise the cancel signal, False to clear it.
    :return: True if the job row was updated.
    """
    changes = {
        Job.f_cancel_signal: set_or_reset,
        Job.f_cancel_time: current_timestamp(),
    }
    affected = Job.update(changes).where(Job.f_job_id == job_id).execute()
    return affected > 0