import argparse
import importlib
import json
import os
import sys
import time
from concurrent.futures import as_completed

# Framework-internal names used below (schedule_logger, job_utils, Tracking, Task,
# TaskStatus, TaskExecutor, TaskScheduler, DAGScheduler, RuntimeConfig, storage,
# federation, file_utils, log_utils, current_timestamp, get_lan_ip) are assumed to
# be imported from the surrounding fate_flow / arch modules.


def check_task_status(job_id, component, interval=1):
    task_id = job_utils.generate_task_id(job_id=job_id, component_name=component.get_name())
    while True:
        try:
            # Collect the status of this task on every (role, party) it runs on.
            status_collect = set()
            parameters = component.get_role_parameters()
            for _role, _partys_parameters in parameters.items():
                for _party_parameters in _partys_parameters:
                    _party_id = _party_parameters.get('local', {}).get('party_id')
                    tasks = job_utils.query_task(job_id=job_id, task_id=task_id, role=_role, party_id=_party_id)
                    if tasks:
                        task_status = tasks[0].f_status
                    else:
                        task_status = 'notRunning'
                    schedule_logger.info('job {} component {} run on {} {} status is {}'.format(
                        job_id, component.get_name(), _role, _party_id, task_status))
                    status_collect.add(task_status)
            # Fail fast if any party failed; succeed only when all parties succeeded.
            if 'failed' in status_collect:
                return False
            elif len(status_collect) == 1 and 'success' in status_collect:
                return True
            else:
                time.sleep(interval)
        except Exception as e:
            schedule_logger.exception(e)
            return False
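
# A minimal, self-contained sketch of the polling pattern check_task_status()
# implements: aggregate every party's status into a set, fail fast on any
# 'failed', and return success only once all parties report 'success'. The
# query_status callable is a hypothetical stand-in for job_utils.query_task().
def poll_until_done(parties, query_status, interval=1, max_polls=None):
    polls = 0
    while max_polls is None or polls < max_polls:
        statuses = {query_status(party) for party in parties}
        if 'failed' in statuses:
            return False
        if statuses == {'success'}:
            return True
        time.sleep(interval)
        polls += 1
    return False

# Usage sketch (all parties already done):
#   poll_until_done(['guest-9999', 'host-10000'], lambda p: 'success')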
def start_task(job_id, component_name, task_id, role, party_id, task_config):
    schedule_logger.info('job {} {} {} {} task subprocess is ready'.format(job_id, component_name, role, party_id))
    task_process_start_status = False
    try:
        # Persist the task config so the subprocess can load it by path.
        task_dir = os.path.join(job_utils.get_job_directory(job_id=job_id), role, party_id, component_name)
        os.makedirs(task_dir, exist_ok=True)
        task_config_path = os.path.join(task_dir, 'task_config.json')
        with open(task_config_path, 'w') as fw:
            json.dump(task_config, fw)
        # Launch the task executor module in a separate python3 process.
        process_cmd = [
            'python3', sys.modules[TaskExecutor.__module__].__file__,
            '-j', job_id,
            '-n', component_name,
            '-t', task_id,
            '-r', role,
            '-p', party_id,
            '-c', task_config_path,
        ]
        task_log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job_id), role, party_id, component_name)
        schedule_logger.info('job {} {} {} {} task subprocess start'.format(job_id, component_name, role, party_id))
        p = job_utils.run_subprocess(config_dir=task_dir, process_cmd=process_cmd, log_dir=task_log_dir)
        if p:
            task_process_start_status = True
    except Exception as e:
        schedule_logger.exception(e)
    finally:
        schedule_logger.info('job {} component {} on {} {} start task subprocess {}'.format(
            job_id, component_name, role, party_id, 'success' if task_process_start_status else 'failed'))
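
# A minimal sketch of the launch pattern start_task() relies on: write the task
# config to disk, then hand the child process only the config *path* on its
# command line, keeping argv short and the config inspectable. Plain
# subprocess.Popen is used here; job_utils.run_subprocess() wraps a similar
# call with the scheduler's own config-dir and log-dir conventions.
import subprocess

def launch_worker(script_path, work_dir, config):
    os.makedirs(work_dir, exist_ok=True)
    config_path = os.path.join(work_dir, 'task_config.json')
    with open(config_path, 'w') as fw:
        json.dump(config, fw)
    log_path = os.path.join(work_dir, 'worker.log')
    with open(log_path, 'a') as log:
        # Redirect stdout/stderr to the log file, as run_subprocess() does with log_dir.
        return subprocess.Popen(['python3', script_path, '-c', config_path],
                                stdout=log, stderr=log)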
def clean_job(job_id, role, party_id):
    schedule_logger.info('job {} on {} {} start to clean'.format(job_id, role, party_id))
    tasks = job_utils.query_task(job_id=job_id, role=role, party_id=party_id)
    for task in tasks:
        try:
            Tracking(job_id=job_id, role=role, party_id=party_id, task_id=task.f_task_id).clean_task()
            schedule_logger.info('job {} component {} on {} {} clean done'.format(
                job_id, task.f_component_name, role, party_id))
        except Exception as e:
            schedule_logger.info('job {} component {} on {} {} clean failed'.format(
                job_id, task.f_component_name, role, party_id))
            schedule_logger.exception(e)
    schedule_logger.info('job {} on {} {} clean done'.format(job_id, role, party_id))
def run(self):
    if not self.queue.is_ready():
        schedule_logger.error('queue is not ready')
        return False
    all_jobs = []
    while True:
        try:
            if len(all_jobs) == self.concurrent_num:
                # At max concurrency: block until one future completes,
                # then free its slot before taking the next event.
                for future in as_completed(all_jobs):
                    all_jobs.remove(future)
                    break
            job_event = self.queue.get_event()
            schedule_logger.info('schedule job {}'.format(job_event))
            future = self.job_executor_pool.submit(DAGScheduler.handle_event, job_event)
            future.add_done_callback(DAGScheduler.get_result)
            all_jobs.append(future)
        except Exception as e:
            schedule_logger.exception(e)
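
# A self-contained sketch of the bounded-concurrency loop in run(): submit work
# to a pool, and once concurrent_num futures are in flight, block on
# as_completed() until one finishes before pulling the next event. The events
# iterable and handle callable are hypothetical stand-ins for the job queue
# and DAGScheduler.handle_event.
from concurrent.futures import ThreadPoolExecutor

def drain_events(events, handle, concurrent_num=2):
    in_flight = []
    with ThreadPoolExecutor(max_workers=concurrent_num) as pool:
        for event in events:
            if len(in_flight) == concurrent_num:
                # Wait for the first completed future, then reuse its slot.
                for future in as_completed(in_flight):
                    in_flight.remove(future)
                    break
            in_flight.append(pool.submit(handle, event))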
def kill_job(job_id, role, party_id, job_initiator):
    schedule_logger.info('{} {} get kill job {} command'.format(role, party_id, job_id))
    tasks = job_utils.query_task(job_id=job_id, role=role, party_id=party_id)
    for task in tasks:
        kill_status = False
        try:
            kill_status = job_utils.kill_process(int(task.f_run_pid))
        except Exception as e:
            schedule_logger.exception(e)
        finally:
            schedule_logger.info('job {} component {} on {} {} process {} kill {}'.format(
                job_id, task.f_component_name, task.f_role, task.f_party_id, task.f_run_pid,
                'success' if kill_status else 'failed'))
        # A task that already succeeded keeps its status; everything else is marked failed.
        if task.f_status != TaskStatus.SUCCESS:
            task.f_status = TaskStatus.FAILED
        TaskExecutor.sync_task_status(job_id=job_id, component_name=task.f_component_name,
                                      task_id=task.f_task_id, role=role, party_id=party_id,
                                      initiator_party_id=job_initiator.get('party_id', None),
                                      task_info=task.to_json())
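
# A minimal sketch of the kill-and-report pattern in kill_job(): attempt the
# kill, never let a failure escape the loop, and report the outcome either
# way. os.kill with SIGKILL (Unix-only) is a simplified stand-in for
# job_utils.kill_process().
import signal

def kill_pid(pid):
    try:
        os.kill(pid, signal.SIGKILL)
        return True
    except (ProcessLookupError, PermissionError):
        # Process already gone, or owned by another user.
        return False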
def run_task():
    task = Task()
    task.f_create_time = current_timestamp()
    try:
        # Parse the command line built by start_task().
        parser = argparse.ArgumentParser()
        parser.add_argument('-j', '--job_id', required=True, type=str, help="job id")
        parser.add_argument('-n', '--component_name', required=True, type=str, help="component name")
        parser.add_argument('-t', '--task_id', required=True, type=str, help="task id")
        parser.add_argument('-r', '--role', required=True, type=str, help="role")
        parser.add_argument('-p', '--party_id', required=True, type=str, help="party id")
        parser.add_argument('-c', '--config', required=True, type=str, help="task config")
        parser.add_argument('--job_server', help="job server", type=str)
        args = parser.parse_args()
        schedule_logger.info('enter task process')
        schedule_logger.info(args)
        # init function args
        if args.job_server:
            RuntimeConfig.init_config(HTTP_PORT=args.job_server.split(':')[1])
        job_id = args.job_id
        component_name = args.component_name
        task_id = args.task_id
        role = args.role
        party_id = int(args.party_id)
        task_config = file_utils.load_json_conf(args.config)
        job_parameters = task_config['job_parameters']
        job_initiator = task_config['job_initiator']
        job_args = task_config['job_args']
        task_input_dsl = task_config['input']
        task_output_dsl = task_config['output']
        parameters = task_config['parameters']
        module_name = task_config['module_name']
    except Exception as e:
        schedule_logger.exception(e)
        task.f_status = TaskStatus.FAILED
        return
    try:
        # init environment, process is shared globally
        RuntimeConfig.init_config(WORK_MODE=job_parameters['work_mode'])
        storage.init_storage(job_id=task_id, work_mode=RuntimeConfig.WORK_MODE)
        federation.init(job_id=task_id, runtime_conf=parameters)
        job_log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job_id), role, str(party_id))
        task_log_dir = os.path.join(job_log_dir, component_name)
        log_utils.LoggerFactory.set_directory(directory=task_log_dir, parent_log_dir=job_log_dir,
                                              append_to_parent_log=True, force=True)
        task.f_job_id = job_id
        task.f_component_name = component_name
        task.f_task_id = task_id
        task.f_role = role
        task.f_party_id = party_id
        task.f_operator = 'python_operator'
        tracker = Tracking(job_id=job_id, role=role, party_id=party_id, component_name=component_name,
                           task_id=task_id, model_id=job_parameters['model_id'],
                           model_version=job_parameters['model_version'], module_name=module_name)
        task.f_start_time = current_timestamp()
        task.f_run_ip = get_lan_ip()
        task.f_run_pid = os.getpid()
        # Resolve the component class from its slash-separated CodePath:
        # everything up to the .py segment becomes the module path, the last
        # segment is the class name.
        run_class_paths = parameters.get('CodePath').split('/')
        run_class_package = '.'.join(run_class_paths[:-2]) + '.' + run_class_paths[-2].replace('.py', '')
        run_class_name = run_class_paths[-1]
        task_run_args = TaskExecutor.get_task_run_args(job_id=job_id, role=role, party_id=party_id,
                                                       job_parameters=job_parameters, job_args=job_args,
                                                       input_dsl=task_input_dsl)
        run_object = getattr(importlib.import_module(run_class_package), run_class_name)()
        run_object.set_tracker(tracker=tracker)
        run_object.set_taskid(taskid=task_id)
        task.f_status = TaskStatus.RUNNING
        TaskExecutor.sync_task_status(job_id=job_id, component_name=component_name, task_id=task_id,
                                      role=role, party_id=party_id,
                                      initiator_party_id=job_initiator.get('party_id', None),
                                      task_info=task.to_json())
        schedule_logger.info('run {} {} {} {} {} task'.format(job_id, component_name, task_id, role, party_id))
        schedule_logger.info(parameters)
        schedule_logger.info(task_input_dsl)
        run_object.run(parameters, task_run_args)
        if task_output_dsl:
            if task_output_dsl.get('data', []):
                output_data = run_object.save_data()
                tracker.save_output_data_table(output_data, task_output_dsl.get('data')[0])
            if task_output_dsl.get('model', []):
                output_model = run_object.export_model()
                # There is only one model output at the current dsl version.
                tracker.save_output_model(output_model, task_output_dsl['model'][0])
        task.f_status = TaskStatus.SUCCESS
    except Exception as e:
        schedule_logger.exception(e)
        task.f_status = TaskStatus.FAILED
    finally:
        try:
            task.f_end_time = current_timestamp()
            task.f_elapsed = task.f_end_time - task.f_start_time
            task.f_update_time = current_timestamp()
            TaskExecutor.sync_task_status(job_id=job_id, component_name=component_name, task_id=task_id,
                                          role=role, party_id=party_id,
                                          initiator_party_id=job_initiator.get('party_id', None),
                                          task_info=task.to_json())
        except Exception as e:
            schedule_logger.exception(e)
        schedule_logger.info('finish {} {} {} {} {} {} task'.format(
            job_id, component_name, task_id, role, party_id, task.f_status))
        print('finish {} {} {} {} {} {} task'.format(job_id, component_name, task_id, role, party_id, task.f_status))
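
# A self-contained sketch of the CodePath resolution used in run_task(): a
# string like "federatedml/some_package/some_module.py/SomeComponent" is split
# into a module path and a class name, imported with importlib, and
# instantiated. The example path and class name here are illustrative only,
# not a real CodePath from the repository.
def load_component(code_path):
    parts = code_path.split('/')
    module_path = '.'.join(parts[:-2]) + '.' + parts[-2].replace('.py', '')
    class_name = parts[-1]
    return getattr(importlib.import_module(module_path), class_name)()

# e.g. load_component('federatedml/some_package/some_module.py/SomeComponent')
# imports federatedml.some_package.some_module and instantiates SomeComponent.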
def handle_event(job_event):
    try:
        return TaskScheduler.run_job(**job_event)
    except Exception as e:
        schedule_logger.exception(e)
        return False