import json
import os
import sys

# NOTE: schedule_logger, HTTP_PORT, get_lan_ip, job_utils, Backend and TaskExecutor are
# provided by the surrounding fate_flow package; their imports are omitted here.


def run_task(job_id, component_name, task_id, role, party_id, task_config):
    schedule_logger(job_id).info(
        'job {} {} {} {} task subprocess is ready'.format(job_id, component_name, role, party_id))
    task_process_start_status = False
    try:
        # Persist the task config so the subprocess can read it from disk.
        task_dir = os.path.join(job_utils.get_job_directory(job_id=job_id), role, party_id, component_name)
        os.makedirs(task_dir, exist_ok=True)
        task_config_path = os.path.join(task_dir, 'task_config.json')
        with open(task_config_path, 'w') as fw:
            json.dump(task_config, fw)

        try:
            backend = task_config['job_parameters']['backend']
        except KeyError:
            backend = 0
            schedule_logger(job_id).warning("failed to get backend, set as 0")
        backend = Backend(backend)

        if backend.is_eggroll():
            # Run the task executor module directly as a python3 subprocess.
            process_cmd = [
                'python3', sys.modules[TaskExecutor.__module__].__file__,
                '-j', job_id,
                '-n', component_name,
                '-t', task_id,
                '-r', role,
                '-p', party_id,
                '-c', task_config_path,
                '--processors_per_node', str(task_config['job_parameters'].get("processors_per_node", 0)),
                '--job_server', '{}:{}'.format(get_lan_ip(), HTTP_PORT),
            ]
        elif backend.is_spark():
            if "SPARK_HOME" not in os.environ:
                raise EnvironmentError("SPARK_HOME not found")
            spark_home = os.environ["SPARK_HOME"]

            # Additional spark-submit configs supplied by the job parameters.
            spark_submit_config = task_config['job_parameters'].get("spark_submit_config", dict())

            deploy_mode = spark_submit_config.get("deploy-mode", "client")
            if deploy_mode not in ["client"]:
                raise ValueError(f"deploy mode {deploy_mode} not supported")

            spark_submit_cmd = os.path.join(spark_home, "bin/spark-submit")
            process_cmd = [spark_submit_cmd, f'--name={task_id}#{role}']
            # Forward every top-level key except "conf" as a --<key>=<value> option.
            for k, v in spark_submit_config.items():
                if k != "conf":
                    process_cmd.append(f'--{k}={v}')
            # Expand "conf" entries into repeated "--conf key=value" pairs.
            if "conf" in spark_submit_config:
                for ck, cv in spark_submit_config["conf"].items():
                    process_cmd.append('--conf')
                    process_cmd.append(f'{ck}={cv}')
            process_cmd.extend([
                sys.modules[TaskExecutor.__module__].__file__,
                '-j', job_id,
                '-n', component_name,
                '-t', task_id,
                '-r', role,
                '-p', party_id,
                '-c', task_config_path,
                '--job_server', '{}:{}'.format(get_lan_ip(), HTTP_PORT),
            ])
        else:
            raise ValueError(f"backend {backend} not supported")

        task_log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job_id), role, party_id, component_name)
        schedule_logger(job_id).info(
            'job {} {} {} {} task subprocess start'.format(job_id, component_name, role, party_id))
        p = job_utils.run_subprocess(config_dir=task_dir, process_cmd=process_cmd, log_dir=task_log_dir)
        if p:
            task_process_start_status = True
    except Exception as e:
        schedule_logger(job_id).exception(e)
    finally:
        schedule_logger(job_id).info(
            'job {} component {} on {} {} start task subprocess {}'.format(
                job_id, component_name, role, party_id,
                'success' if task_process_start_status else 'failed'))
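
# Illustrative sketch, not part of the source: one possible "spark_submit_config"
# block for the Spark branch of run_task above. Every top-level key except "conf"
# is forwarded verbatim as a --<key>=<value> option, and each entry under "conf"
# becomes a separate "--conf key=value" pair. The keys and values here are assumed
# example settings, not defaults shipped with the code.
_EXAMPLE_RUN_TASK_SPARK_SUBMIT_CONFIG = {
    "deploy-mode": "client",        # anything other than "client" is rejected
    "num-executors": 4,             # forwarded as --num-executors=4
    "executor-memory": "2g",        # forwarded as --executor-memory=2g
    "conf": {                       # expanded into repeated --conf arguments
        "spark.driver.memory": "1g",
        "spark.yarn.queue": "default",
    },
}
# With this config, run_task would build roughly:
#   $SPARK_HOME/bin/spark-submit --name=<task_id>#<role> --deploy-mode=client \
#       --num-executors=4 --executor-memory=2g \
#       --conf spark.driver.memory=1g --conf spark.yarn.queue=default \
#       <task executor module file> -j <job_id> ... --job_server <lan_ip>:<HTTP_PORT>
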
def start_task(job_id, component_name, task_id, role, party_id, task_config):
    schedule_logger(job_id).info(
        'job {} {} {} {} task subprocess is ready'.format(job_id, component_name, role, party_id))
    task_process_start_status = False
    try:
        # Persist the task config so the subprocess can read it from disk.
        task_dir = os.path.join(job_utils.get_job_directory(job_id=job_id), role, party_id, component_name)
        os.makedirs(task_dir, exist_ok=True)
        task_config_path = os.path.join(task_dir, 'task_config.json')
        with open(task_config_path, 'w') as fw:
            json.dump(task_config, fw)

        try:
            backend = task_config['job_parameters']['backend']
        except KeyError:
            backend = 0
            schedule_logger(job_id).warning("failed to get backend, set as 0")
        backend = Backend(backend)

        if backend.is_eggroll() or backend.is_eggroll2():
            # Run the task executor module directly as a python3 subprocess; the job
            # server address comes from the task config instead of the local LAN IP.
            process_cmd = [
                'python3', sys.modules[TaskExecutor.__module__].__file__,
                '-j', job_id,
                '-n', component_name,
                '-t', task_id,
                '-r', role,
                '-p', party_id,
                '-c', task_config_path,
                '--job_server', '{}:{}'.format(task_config['job_server']['ip'],
                                               task_config['job_server']['http_port']),
            ]
        elif backend.is_spark():
            if "SPARK_HOME" not in os.environ:
                raise EnvironmentError("SPARK_HOME not found")

            # Only a fixed set of spark-submit options is honoured here.
            spark_submit_config = task_config['job_parameters'].get("spark_submit_config", dict())
            deploy_mode = spark_submit_config.get("deploy-mode", "client")
            queue = spark_submit_config.get("queue", "default")
            driver_memory = spark_submit_config.get("driver-memory", "1g")
            num_executors = spark_submit_config.get("num-executors", 2)
            executor_memory = spark_submit_config.get("executor-memory", "1g")
            executor_cores = spark_submit_config.get("executor-cores", 1)
            if deploy_mode not in ["client"]:
                raise ValueError(f"deploy mode {deploy_mode} not supported")

            spark_home = os.environ["SPARK_HOME"]
            spark_submit_cmd = os.path.join(spark_home, "bin/spark-submit")
            process_cmd = [
                spark_submit_cmd,
                f'--name={task_id}#{role}',
                f'--deploy-mode={deploy_mode}',
                f'--queue={queue}',
                f'--driver-memory={driver_memory}',
                f'--num-executors={num_executors}',
                f'--executor-memory={executor_memory}',
                f'--executor-cores={executor_cores}',
                sys.modules[TaskExecutor.__module__].__file__,
                '-j', job_id,
                '-n', component_name,
                '-t', task_id,
                '-r', role,
                '-p', party_id,
                '-c', task_config_path,
                '--job_server', '{}:{}'.format(task_config['job_server']['ip'],
                                               task_config['job_server']['http_port']),
            ]
        else:
            raise ValueError(f"backend {backend} not supported")

        task_log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job_id), role, party_id, component_name)
        schedule_logger(job_id).info(
            'job {} {} {} {} task subprocess start'.format(job_id, component_name, role, party_id))
        p = job_utils.run_subprocess(config_dir=task_dir, process_cmd=process_cmd, log_dir=task_log_dir)
        if p:
            task_process_start_status = True
    except Exception as e:
        schedule_logger(job_id).exception(e)
    finally:
        schedule_logger(job_id).info(
            'job {} component {} on {} {} start task subprocess {}'.format(
                job_id, component_name, role, party_id,
                'success' if task_process_start_status else 'failed'))
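
# Illustrative sketch, not part of the source: a minimal task_config shape that
# start_task above relies on. Unlike run_task, start_task does not call get_lan_ip();
# the callback address comes from task_config['job_server'], and only the fixed set
# of spark-submit keys read in the Spark branch is honoured (code defaults noted in
# the comments). All concrete values below are assumptions for illustration only.
_EXAMPLE_START_TASK_CONFIG = {
    "job_parameters": {
        "backend": 1,                    # assumed Backend value for Spark; 0 is the fallback when the key is missing
        "spark_submit_config": {
            "deploy-mode": "client",     # only "client" is accepted (default "client")
            "queue": "default",          # --queue (default "default")
            "driver-memory": "1g",       # --driver-memory (default "1g")
            "num-executors": 2,          # --num-executors (default 2)
            "executor-memory": "1g",     # --executor-memory (default "1g")
            "executor-cores": 1,         # --executor-cores (default 1)
        },
    },
    "job_server": {
        "ip": "127.0.0.1",               # passed to the executor as --job_server ip:port
        "http_port": 9380,               # assumed example port
    },
}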