Example #1
    def run_task(job_id, component_name, task_id, role, party_id, task_config):
        schedule_logger(job_id).info(
            'job {} {} {} {} task subprocess is ready'.format(job_id, component_name, role, party_id))
        task_process_start_status = False
        try:
            task_dir = os.path.join(job_utils.get_job_directory(job_id=job_id), role, party_id, component_name)
            os.makedirs(task_dir, exist_ok=True)
            task_config_path = os.path.join(task_dir, 'task_config.json')
            with open(task_config_path, 'w') as fw:
                json.dump(task_config, fw)

            try:
                backend = task_config['job_parameters']['backend']
            except KeyError:
                backend = 0
                schedule_logger(job_id).warning("failed to get backend, set as 0")

            backend = Backend(backend)

            if backend.is_eggroll():
                process_cmd = [
                    'python3', sys.modules[TaskExecutor.__module__].__file__,
                    '-j', job_id,
                    '-n', component_name,
                    '-t', task_id,
                    '-r', role,
                    '-p', party_id,
                    '-c', task_config_path,
                    '--processors_per_node', str(task_config['job_parameters'].get("processors_per_node", 0)),
                    '--job_server', '{}:{}'.format(get_lan_ip(), HTTP_PORT),
                ]
            elif backend.is_spark():
                if "SPARK_HOME" not in os.environ:
                    raise EnvironmentError("SPARK_HOME not found")
                spark_home = os.environ["SPARK_HOME"]

                # additional configs
                spark_submit_config = task_config['job_parameters'].get("spark_submit_config", dict())

                deploy_mode = spark_submit_config.get("deploy-mode", "client")
                if deploy_mode not in ["client"]:
                    raise ValueError(f"deploy mode {deploy_mode} not supported")

                spark_submit_cmd = os.path.join(spark_home, "bin/spark-submit")
                process_cmd = [spark_submit_cmd, f'--name={task_id}#{role}']
                for k, v in spark_submit_config.items():
                    if k != "conf":
                        process_cmd.append(f'--{k}={v}')
                if "conf" in spark_submit_config:
                    for ck, cv in spark_submit_config["conf"].items():
                        process_cmd.extend(['--conf', f'{ck}={cv}'])
                process_cmd.extend([
                    sys.modules[TaskExecutor.__module__].__file__,
                    '-j', job_id,
                    '-n', component_name,
                    '-t', task_id,
                    '-r', role,
                    '-p', party_id,
                    '-c', task_config_path,
                    '--job_server',
                    '{}:{}'.format(get_lan_ip(), HTTP_PORT),
                ])
            else:
                raise ValueError(f"${backend} supported")

            task_log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job_id), role, party_id, component_name)
            schedule_logger(job_id).info(
                'job {} {} {} {} task subprocess start'.format(job_id, component_name, role, party_id))
            p = job_utils.run_subprocess(config_dir=task_dir, process_cmd=process_cmd, log_dir=task_log_dir)
            if p:
                task_process_start_status = True
        except Exception as e:
            schedule_logger(job_id).exception(e)
        finally:
            schedule_logger(job_id).info(
                'job {} component {} on {} {} start task subprocess {}'.format(job_id, component_name, role, party_id,
                                                                               'success' if task_process_start_status else 'failed'))
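
For reference, here is a minimal self-contained sketch of what the spark-submit command list above resolves to for a sample spark_submit_config. The config values, task id and role are placeholders invented for this illustration; only the assembly logic (non-"conf" keys become --key=value options, "conf" entries become repeated --conf pairs) mirrors the snippet.

    import os

    # Hypothetical sample config; keys follow spark-submit option names,
    # values are placeholders chosen for this illustration only.
    spark_submit_config = {
        "deploy-mode": "client",
        "num-executors": 2,
        "executor-memory": "1g",
        "conf": {"spark.driver.maxResultSize": "2g"},
    }

    spark_home = os.environ.get("SPARK_HOME", "/opt/spark")  # assumed default for this demo
    task_id = "202101011200_task_0"   # placeholder task id
    role = "guest"                    # placeholder role

    process_cmd = [os.path.join(spark_home, "bin/spark-submit"), f'--name={task_id}#{role}']
    # Every top-level key except "conf" becomes a --key=value option.
    for k, v in spark_submit_config.items():
        if k != "conf":
            process_cmd.append(f'--{k}={v}')
    # Entries under "conf" become repeated "--conf key=value" pairs.
    for ck, cv in spark_submit_config.get("conf", {}).items():
        process_cmd.extend(['--conf', f'{ck}={cv}'])

    print(process_cmd)
    # ['/opt/spark/bin/spark-submit', '--name=202101011200_task_0#guest',
    #  '--deploy-mode=client', '--num-executors=2', '--executor-memory=1g',
    #  '--conf', 'spark.driver.maxResultSize=2g']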
Example #2
    def start_task(job_id, component_name, task_id, role, party_id,
                   task_config):
        schedule_logger(job_id).info(
            'job {} {} {} {} task subprocess is ready'.format(
                job_id, component_name, role, party_id))
        task_process_start_status = False
        try:
            task_dir = os.path.join(job_utils.get_job_directory(job_id=job_id),
                                    role, party_id, component_name)
            os.makedirs(task_dir, exist_ok=True)
            task_config_path = os.path.join(task_dir, 'task_config.json')
            with open(task_config_path, 'w') as fw:
                json.dump(task_config, fw)

            try:
                backend = task_config['job_parameters']['backend']
            except KeyError:
                backend = 0
                schedule_logger(job_id).warning(
                    "failed to get backend, set as 0")

            backend = Backend(backend)

            if backend.is_eggroll() or backend.is_eggroll2():
                process_cmd = [
                    'python3',
                    sys.modules[TaskExecutor.__module__].__file__,
                    '-j',
                    job_id,
                    '-n',
                    component_name,
                    '-t',
                    task_id,
                    '-r',
                    role,
                    '-p',
                    party_id,
                    '-c',
                    task_config_path,
                    '--job_server',
                    '{}:{}'.format(task_config['job_server']['ip'],
                                   task_config['job_server']['http_port']),
                ]
            elif backend.is_spark():
                if "SPARK_HOME" not in os.environ:
                    raise EnvironmentError("SPARK_HOME not found")
                spark_submit_config = task_config['job_parameters'].get(
                    "spark_submit_config", dict())
                deploy_mode = spark_submit_config.get("deploy-mode", "client")
                queue = spark_submit_config.get("queue", "default")
                driver_memory = spark_submit_config.get("driver-memory", "1g")
                num_executors = spark_submit_config.get("num-executors", 2)
                executor_memory = spark_submit_config.get(
                    "executor-memory", "1g")
                executor_cores = spark_submit_config.get("executor-cores", 1)

                if deploy_mode not in ["client"]:
                    raise ValueError(
                        f"deploy mode {deploy_mode} not supported")
                spark_home = os.environ["SPARK_HOME"]
                spark_submit_cmd = os.path.join(spark_home, "bin/spark-submit")
                process_cmd = [
                    spark_submit_cmd,
                    f'--name={task_id}#{role}',
                    f'--deploy-mode={deploy_mode}',
                    f'--queue={queue}',
                    f'--driver-memory={driver_memory}',
                    f'--num-executors={num_executors}',
                    f'--executor-memory={executor_memory}',
                    f'--executor-cores={executor_cores}',
                    sys.modules[TaskExecutor.__module__].__file__,
                    '-j',
                    job_id,
                    '-n',
                    component_name,
                    '-t',
                    task_id,
                    '-r',
                    role,
                    '-p',
                    party_id,
                    '-c',
                    task_config_path,
                    '--job_server',
                    '{}:{}'.format(task_config['job_server']['ip'],
                                   task_config['job_server']['http_port']),
                ]
            else:
                raise ValueError(f"${backend} supported")

            task_log_dir = os.path.join(
                job_utils.get_job_log_directory(job_id=job_id), role, party_id,
                component_name)
            schedule_logger(job_id).info(
                'job {} {} {} {} task subprocess start'.format(
                    job_id, component_name, role, party_id))
            p = job_utils.run_subprocess(config_dir=task_dir,
                                         process_cmd=process_cmd,
                                         log_dir=task_log_dir)
            if p:
                task_process_start_status = True
        except Exception as e:
            schedule_logger(job_id).exception(e)
        finally:
            schedule_logger(job_id).info(
                'job {} component {} on {} {} start task subprocess {}'.format(
                    job_id, component_name, role, party_id,
                    'success' if task_process_start_status else 'failed'))
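
Both examples branch on a Backend helper built from the integer backend code in job_parameters. The real class lives elsewhere in FATE's codebase; the sketch below is only an assumed minimal equivalent (the integer values and the EGGROLL2 member are guesses) showing the is_eggroll()/is_spark() style interface the snippets depend on.

    from enum import IntEnum

    class Backend(IntEnum):
        # Assumed mapping; the actual values are defined in FATE's source.
        EGGROLL = 0
        SPARK = 1
        EGGROLL2 = 2

        def is_eggroll(self):
            return self == Backend.EGGROLL

        def is_eggroll2(self):
            return self == Backend.EGGROLL2

        def is_spark(self):
            return self == Backend.SPARK

    # Usage mirroring the snippets: the backend code defaults to 0 when missing.
    backend = Backend(0)
    assert backend.is_eggroll() and not backend.is_spark()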