Example #1
0
def _submit_and_await_termination(
    client: skein.Client,
    spec: skein.ApplicationSpec,
    tasks: typing.List[str],
    poll_every_secs: int = 10
):
    """Submit the application and block until it reaches a terminal state.

    A background thread aggregates the events each task posts to the
    application's kv store; the collected events are logged as a run
    summary once the application finishes.

    Raises:
        RunFailed: if the application ends with a "failed" final status.
    """
    app = client.submit_and_connect(spec)
    # One (initially empty) event dict per task, filled in by the listener.
    events: typing.Dict[str, typing.Dict[str, str]] = \
        {task: {} for task in tasks}
    listener = Thread(target=_aggregate_events, args=(app.kv, events))
    listener.start()
    with _shutdown_on_exception(app):
        previous_state = None
        while True:
            report = client.application_report(app.id)
            logger.info(
                f"Application report for {app.id} (state: {report.state})")
            if report.state != previous_state:
                # Log the full report only on state transitions.
                logger.info(_format_app_report(report))

            if report.final_status != "undefined":
                # Terminal state reached: drain the event thread, summarize.
                listener.join()
                logger.info(_format_run_summary(events))
                if report.final_status == "failed":
                    raise RunFailed
                break

            time.sleep(poll_every_secs)
            previous_state = report.state
Example #2
0
def _setup_skein_cluster(pyenvs: Dict[NodeLabel, PythonEnvDescription],
                         task_specs: Dict[str, TaskSpec] = TASK_SPEC_NONE,
                         *,
                         skein_client: skein.Client = None,
                         files: Dict[str, str] = None,
                         env: Dict[str, str] = None,
                         queue: str = "default",
                         acls: ACLs = None,
                         file_systems: List[str] = None,
                         log_conf_file: str = None,
                         standalone_client_mode: bool = False) -> SkeinCluster:
    """Build, submit and connect to a skein application for the task specs.

    Creates one skein Service per task type, submits the resulting
    ApplicationSpec and starts a background thread that aggregates the
    events every task posts to the application's kv store.

    Returns:
        A SkeinCluster wrapping the client, the connected application,
        the (task_type, instances) pairs, the cluster spec, the
        event-listener thread and the shared events dict.
    """
    # BUG FIX: `env` previously defaulted to a mutable `{}` shared across
    # calls; normalize None to a fresh dict per call instead.
    env = {} if env is None else env

    # Keep the YARN-side JVMs lean; prepend so pre-existing options still win.
    os.environ["JAVA_TOOL_OPTIONS"] = \
        "-XX:ParallelGCThreads=1 -XX:CICompilerCount=2 "\
        f"{os.environ.get('JAVA_TOOL_OPTIONS', '')}"

    with tempfile.TemporaryDirectory() as tempdir:
        task_files, task_env = _setup_task_env(tempdir, files, env)
        services = {}
        for task_type, task_spec in task_specs.items():
            pyenv = pyenvs[task_spec.label]
            service_env = task_env.copy()
            # A negative timeout means "no termination timeout" — skip the var.
            if task_spec.termination_timeout_seconds >= 0:
                _add_to_env(service_env, "SERVICE_TERMINATION_TIMEOUT_SECONDS",
                            str(task_spec.termination_timeout_seconds))
            services[task_type] = skein.Service(
                script=gen_task_cmd(pyenv, log_conf_file),
                resources=skein.model.Resources(task_spec.memory,
                                                task_spec.vcores),
                max_restarts=0,
                instances=task_spec.instances,
                node_label=task_spec.label.value,
                files={
                    **task_files, pyenv.dest_path: pyenv.path_to_archive
                },
                env=service_env)

        spec = skein.ApplicationSpec(services,
                                     queue=queue,
                                     acls=acls,
                                     file_systems=file_systems)

        if skein_client is None:
            skein_client = skein.Client()

        # Renamed the comprehension variable (was `spec`) to avoid shadowing
        # the ApplicationSpec above.
        task_instances = [(task_type, ts.instances)
                          for task_type, ts in task_specs.items()]
        events: Dict[str, Dict[str, str]] = \
            {task: {} for task in iter_tasks(task_instances)}
        app = skein_client.submit_and_connect(spec)
        # Start a thread which collects all events posted by all tasks in kv store
        event_listener = Thread(target=_aggregate_events,
                                args=(app.kv, events))
        event_listener.start()

        cluster_spec = _setup_cluster_tasks(task_instances, app,
                                            standalone_client_mode)

        return SkeinCluster(skein_client, app, task_instances, cluster_spec,
                            event_listener, events)
Example #3
0
def wait_for_finished(client: skein.Client, app_id: str):
    """Poll YARN every 3 seconds until the application reaches a final status."""
    logger.info(f"application_id: {app_id}")
    finished = False
    while not finished:
        report = client.application_report(app_id)
        logger.info(report)

        # "undefined" means the application is still running.
        finished = report.final_status != "undefined"
        if finished:
            logger.info(report.final_status)
        else:
            time.sleep(3)
Example #4
0
def _get_app_logs(
        client: skein.Client, app: skein.ApplicationClient,
        wait_for_nb_logs: int) -> Optional[skein.model.ApplicationLogs]:
    """Fetch the YARN logs of *app*, retrying until all expected files exist.

    Polls up to YARN_LOG_TRIES times, 3 seconds apart, and returns the logs
    once exactly `wait_for_nb_logs` log files are available. Returns None if
    the logs never become complete within the retry budget.
    """
    for ind in range(YARN_LOG_TRIES):
        try:
            logs = client.application_logs(app.id)
            nb_keys = len(logs.keys())
            logger.info(f"Got {nb_keys}/{wait_for_nb_logs} log files")
            if nb_keys == wait_for_nb_logs:
                return logs
        except Exception:
            # FIX: logger.warn is a deprecated alias — use logger.warning.
            logger.warning(
                f"Cannot collect logs (attempt {ind+1}/{YARN_LOG_TRIES})",
                exc_info=True)
        time.sleep(3)
    return None
Example #5
0
def get_application_logs(
        client: skein.Client,
        app_id: str,
        wait_for_nb_logs: Optional[int] = None,
        log_tries: int = 15) -> Optional[skein.model.ApplicationLogs]:
    """Fetch the YARN logs for *app_id*, retrying until enough files exist.

    Polls up to `log_tries` times, 3 seconds apart. If `wait_for_nb_logs` is
    None (or 0), the first successful fetch is returned; otherwise fetching
    is retried until that many log files are present. Returns None when no
    attempt succeeds within the retry budget.
    """
    for ind in range(log_tries):
        try:
            logs = client.application_logs(app_id)
            nb_keys = len(logs.keys())
            logger.info(f"Got {nb_keys}/{wait_for_nb_logs} log files")
            if not wait_for_nb_logs or nb_keys == wait_for_nb_logs:
                return logs
        except Exception:
            # FIX: include the traceback (exc_info) so transient YARN errors
            # are diagnosable, matching the sibling _get_app_logs helper.
            logger.warning(
                f"Cannot collect logs (attempt {ind+1}/{log_tries})",
                exc_info=True)
        time.sleep(3)
    return None
Example #6
0
def _setup_skein_cluster(
        pyenvs: Dict[topologies.NodeLabel, _env.PythonEnvDescription],
        task_specs: Dict[str, topologies.TaskSpec] = TASK_SPEC_NONE,
        *,
        custom_task_module: Optional[str] = None,
        skein_client: skein.Client = None,
        files: Dict[str, str] = None,
        env: Dict[str, str] = None,
        queue: str = "default",
        acls: ACLs = None,
        file_systems: List[str] = None,
        name: str = "RunOnYarn",
        n_try: int = 0,
        pre_script_hook: Optional[str] = None
) -> SkeinCluster:
    """Build, submit and connect to a skein application for the task specs.

    Creates one skein Service per task type (with TensorBoard-related
    environment variables where the spec requests them), submits the
    resulting ApplicationSpec and starts a background thread that aggregates
    the events every task posts to the application's kv store.

    Returns:
        A SkeinCluster wrapping the client, the connected application, the
        (task_type, instances) pairs, the event-listener thread and the
        shared events dict.
    """
    # BUG FIX: `env` previously defaulted to a mutable `{}` shared across
    # calls; normalize None to a fresh dict per call instead.
    env = {} if env is None else env

    # Keep the YARN-side JVMs lean; prepend so pre-existing options still win.
    os.environ["JAVA_TOOL_OPTIONS"] = \
        "-XX:ParallelGCThreads=1 -XX:CICompilerCount=2 "\
        f"{os.environ.get('JAVA_TOOL_OPTIONS', '')}"

    pre_script_hook = pre_script_hook if pre_script_hook else ""
    with tempfile.TemporaryDirectory() as tempdir:
        task_files, task_env = _setup_task_env(tempdir, files, env, n_try)
        services = {}
        for task_type, task_spec in task_specs.items():
            pyenv = pyenvs[task_spec.label]
            service_env = task_env.copy()
            # A negative timeout means "no TensorBoard termination timeout".
            if task_spec.tb_termination_timeout_seconds >= 0:
                service_env["TB_TERMINATION_TIMEOUT_SECONDS"] = \
                    str(task_spec.tb_termination_timeout_seconds)
            if task_spec.tb_model_dir:
                service_env["TB_MODEL_DIR"] = str(task_spec.tb_model_dir)
            if task_spec.tb_extra_args:
                service_env["TB_EXTRA_ARGS"] = str(task_spec.tb_extra_args)

            services[task_type] = skein.Service(
                script=f'''
                            set -x
                            {pre_script_hook}
                            {_env.gen_task_cmd(
                                pyenv,
                                task_type,
                                custom_task_module)}
                        ''',
                resources=skein.model.Resources(task_spec.memory, task_spec.vcores),
                max_restarts=0,
                instances=task_spec.instances,
                node_label=task_spec.label.value,
                files={
                    **task_files,
                    pyenv.dest_path: pyenv.path_to_archive
                },
                env=service_env)

        # on the cluster we don't ask again for delegation tokens
        if "HADOOP_TOKEN_FILE_LOCATION" in os.environ:
            file_systems = None

        spec = skein.ApplicationSpec(
            services,
            queue=queue,
            acls=acls,
            file_systems=file_systems,
            name=name
        )

        if skein_client is None:
            skein_client = skein.Client()

        # Renamed the comprehension variable (was `spec`) to avoid shadowing
        # the ApplicationSpec above.
        task_instances = [(task_type, ts.instances)
                          for task_type, ts in task_specs.items()]
        events: Dict[str, Dict[str, str]] = \
            {task: {} for task in _internal.iter_tasks(task_instances)}
        app = skein_client.submit_and_connect(spec)

        # Start a thread which collects all events posted by all tasks in kv store
        event_listener = Thread(target=_aggregate_events, args=(app.kv, events))
        event_listener.start()

        return SkeinCluster(skein_client, app, task_instances, event_listener, events)
Example #7
0
def submit(
    skein_client: skein.Client,
    module_name: str,
    additional_files: Optional[List[str]] = None,
    archive_hdfs: Optional[str] = None,
    args: Optional[List[str]] = None,
    env_vars: Optional[Dict[str, str]] = None,
    hadoop_file_systems: Tuple[str, ...] = (),
    max_attempts: int = 1,
    max_restarts: int = 0,
    memory: str = "1 GiB",
    name: str = "yarn_launcher",
    node_label: Optional[str] = None,
    num_containers: int = 1,
    num_cores: int = 1,
    pre_script_hook: Optional[str] = None,
    queue: Optional[str] = None,
    user: Optional[str] = None,
) -> str:
    """Submit application via skein.

    Returns the YARN application id of the submitted application.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Merge the caller's environment with the variables skein needs.
        env = dict(env_vars or {})
        env["SKEIN_CONFIG"] = "./.skein"
        env["GIT_PYTHON_REFRESH"] = "quiet"
        pre_script_hook = pre_script_hook or ""

        # Build the skein config, the service and the application spec.
        skein_config = skein_config_builder.build(
            module_name,
            args=args or [],
            package_path=archive_hdfs,
            additional_files=additional_files,
            tmp_dir=tmp_dir,
        )
        service = skein.Service(
            resources=skein.model.Resources(memory, num_cores),
            instances=num_containers,
            files=skein_config.files,
            env=env,
            script=f"""
                        set -x
                        env
                        {pre_script_hook}
                        {skein_config.script}
                    """,
            max_restarts=max_restarts,
        )
        app_spec = skein.ApplicationSpec(
            name=name,
            file_systems=list(hadoop_file_systems),
            services={name: service},
            acls=skein.model.ACLs(enable=True,
                                  ui_users=["*"],
                                  view_users=["*"]),
            max_attempts=max_attempts,
        )

        # Activate impersonation only if user to run the job is not
        # the current user (yarn issue)
        if user and user != getpass.getuser():
            app_spec.user = user
        if queue:
            app_spec.queue = queue
        if node_label:
            service.node_label = node_label
        return skein_client.submit(app_spec)