def _submit_and_await_termination(
    client: skein.Client,
    spec: skein.ApplicationSpec,
    tasks: typing.List[str],
    poll_every_secs: int = 10
):
    app = client.submit_and_connect(spec)

    # Collect events posted by all tasks in the kv store on a background thread
    events: typing.Dict[str, typing.Dict[str, str]] = \
        {task: {} for task in tasks}
    event_listener = Thread(target=_aggregate_events, args=(app.kv, events))
    event_listener.start()

    with _shutdown_on_exception(app):
        state = None
        # Poll the application report until YARN publishes a final status
        while True:
            report = client.application_report(app.id)
            logger.info(
                f"Application report for {app.id} (state: {report.state})")
            if state != report.state:
                logger.info(_format_app_report(report))

            if report.final_status != "undefined":
                event_listener.join()
                logger.info(_format_run_summary(events))
                if report.final_status == "failed":
                    raise RunFailed
                else:
                    break

            time.sleep(poll_every_secs)
            state = report.state

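# Hedged usage sketch for _submit_and_await_termination: the spec construction
# is elided, and the example function name and the "worker" task names are
# assumptions for illustration, not part of this module.
def _example_await_termination(my_spec: skein.ApplicationSpec) -> None:
    client = skein.Client()
    try:
        _submit_and_await_termination(
            client, my_spec, tasks=["worker:0", "worker:1"], poll_every_secs=5)
    except RunFailed:
        logger.error("Application reached final status 'failed'")
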
def _setup_skein_cluster(pyenvs: Dict[NodeLabel, PythonEnvDescription],
                         task_specs: Dict[str, TaskSpec] = TASK_SPEC_NONE,
                         *,
                         skein_client: Optional[skein.Client] = None,
                         files: Optional[Dict[str, str]] = None,
                         env: Dict[str, str] = {},
                         queue: str = "default",
                         acls: Optional[ACLs] = None,
                         file_systems: Optional[List[str]] = None,
                         log_conf_file: Optional[str] = None,
                         standalone_client_mode: bool = False) -> SkeinCluster:
    os.environ["JAVA_TOOL_OPTIONS"] = \
        "-XX:ParallelGCThreads=1 -XX:CICompilerCount=2 "\
        f"{os.environ.get('JAVA_TOOL_OPTIONS', '')}"

    with tempfile.TemporaryDirectory() as tempdir:
        task_files, task_env = _setup_task_env(tempdir, files, env)
        services = {}
        for task_type, task_spec in list(task_specs.items()):
            pyenv = pyenvs[task_spec.label]
            service_env = task_env.copy()
            if task_spec.termination_timeout_seconds >= 0:
                _add_to_env(service_env, "SERVICE_TERMINATION_TIMEOUT_SECONDS",
                            str(task_spec.termination_timeout_seconds))

            services[task_type] = skein.Service(
                script=gen_task_cmd(pyenv, log_conf_file),
                resources=skein.model.Resources(task_spec.memory,
                                                task_spec.vcores),
                max_restarts=0,
                instances=task_spec.instances,
                node_label=task_spec.label.value,
                files={
                    **task_files,
                    pyenv.dest_path: pyenv.path_to_archive
                },
                env=service_env)

        spec = skein.ApplicationSpec(
            services,
            queue=queue,
            acls=acls,
            file_systems=file_systems)

        if skein_client is None:
            skein_client = skein.Client()

        task_instances = [(task_type, spec.instances)
                          for task_type, spec in task_specs.items()]
        events: Dict[str, Dict[str, str]] = \
            {task: {} for task in iter_tasks(task_instances)}
        app = skein_client.submit_and_connect(spec)

        # Start a thread which collects all events posted by all tasks in kv store
        event_listener = Thread(target=_aggregate_events, args=(app.kv, events))
        event_listener.start()

        cluster_spec = _setup_cluster_tasks(task_instances, app,
                                            standalone_client_mode)

        return SkeinCluster(skein_client, app, task_instances, cluster_spec,
                            event_listener, events)

def wait_for_finished(client: skein.Client, app_id: str):
    logger.info(f"application_id: {app_id}")

    while True:
        report = client.application_report(app_id)
        logger.info(report)

        if report.final_status != "undefined":
            logger.info(report.final_status)
            break

        time.sleep(3)

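# Minimal sketch: block until a previously submitted application reaches a
# final status (polled every 3 seconds above). The example function name and
# the application id below are illustrative assumptions.
def _example_wait(client: skein.Client) -> None:
    wait_for_finished(client, "application_1600000000000_0001")
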
def _get_app_logs(
        client: skein.Client,
        app: skein.ApplicationClient,
        wait_for_nb_logs: int) -> Optional[skein.model.ApplicationLogs]:
    # Retry until one log file per expected container has been aggregated
    for ind in range(YARN_LOG_TRIES):
        try:
            logs = client.application_logs(app.id)
            nb_keys = len(logs.keys())
            logger.info(f"Got {nb_keys}/{wait_for_nb_logs} log files")
            if nb_keys == wait_for_nb_logs:
                return logs
        except Exception:
            logger.warning(
                f"Cannot collect logs (attempt {ind+1}/{YARN_LOG_TRIES})",
                exc_info=True)
        time.sleep(3)
    return None

def get_application_logs(
        client: skein.Client,
        app_id: str,
        wait_for_nb_logs: Optional[int] = None,
        log_tries: int = 15) -> Optional[skein.model.ApplicationLogs]:
    for ind in range(log_tries):
        try:
            logs = client.application_logs(app_id)
            nb_keys = len(logs.keys())
            logger.info(f"Got {nb_keys}/{wait_for_nb_logs} log files")
            if not wait_for_nb_logs or nb_keys == wait_for_nb_logs:
                return logs
        except Exception:
            logger.warning(
                f"Cannot collect logs (attempt {ind+1}/{log_tries})")
        time.sleep(3)
    return None

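# Hedged sketch: fetch aggregated YARN logs after completion and log each
# container's output. Relies only on names defined above; expecting exactly
# two log files is an assumption for illustration.
def _example_collect_logs(client: skein.Client, app_id: str) -> None:
    logs = get_application_logs(client, app_id, wait_for_nb_logs=2, log_tries=5)
    if logs:
        for container_key, container_log in logs.items():
            logger.info(f"{container_key}:\n{container_log}")
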
def _setup_skein_cluster(
        pyenvs: Dict[topologies.NodeLabel, _env.PythonEnvDescription],
        task_specs: Dict[str, topologies.TaskSpec] = TASK_SPEC_NONE,
        *,
        custom_task_module: Optional[str] = None,
        skein_client: Optional[skein.Client] = None,
        files: Optional[Dict[str, str]] = None,
        env: Dict[str, str] = {},
        queue: str = "default",
        acls: Optional[ACLs] = None,
        file_systems: Optional[List[str]] = None,
        name: str = "RunOnYarn",
        n_try: int = 0,
        pre_script_hook: Optional[str] = None
) -> SkeinCluster:
    os.environ["JAVA_TOOL_OPTIONS"] = \
        "-XX:ParallelGCThreads=1 -XX:CICompilerCount=2 "\
        f"{os.environ.get('JAVA_TOOL_OPTIONS', '')}"

    pre_script_hook = pre_script_hook if pre_script_hook else ""
    with tempfile.TemporaryDirectory() as tempdir:
        task_files, task_env = _setup_task_env(tempdir, files, env, n_try)
        services = {}
        for task_type, task_spec in list(task_specs.items()):
            pyenv = pyenvs[task_spec.label]
            service_env = task_env.copy()
            if task_spec.tb_termination_timeout_seconds >= 0:
                service_env["TB_TERMINATION_TIMEOUT_SECONDS"] = \
                    str(task_spec.tb_termination_timeout_seconds)
            if task_spec.tb_model_dir:
                service_env["TB_MODEL_DIR"] = str(task_spec.tb_model_dir)
            if task_spec.tb_extra_args:
                service_env["TB_EXTRA_ARGS"] = str(task_spec.tb_extra_args)

            services[task_type] = skein.Service(
                script=f'''
                    set -x
                    {pre_script_hook}
                    {_env.gen_task_cmd(
                        pyenv,
                        task_type,
                        custom_task_module)}
                ''',
                resources=skein.model.Resources(task_spec.memory,
                                                task_spec.vcores),
                max_restarts=0,
                instances=task_spec.instances,
                node_label=task_spec.label.value,
                files={
                    **task_files,
                    pyenv.dest_path: pyenv.path_to_archive
                },
                env=service_env)

        # on the cluster we don't ask again for delegation tokens
        if "HADOOP_TOKEN_FILE_LOCATION" in os.environ:
            file_systems = None

        spec = skein.ApplicationSpec(
            services,
            queue=queue,
            acls=acls,
            file_systems=file_systems,
            name=name
        )

        if skein_client is None:
            skein_client = skein.Client()

        task_instances = [(task_type, spec.instances)
                          for task_type, spec in task_specs.items()]
        events: Dict[str, Dict[str, str]] = \
            {task: {} for task in _internal.iter_tasks(task_instances)}
        app = skein_client.submit_and_connect(spec)

        # Start a thread which collects all events posted by all tasks in kv store
        event_listener = Thread(target=_aggregate_events, args=(app.kv, events))
        event_listener.start()

        return SkeinCluster(skein_client, app, task_instances,
                            event_listener, events)

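# Hedged sketch of driving _setup_skein_cluster. The pyenvs mapping is assumed
# to be built by the caller, and the TaskSpec field values below are
# illustrative; they may not match the real constructor exactly.
def _example_setup(
        pyenvs: Dict[topologies.NodeLabel, _env.PythonEnvDescription]
) -> SkeinCluster:
    task_specs = {
        "chief": topologies.TaskSpec(memory="2 GiB", vcores=2, instances=1),
        "worker": topologies.TaskSpec(memory="2 GiB", vcores=2, instances=2),
    }
    return _setup_skein_cluster(pyenvs, task_specs,
                                queue="default", name="RunOnYarn")
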
def submit(
    skein_client: skein.Client,
    module_name: str,
    additional_files: Optional[List[str]] = None,
    archive_hdfs: Optional[str] = None,
    args: Optional[List[str]] = None,
    env_vars: Optional[Dict[str, str]] = None,
    hadoop_file_systems: Tuple[str, ...] = (),
    max_attempts: int = 1,
    max_restarts: int = 0,
    memory: str = "1 GiB",
    name: str = "yarn_launcher",
    node_label: Optional[str] = None,
    num_containers: int = 1,
    num_cores: int = 1,
    pre_script_hook: Optional[str] = None,
    queue: Optional[str] = None,
    user: Optional[str] = None,
) -> str:
    """Submit application via skein."""
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Update environment variables and script hook
        env = dict(env_vars) if env_vars else dict()
        pre_script_hook = pre_script_hook if pre_script_hook else ""
        env.update({"SKEIN_CONFIG": "./.skein", "GIT_PYTHON_REFRESH": "quiet"})

        # Create skein config, service and spec
        skein_config = skein_config_builder.build(
            module_name,
            args=args if args else [],
            package_path=archive_hdfs,
            additional_files=additional_files,
            tmp_dir=tmp_dir,
        )
        skein_service = skein.Service(
            resources=skein.model.Resources(memory, num_cores),
            instances=num_containers,
            files=skein_config.files,
            env=env,
            script=f"""
                set -x
                env
                {pre_script_hook}
                {skein_config.script}
            """,
            max_restarts=max_restarts,
        )
        skein_spec = skein.ApplicationSpec(
            name=name,
            file_systems=list(hadoop_file_systems),
            services={name: skein_service},
            acls=skein.model.ACLs(enable=True, ui_users=["*"], view_users=["*"]),
            max_attempts=max_attempts,
        )

        # Activate impersonation only if the user running the job is not
        # the current user (YARN issue)
        if user and user != getpass.getuser():
            skein_spec.user = user
        if queue:
            skein_spec.queue = queue
        if node_label:
            skein_service.node_label = node_label

        return skein_client.submit(skein_spec)

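# Usage sketch: submit a Python module to YARN and block until completion,
# using only the parameters defined above. The module name and arguments are
# illustrative assumptions, not part of this API.
def _example_submit_and_wait(client: skein.Client) -> None:
    app_id = submit(
        client,
        module_name="my_package.my_job",
        args=["--epochs", "10"],
        memory="2 GiB",
        num_cores=2,
        num_containers=1,
        name="example_job",
    )
    wait_for_finished(client, app_id)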