Example #1
def test_client_login_from_keytab(security, not_logged_in):
    with skein.Client(principal='testuser',
                      keytab=KEYTAB_PATH,
                      security=security) as client:
        # login worked
        client.get_applications()

    # Improper principal/keytab pair
    with pytest.raises(skein.DriverError):
        skein.Client(principal='not_the_right_user',
                     keytab=KEYTAB_PATH,
                     security=security)

    # Keytab file missing
    with pytest.raises(FileNotFoundError):
        skein.Client(principal='testuser',
                     keytab='/not/a/real/path',
                     security=security)

    # Must specify both principal and keytab
    with pytest.raises(ValueError):
        skein.Client(principal='testuser', security=security)

    with pytest.raises(ValueError):
        skein.Client(keytab=KEYTAB_PATH, security=security)
Example #2
def test_client(security, kinit, tmpdir):
    logpath = str(tmpdir.join("log.txt"))

    with skein.Client(security=security, log=logpath) as client:
        # smoketests
        client.get_applications()
        repr(client)

        client2 = skein.Client(address=client.address, security=security)
        assert client2._proc is None

        # smoketests
        client2.get_applications()
        repr(client2)

    # Process was definitely closed
    assert not pid_exists(client._proc.pid)

    # no-op to call close again
    client.close()

    # Log was written
    assert os.path.exists(logpath)
    with open(logpath) as fil:
        assert len(fil.read()) > 0

    # Connection error on closed client
    with pytest.raises(skein.ConnectionError):
        client2.get_applications()

    # Connection error on connecting to missing daemon
    with pytest.raises(skein.ConnectionError):
        skein.Client(address=client.address, security=security)
Example #3
async def skein_client(principal=None, keytab=None):
    """Return a shared skein client object.

    Calls with the same principal & keytab will return the same client object
    (if one exists).
    """
    key = (principal, keytab)
    client = _skein_client_cache.get(key)
    if client is None:
        kwargs = dict(
            principal=principal,
            keytab=keytab,
            security=skein.Security.new_credentials(),
        )
        fut = get_running_loop().run_in_executor(
            None, lambda: skein.Client(**kwargs))
        # Save the future first so any concurrent calls will wait on the same
        # future for generating the client
        _skein_client_cache[key] = fut
        client = await fut
        # Replace the future now that the operation is done
        _skein_client_cache[key] = client
    elif asyncio.isfuture(client):
        client = await client
    return client
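A minimal usage sketch for the coroutine above, assuming the module-level _skein_client_cache and get_running_loop helpers it relies on; submit_app and its spec argument are hypothetical. Callers sharing a principal/keytab pair reuse one skein.Client, and blocking skein calls are still pushed off the event loop:

import asyncio

async def submit_app(spec, principal=None, keytab=None):
    # Reuse (or lazily create) the shared client for this principal/keytab pair.
    client = await skein_client(principal, keytab)
    # skein.Client.submit blocks, so run it in the default executor.
    return await asyncio.get_running_loop().run_in_executor(
        None, lambda: client.submit(spec))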
Example #4
def launch_remote_check(file: str) -> Tuple[bool, str]:
    logging.info('Launching remote check')
    zip_hdfs, _ = cluster_pack.upload_env(packer=cluster_pack.PEX_PACKER)
    archive_name = os.path.basename(zip_hdfs)
    with skein.Client() as client:
        files = {
            archive_name: zip_hdfs,
            'check_hadoop_env.py': __file__,
        }
        editable_packages = cluster_pack.get_editable_requirements()
        if 'tf_yarn' in editable_packages:
            tf_yarn_zip = cluster_pack.zip_path(editable_packages['tf_yarn'],
                                                False)
            logger.info(f"zip path for editable tf_yarn is {tf_yarn_zip}")
            files.update({'tf_yarn': tf_yarn_zip})
        service = skein.Service(
            script=f'./{archive_name} check_hadoop_env.py --file {file}',
            resources=skein.Resources(2 * 1024, 1),
            env={
                'PEX_ROOT': f'/tmp/{uuid.uuid4()}/',
                'PYTHONPATH': '.:',
            },
            files=files,
            instances=1)
        spec = skein.ApplicationSpec(
            {'HADOOP_ENV_CHECKER': service},
            acls=skein.model.ACLs(enable=True, view_users=['*']),
        )
        app = client.submit_and_connect(spec)

        logging.info('Remote check started')
        result = app.kv.wait('result').decode()
        app_id = app.id
        app.shutdown()
        return result == "True", app_id
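The app.kv.wait('result') call above blocks until the container writes a 'result' key. A hedged sketch of what the container side (check_hadoop_env.py) might do to complete that handshake; the actual check is an assumption, only the key name and the bytes-valued kv store come from the example:

# check_hadoop_env.py -- hypothetical container-side counterpart
import argparse
import subprocess

import skein

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--file', required=True)
    args = parser.parse_args()

    # Connect back to the application master from inside the container.
    app = skein.ApplicationClient.from_current()

    # Assumed check: is the given HDFS path visible from the container?
    ok = subprocess.run(['hdfs', 'dfs', '-test', '-e', args.file]).returncode == 0

    # kv values must be bytes; the driver decodes and compares against "True".
    app.kv['result'] = str(ok).encode()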
Example #5
def test_client_errors_nicely_if_not_logged_in(security, not_logged_in):
    appid = 'application_1526134340424_0012'

    spec = skein.ApplicationSpec(name="should_never_get_to_run",
                                 queue="default",
                                 services={
                                     'service':
                                     skein.Service(resources=skein.Resources(
                                         memory=32, vcores=1),
                                                   script='env')
                                 })

    with skein.Client(security=security) as client:
        for func, args in [('get_applications', ()), ('get_nodes', ()),
                           ('get_queue', ('default', )),
                           ('get_child_queues', ('default', )),
                           ('get_all_queues', ()),
                           ('application_report', (appid, )),
                           ('connect', (appid, )),
                           ('move_application', (appid, 'default')),
                           ('kill_application', (appid, )),
                           ('submit', (spec, ))]:
            with pytest.raises(skein.DriverError) as exc:
                getattr(client, func)(*args)
            assert 'kinit' in str(exc.value)
Example #6
    def run(self):
        if self.run_on_yarn:
            # Dump job and base as local json files for yarn_launcher
            job_name = f"job-{self.config.name}.json"
            with open(job_name, "w") as file:
                json.dump(self.job, file, indent=4)

            # Launch job on yarn
            pex_path = self.config.upload_pex_cpu()
            with skein.Client() as skein_client:
                LOGGER.info(f"Submitting job {self.config.name}")
                app_id = submit(
                    skein_client=skein_client,
                    module_name="deepr.cli.main",
                    additional_files=[job_name],
                    archive_hdfs=pex_path,
                    args=["from_config", job_name, "-", "run"],
                    env_vars=self.config.get_env_vars(),
                    hadoop_file_systems=self.config.hadoop_file_systems,
                    memory=self.config.memory,
                    name=self.config.name,
                    num_cores=self.config.num_cores,
                )
                report = skein_client.application_report(app_id)
                LOGGER.info(f"TRACKING_URL: {report.tracking_url}")
            mlflow.clear_run()
        else:
            LOGGER.info("Not running on yarn.")
            job = from_config(self.job)
            job.run()
Example #7
def _get_skein_client(skein_client=None, security=None):
    if skein_client is None:
        # Silence warning about credentials not being written yet
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            return skein.Client(security=security)
    return skein_client
Example #8
def _setup_skein_cluster(pyenvs: Dict[NodeLabel, PythonEnvDescription],
                         task_specs: Dict[str, TaskSpec] = TASK_SPEC_NONE,
                         *,
                         skein_client: skein.Client = None,
                         files: Dict[str, str] = None,
                         env: Dict[str, str] = {},
                         queue: str = "default",
                         acls: ACLs = None,
                         file_systems: List[str] = None,
                         log_conf_file: str = None,
                         standalone_client_mode: bool = False) -> SkeinCluster:
    os.environ["JAVA_TOOL_OPTIONS"] = \
        "-XX:ParallelGCThreads=1 -XX:CICompilerCount=2 "\
        f"{os.environ.get('JAVA_TOOL_OPTIONS', '')}"

    with tempfile.TemporaryDirectory() as tempdir:
        task_files, task_env = _setup_task_env(tempdir, files, env)
        services = {}
        for task_type, task_spec in list(task_specs.items()):
            pyenv = pyenvs[task_spec.label]
            service_env = task_env.copy()
            if task_spec.termination_timeout_seconds >= 0:
                _add_to_env(service_env, "SERVICE_TERMINATION_TIMEOUT_SECONDS",
                            str(task_spec.termination_timeout_seconds))
            services[task_type] = skein.Service(
                script=gen_task_cmd(pyenv, log_conf_file),
                resources=skein.model.Resources(task_spec.memory,
                                                task_spec.vcores),
                max_restarts=0,
                instances=task_spec.instances,
                node_label=task_spec.label.value,
                files={
                    **task_files, pyenv.dest_path: pyenv.path_to_archive
                },
                env=service_env)

        spec = skein.ApplicationSpec(services,
                                     queue=queue,
                                     acls=acls,
                                     file_systems=file_systems)

        if skein_client is None:
            skein_client = skein.Client()

        task_instances = [(task_type, spec.instances)
                          for task_type, spec in task_specs.items()]
        events: Dict[str, Dict[str, str]] = \
            {task: {} for task in iter_tasks(task_instances)}
        app = skein_client.submit_and_connect(spec)
        # Start a thread which collects all events posted by all tasks in kv store
        event_listener = Thread(target=_aggregate_events,
                                args=(app.kv, events))
        event_listener.start()

        cluster_spec = _setup_cluster_tasks(task_instances, app,
                                            standalone_client_mode)

        return SkeinCluster(skein_client, app, task_instances, cluster_spec,
                            event_listener, events)
Example #9
def main():
    tf.logging.set_verbosity(tf.logging.DEBUG)
    print(sys.argv)
    if '--server' in sys.argv:
        create_cluster()
    else:
        with skein.Client() as client:
            client_tf(client)
Example #10
def _check_merged_logs(app_id, key_word, result_status):
    with skein.Client() as client:
        logs = skein_launcher.get_application_logs(client, app_id, 2)
        merged_logs = ""
        for key, value in logs.items():
            merged_logs += value
            _logger.info(f"logs:{key} {value}")
        assert result_status
        assert key_word in merged_logs
Example #11
def test_client_closed_when_reference_dropped(security, kinit):
    client = skein.Client(security=security, log=False)
    ref = weakref.ref(client)

    pid = client._proc.pid

    del client
    assert ref() is None
    assert not pid_exists(pid)
Example #12
    def stop(self, status='SUCCEEDED'):
        import skein
        try:
            skein_client = skein.Client()
            app_client = skein_client.connect(self._application_id)
            app_client.shutdown(status=status)
            if self._is_client_managed:
                self._skein_client.close()
        except skein.ApplicationNotRunningError:
            pass
Example #13
def test_client_set_log_level(security, kinit, tmpdir):
    logpath = str(tmpdir.join("log.txt"))

    with skein.Client(security=security, log=logpath, log_level='debug') as client:
        # do an operation to ensure everything is working
        client.get_applications()

    with open(logpath) as fil:
        data = fil.read()
        assert 'DEBUG' in data
Example #14
    async def _get_client(self):
        key = (self.principal, self.keytab)
        client = type(self).clients.get(key)
        if client is None:
            kwargs = dict(principal=self.principal,
                          keytab=self.keytab,
                          security=skein.Security.new_credentials())
            client = await gen.IOLoop.current().run_in_executor(
                None, lambda: skein.Client(**kwargs))
            type(self).clients[key] = client
        return client
Example #15
def test_client_starts_without_java_home(monkeypatch, tmpdir, security, kinit):
    monkeypatch.delenv('JAVA_HOME', raising=False)

    logpath = str(tmpdir.join("log.txt"))

    with skein.Client(security=security, log=logpath) as client:
        # do an operation to ensure everything is working
        client.get_applications()

    with open(logpath) as fil:
        data = fil.read()
        assert 'WARN' not in data
        assert 'native-hadoop' not in data
Example #16
    def launch_skein():
        with skein.Client() as client:
            service = skein.Service(
                resources=skein.model.Resources("1 GiB", 1),
                script=f'''
                    set -x
                    hdfs dfs -cat {filepath_on_hdfs}
                '''
            )
            spec = skein.ApplicationSpec(services={"service": service})
            app_id = client.submit(spec)
            skein_launcher.wait_for_finished(client, app_id)
            logs = skein_launcher.get_application_logs(client, app_id, 2)
            for key, value in logs.items():
                print(f"skein logs:{key} {value}")
Example #17
def test_client_forward_java_options(use_env, security, kinit, tmpdir, monkeypatch):
    logpath = str(tmpdir.join("log.txt"))

    if use_env:
        monkeypatch.setenv('SKEIN_DRIVER_JAVA_OPTIONS',
                           '-Dskein.log.level=debug')
        kwargs = {}
    else:
        kwargs = {'java_options': ['-Dskein.log.level=debug']}

    with skein.Client(security=security, log=logpath, **kwargs) as client:
        # do an operation to ensure everything is working
        client.get_applications()

    with open(logpath) as fil:
        data = fil.read()
        assert 'DEBUG' in data
Example #18
def _submit_and_await_app_master(func, assert_result_status=True, assert_log_content=None):
    with skein.Client() as client:
        log_output_path = f"hdfs:///tmp/{uuid.uuid4()}.log"
        app_id = skein_launcher.submit_func(
            client,
            func=func,
            args=[],
            memory="2 GiB",
            process_logs=functools.partial(skein_launcher.upload_logs_to_hdfs, log_output_path))
        result = skein_launcher.wait_for_finished(client, app_id)

        fs, _ = filesystem.resolve_filesystem_and_path(log_output_path)
        with fs.open(log_output_path, "rb") as f:
            logs = f.read().decode()
            assert result == assert_result_status
            _logger.info(f"appmaster logs:\n{logs}")
            assert assert_log_content in logs
Example #19
    def _start_cluster(self, spec, skein_client=None):
        """Start the cluster and initialize state"""
        if skein_client is None:
            skein_client = skein.Client()

        app = skein_client.submit_and_connect(spec)
        try:
            scheduler_address = app.kv.wait('dask.scheduler').decode()
        except BaseException:
            # Failed to connect, kill the application and reraise
            skein_client.kill_application(app.id)
            raise

        # Ensure application gets cleaned up
        self._finalizer = weakref.finalize(self, app.shutdown)

        self.app_id = app.id
        self.application_client = app
        self.scheduler_address = scheduler_address
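The app.kv.wait('dask.scheduler') call above blocks until a container in the application publishes the scheduler address. A minimal sketch of that publishing side, under the assumption that it runs inside a container of the same application (the real dask-yarn scheduler service does considerably more):

import skein

def publish_scheduler_address(address: str) -> None:
    # Runs inside a YARN container of the application started above.
    app = skein.ApplicationClient.from_current()
    # Values in the skein key-value store are bytes.
    app.kv['dask.scheduler'] = address.encode()

# e.g. publish_scheduler_address('tcp://10.0.0.5:8786')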
Example #20
def status(app_id):
    report = skein.Client().application_report(app_id)
    header = ['application_id',
              'name',
              'state',
              'status',
              'containers',
              'vcores',
              'memory',
              'runtime']
    data = [(report.id,
             report.name,
             report.state,
             report.final_status,
             report.usage.num_used_containers,
             report.usage.used_resources.vcores,
             report.usage.used_resources.memory,
             humanize_timedelta(report.runtime))]
    print(format_table(header, data))
Example #21
    def from_application_id(cls, app_id, skein_client=None):
        """Connect to an existing ``YarnCluster`` with a given application id.

        Parameters
        ----------
        app_id : str
            The existing cluster's application id.
        skein_client : skein.Client
            The ``skein.Client`` to use. If not provided, one will be started.

        Returns
        -------
        YarnCluster
        """
        self = super(YarnCluster, cls).__new__(cls)
        if skein_client is None:
            skein_client = skein.Client()
        app = skein_client.connect(app_id)
        self._connect_existing(app)
        return self
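A short usage sketch for the classmethod above; the application id is a placeholder and the dask_yarn import path is assumed rather than taken from the snippet:

import skein
from dask_yarn import YarnCluster  # assumed import path

# Either let the classmethod start its own client, or pass one explicitly.
client = skein.Client()
cluster = YarnCluster.from_application_id(
    'application_1526134340424_0012',  # placeholder application id
    skein_client=client)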
Example #22
import logging
import tempfile

import cluster_pack
import skein
from cluster_pack.skein import skein_config_builder, skein_launcher


if __name__ == "__main__":

    logging.basicConfig(level="INFO")

    package_path, _ = cluster_pack.upload_env()

    with tempfile.TemporaryDirectory() as tmp_dir:
        skein_config = skein_config_builder.build(
            module_name="skein_project.worker",
            package_path=package_path,
            tmp_dir=tmp_dir
        )

        with skein.Client() as client:
            service = skein.Service(
                resources=skein.model.Resources("1 GiB", 1),
                files=skein_config.files,
                script=skein_config.script
            )
            spec = skein.ApplicationSpec(services={"service": service})
            app_id = client.submit(spec)

            skein_launcher.wait_for_finished(client, app_id)
            logs = skein_launcher.get_application_logs(client, app_id, 2)
            if logs:
                for key, value in logs.items():
                    print(f"skein logs:{key} {value}")
Example #23
def new_cluster(environment=None,
                scheduler_num=1,
                scheduler_cpu=None,
                scheduler_mem=None,
                worker_num=1,
                worker_cpu=None,
                worker_mem=None,
                worker_spill_paths=None,
                worker_cache_mem=None,
                min_worker_num=None,
                web_num=1,
                web_cpu=None,
                web_mem=None,
                timeout=None,
                log_config=None,
                skein_client=None,
                app_name=None,
                **kwargs):
    import skein
    from .web import YarnWebApplication

    def _override_envs(src, updates):
        ret = src.copy()
        ret.update(updates)
        return ret

    app_name = app_name or f'mars-app-{uuid.uuid4()}'

    log_when_fail = kwargs.pop('log_when_fail', False)

    scheduler_extra_modules = kwargs.pop('scheduler_extra_modules', None)
    worker_extra_modules = kwargs.pop('worker_extra_modules', None)
    web_extra_modules = kwargs.pop('web_extra_modules', None)

    cmd_tmpl = kwargs.pop('cmd_tmpl', None)

    extra_envs = kwargs.pop('extra_env', dict())
    scheduler_extra_env = _override_envs(
        extra_envs, kwargs.pop('scheduler_extra_env', dict()))
    worker_extra_env = _override_envs(extra_envs,
                                      kwargs.pop('worker_extra_env', dict()))
    web_extra_env = _override_envs(extra_envs,
                                   kwargs.pop('web_extra_env', dict()))

    extra_args = kwargs.pop('extra_args', '')
    scheduler_extra_args = (extra_args + ' ' +
                            kwargs.pop('scheduler_extra_args', '')).strip()
    worker_extra_args = (extra_args + ' ' +
                         kwargs.pop('worker_extra_args', '')).strip()
    web_extra_args = (extra_args + ' ' +
                      kwargs.pop('web_extra_args', '')).strip()

    scheduler_log_config = kwargs.pop('scheduler_log_config', log_config)
    worker_log_config = kwargs.pop('worker_log_config', log_config)
    web_log_config = kwargs.pop('web_log_config', log_config)

    scheduler_config = MarsSchedulerConfig(instances=scheduler_num,
                                           environment=environment,
                                           cpu=scheduler_cpu,
                                           memory=scheduler_mem,
                                           modules=scheduler_extra_modules,
                                           env=scheduler_extra_env,
                                           log_config=scheduler_log_config,
                                           extra_args=scheduler_extra_args,
                                           cmd_tmpl=cmd_tmpl)
    worker_config = MarsWorkerConfig(instances=worker_num,
                                     environment=environment,
                                     cpu=worker_cpu,
                                     memory=worker_mem,
                                     spill_dirs=worker_spill_paths,
                                     worker_cache_mem=worker_cache_mem,
                                     modules=worker_extra_modules,
                                     env=worker_extra_env,
                                     log_config=worker_log_config,
                                     extra_args=worker_extra_args,
                                     cmd_tmpl=cmd_tmpl)
    web_config = MarsWebConfig(instances=web_num,
                               environment=environment,
                               cpu=web_cpu,
                               memory=web_mem,
                               modules=web_extra_modules,
                               env=web_extra_env,
                               log_config=web_log_config,
                               extra_args=web_extra_args,
                               cmd_tmpl=cmd_tmpl)
    app_config = MarsApplicationConfig(app_name,
                                       scheduler_config=scheduler_config,
                                       worker_config=worker_config,
                                       web_config=web_config)

    skein_client = skein_client or skein.Client()
    app_id = None
    try:
        is_client_managed = skein_client is not None
        app_id = skein_client.submit(app_config.build())

        check_start_time = time.time()
        while True:
            try:
                app_client = skein_client.connect(app_id)
                break
            except skein.ApplicationNotRunningError:  # pragma: no cover
                time.sleep(0.5)
                if timeout and time.time() - check_start_time > timeout:
                    raise

        logger.debug('Application client for %s at %s retrieved', app_id,
                     app_client.address)

        # wait until schedulers and expected num of workers are ready
        min_worker_num = int(min_worker_num or worker_num)
        limits = [scheduler_num, min_worker_num, web_num]
        services = [
            MarsSchedulerConfig.service_name, MarsWorkerConfig.service_name,
            MarsWebConfig.service_name
        ]

        wait_services_ready(
            services,
            limits,
            lambda svc: _get_ready_container_count(app_client, svc),
            timeout=None if not timeout else timeout - (time.time() - check_start_time))
        web_endpoint_kv = app_client.kv.get_prefix(
            YarnWebApplication.service_name)
        web_endpoint = random.choice(
            [to_str(v).split('@', 1)[0] for v in web_endpoint_kv.values()])
        return YarnClusterClient(skein_client,
                                 app_client.id,
                                 'http://' + web_endpoint,
                                 is_client_managed=is_client_managed)
    except:  # noqa: E722
        skein_client = skein.Client()
        try:
            if log_when_fail:
                if app_id is not None:
                    try:
                        app_client = skein_client.connect(app_id)
                        app_client.shutdown(status='FAILED')
                    except skein.ApplicationNotRunningError:
                        pass

                    try:
                        logs = skein_client.application_logs(app_id)
                        logger.error('Error when creating cluster:\n%s',
                                     logs.dumps())
                    except ValueError:
                        logger.error(
                            'Error when creating cluster and failed to get logs'
                        )
                else:
                    logger.error(
                        'Error when creating cluster and no logs from cluster')
        finally:
            if app_id is not None:
                skein_client.kill_application(app_id)
        raise
Example #24
def skein_client(security, kinit):
    with skein.Client(security=security) as skein_client:
        yield skein_client
Example #25
def _setup_skein_cluster(
        pyenvs: Dict[topologies.NodeLabel, _env.PythonEnvDescription],
        task_specs: Dict[str, topologies.TaskSpec] = TASK_SPEC_NONE,
        *,
        custom_task_module: Optional[str] = None,
        skein_client: skein.Client = None,
        files: Dict[str, str] = None,
        env: Dict[str, str] = {},
        queue: str = "default",
        acls: ACLs = None,
        file_systems: List[str] = None,
        name: str = "RunOnYarn",
        n_try: int = 0,
        pre_script_hook: Optional[str] = None
) -> SkeinCluster:
    os.environ["JAVA_TOOL_OPTIONS"] = \
        "-XX:ParallelGCThreads=1 -XX:CICompilerCount=2 "\
        f"{os.environ.get('JAVA_TOOL_OPTIONS', '')}"

    pre_script_hook = pre_script_hook if pre_script_hook else ""
    with tempfile.TemporaryDirectory() as tempdir:
        task_files, task_env = _setup_task_env(tempdir, files, env, n_try)
        services = {}
        for task_type, task_spec in list(task_specs.items()):
            pyenv = pyenvs[task_spec.label]
            service_env = task_env.copy()
            if task_spec.tb_termination_timeout_seconds >= 0:
                service_env["TB_TERMINATION_TIMEOUT_SECONDS"] = \
                    str(task_spec.tb_termination_timeout_seconds)
            if task_spec.tb_model_dir:
                service_env["TB_MODEL_DIR"] = str(task_spec.tb_model_dir)
            if task_spec.tb_extra_args:
                service_env["TB_EXTRA_ARGS"] = str(task_spec.tb_extra_args)

            services[task_type] = skein.Service(
                script=f'''
                            set -x
                            {pre_script_hook}
                            {_env.gen_task_cmd(
                                pyenv,
                                task_type,
                                custom_task_module)}
                        ''',
                resources=skein.model.Resources(task_spec.memory, task_spec.vcores),
                max_restarts=0,
                instances=task_spec.instances,
                node_label=task_spec.label.value,
                files={
                    **task_files,
                    pyenv.dest_path: pyenv.path_to_archive
                },
                env=service_env)

        # on the cluster we don't ask again for delegation tokens
        if "HADOOP_TOKEN_FILE_LOCATION" in os.environ:
            file_systems = None

        spec = skein.ApplicationSpec(
            services,
            queue=queue,
            acls=acls,
            file_systems=file_systems,
            name=name
        )

        if skein_client is None:
            skein_client = skein.Client()

        task_instances = [(task_type, spec.instances) for task_type, spec in task_specs.items()]
        events: Dict[str, Dict[str, str]] = \
            {task: {} for task in _internal.iter_tasks(task_instances)}
        app = skein_client.submit_and_connect(spec)

        # Start a thread which collects all events posted by all tasks in kv store
        event_listener = Thread(target=_aggregate_events, args=(app.kv, events))
        event_listener.start()

        return SkeinCluster(skein_client, app, task_instances, event_listener, events)
Example #26
def skein_client():
    with skein.Client() as client:
        yield client
Example #27
    print('Sent: %r' % message)

    data = await reader.read(100)
    print('Received: %r' % data.decode())

    writer.close()


async def echo_all(app, message):
    """Send and recieve a message from all running echo servers"""
    # Loop through all registered server addresses
    for address in app.kv.get_prefix('address.').values():
        # Parse the host and port from the stored address
        host, port = address.decode().split(':')
        port = int(port)

        # Send the message to the echo server
        await tcp_echo_client(message, loop, host, port)


# Get the application id from the command-line args
app_id = sys.argv[1]

# Connect to the application
app = skein.Client().connect(app_id)

# Send message to every running echo server
loop = asyncio.get_event_loop()
loop.run_until_complete(echo_all(app, 'Hello World!'))
loop.close()
Example #28
def new_cluster(
    environment=None,
    supervisor_num=1,
    supervisor_cpu=None,
    supervisor_mem=None,
    worker_num=1,
    worker_cpu=None,
    worker_mem=None,
    worker_spill_paths=None,
    worker_cache_mem=None,
    min_worker_num=None,
    timeout=None,
    log_config=None,
    skein_client=None,
    app_name=None,
    app_queue=None,
    **kwargs,
):
    import skein
    from .supervisor import YarnSupervisorCommandRunner

    def _override_envs(src, updates):
        ret = src.copy()
        ret.update(updates)
        return ret

    if worker_cpu is None or worker_mem is None:  # pragma: no cover
        raise TypeError("`worker_cpu` and `worker_mem` must be specified")

    app_name = app_name or f"mars-app-{uuid.uuid4()}"
    supervisor_mem = calc_size_by_str(supervisor_mem, None)
    worker_mem = calc_size_by_str(worker_mem, None)

    log_when_fail = kwargs.pop("log_when_fail", False)

    supervisor_extra_modules = kwargs.pop("supervisor_extra_modules", None)
    worker_extra_modules = kwargs.pop("worker_extra_modules", None)

    cmd_tmpl = kwargs.pop("cmd_tmpl", None)

    extra_envs = kwargs.pop("extra_env", dict())
    supervisor_extra_env = _override_envs(
        extra_envs, kwargs.pop("supervisor_extra_env", dict()))
    worker_extra_env = _override_envs(extra_envs,
                                      kwargs.pop("worker_extra_env", dict()))

    extra_args = kwargs.pop("extra_args", "")
    supervisor_extra_args = (extra_args + " " +
                             kwargs.pop("supervisor_extra_args", "")).strip()
    worker_extra_args = (extra_args + " " +
                         kwargs.pop("worker_extra_args", "")).strip()

    supervisor_log_config = kwargs.pop("supervisor_log_config", log_config)
    worker_log_config = kwargs.pop("worker_log_config", log_config)

    supervisor_config = MarsSupervisorConfig(
        instances=supervisor_num,
        environment=environment,
        cpu=supervisor_cpu,
        memory=supervisor_mem,
        modules=supervisor_extra_modules,
        env=supervisor_extra_env,
        log_config=supervisor_log_config,
        extra_args=supervisor_extra_args,
        cmd_tmpl=cmd_tmpl,
    )
    worker_config = MarsWorkerConfig(
        instances=worker_num,
        environment=environment,
        cpu=worker_cpu,
        memory=worker_mem,
        spill_dirs=worker_spill_paths,
        worker_cache_mem=worker_cache_mem,
        modules=worker_extra_modules,
        env=worker_extra_env,
        log_config=worker_log_config,
        extra_args=worker_extra_args,
        cmd_tmpl=cmd_tmpl,
    )
    app_config = MarsApplicationConfig(
        app_name,
        app_queue,
        supervisor_config=supervisor_config,
        worker_config=worker_config,
    )

    skein_client = skein_client or skein.Client()
    app_id = None
    try:
        is_client_managed = skein_client is not None
        app_id = skein_client.submit(app_config.build())

        check_start_time = time.time()
        while True:
            try:
                app_client = skein_client.connect(app_id)
                break
            except skein.ApplicationNotRunningError:  # pragma: no cover
                time.sleep(0.5)
                if timeout and time.time() - check_start_time > timeout:
                    raise

        logger.debug("Application client for %s at %s retrieved", app_id,
                     app_client.address)

        # wait until supervisors and expected num of workers are ready
        min_worker_num = int(min_worker_num or worker_num)
        limits = [supervisor_num, min_worker_num]
        services = [
            MarsSupervisorConfig.service_name, MarsWorkerConfig.service_name
        ]

        wait_services_ready(
            services,
            limits,
            lambda svc: _get_ready_container_count(app_client, svc),
            timeout=None if not timeout else timeout -
            (time.time() - check_start_time),
        )
        web_endpoint_kv = app_client.kv.get_prefix(
            YarnSupervisorCommandRunner.web_service_name)
        web_endpoint = random.choice(
            [to_str(v).split("@", 1)[0] for v in web_endpoint_kv.values()])
        return YarnClusterClient(
            skein_client,
            app_client.id,
            web_endpoint,
            is_client_managed=is_client_managed,
        )
    except:  # noqa: E722
        skein_client = skein.Client()
        try:
            if log_when_fail:
                if app_id is not None:
                    try:
                        app_client = skein_client.connect(app_id)
                        app_client.shutdown(status="FAILED")
                    except skein.ApplicationNotRunningError:
                        pass

                    try:
                        logs = skein_client.application_logs(app_id)
                        logger.error("Error when creating cluster:\n%s",
                                     logs.dumps())
                    except ValueError:
                        logger.error(
                            "Error when creating cluster and failed to get logs"
                        )
                else:
                    logger.error(
                        "Error when creating cluster and no logs from cluster")
        finally:
            if app_id is not None:
                skein_client.kill_application(app_id)
        raise
Example #29
    def setup_skein_cluster(self,
                            task_specs: Dict[str, TaskSpec] = TASK_SPEC_NONE,
                            *,
                            files: Dict[str, str] = None,
                            env: Dict[str, str] = {},
                            log_conf_file: str = None) -> SkeinCluster:
        """Request a cluster on YARN with Skein.

        The implementation allocates a service with the requested number
        of instances for each distributed TensorFlow task type. Each
        instance expects a serialized run_config to set up the TensorFlow servers
        and an experiment function to execute.

        Parameters
        ----------
        task_specs
            Resources to allocate for each task type. The keys
            must be a subset of ``"chief"``, ``"worker"``, ``"ps"``, and
            ``"evaluator"``. The minimal spec must contain at least
            ``"chief"``.

        files
            Local files or directories to upload to the container.
            The keys are the target locations of the resources relative
            to the container root, while the values -- their
            corresponding local sources. Note that container root is
            appended to ``PYTHONPATH``. Therefore, any listed Python
            module or package is automatically importable.

        env
            Environment variables to forward to the containers.

        log_conf_file
            Optional file with logging configuration. By default logging is
            set up with INFO verbosity. If you specify a file here, don't
            forget to also ship it to the containers via the ``files`` arg.
        """
        os.environ["JAVA_TOOL_OPTIONS"] = \
            "-XX:ParallelGCThreads=1 -XX:CICompilerCount=2 "\
            f"{os.environ.get('JAVA_TOOL_OPTIONS', '')}"

        with tempfile.TemporaryDirectory() as tempdir:
            task_files, task_env = _setup_task_env(tempdir, files, env)
            services = {}
            for task_type, task_spec in list(task_specs.items()):
                pyenv = self.pyenvs[task_spec.label]
                service_env = task_env.copy()
                if task_spec.termination_timeout_seconds >= 0:
                    _add_to_env(service_env,
                                "SERVICE_TERMINATION_TIMEOUT_SECONDS",
                                str(task_spec.termination_timeout_seconds))
                services[task_type] = skein.Service(
                    commands=[gen_task_cmd(pyenv, log_conf_file)],
                    resources=skein.model.Resources(task_spec.memory,
                                                    task_spec.vcores),
                    max_restarts=0,
                    instances=task_spec.instances,
                    node_label=task_spec.label.value,
                    files={
                        **task_files, pyenv.dest_path: pyenv.path_to_archive
                    },
                    env=service_env)

            spec = skein.ApplicationSpec(services,
                                         queue=self.queue,
                                         file_systems=self.file_systems)
            try:
                client = skein.Client.from_global_daemon()
            except skein.exceptions.DaemonNotRunningError:
                client = skein.Client()

            task_instances = [(task_type, spec.instances)
                              for task_type, spec in task_specs.items()]
            events: Dict[str, Dict[str, str]] = \
                {task: {} for task in iter_tasks(task_instances)}
            app = client.submit_and_connect(spec)

            # Start a thread which collects all events posted by all tasks in kv store
            event_listener = Thread(target=_aggregate_events,
                                    args=(app.kv, events))
            event_listener.start()

            cluster_spec = _setup_cluster_tasks(task_instances, app)

            return SkeinCluster(client, app, task_instances, cluster_spec,
                                event_listener, events)
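The try/except around skein.Client.from_global_daemon() near the end of this example is a reuse-or-create pattern worth isolating; a minimal sketch using the same API names as the snippet (which appears to target an older skein release; newer versions renamed the global "daemon" to a "driver"):

import skein

def get_or_create_client() -> skein.Client:
    # Reuse a globally running skein daemon if one exists, else start a fresh one.
    try:
        return skein.Client.from_global_daemon()
    except skein.exceptions.DaemonNotRunningError:
        return skein.Client()

client = get_or_create_client()
print(len(client.get_applications()))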
Example #30
def client(security, kinit):
    with skein.Client(security=security) as client:
        yield client