Example #1
    def __init__(self,
                 local_client_n_workers,
                 local_client_threads_per_worker,
                 yarn_client_n_workers,
                 yarn_client_worker_vcores,
                 yarn_client_worker_memory,
                 verbose=False):

        host_ip = get_host_ip_address()

        self.local_cluster = LocalCluster(
            n_workers=local_client_n_workers,
            threads_per_worker=local_client_threads_per_worker,
            processes=True,
            host=host_ip)
        self.local_client = Client(address=self.local_cluster, timeout='2s')

        self.yarn_cluster = YarnCluster(
            n_workers=yarn_client_n_workers,
            worker_vcores=yarn_client_worker_vcores,
            worker_memory=yarn_client_worker_memory,
            environment="python:///usr/bin/python3")
        self.yarn_client = Client(self.yarn_cluster)

        self.wait_container_resource_alloc()

        self.local_client_n_workers = local_client_n_workers
        self.yarn_client_n_workers = yarn_client_n_workers

        self.task_counter = -1

        self.verbose = verbose
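
This fragment is the constructor of the DualClientFuture class shown in full in Example #22; it assumes a few imports and a project-local get_host_ip_address helper. A minimal sketch of those assumptions (not from the original project):

import socket
from dask.distributed import Client, LocalCluster
from dask_yarn import YarnCluster


def get_host_ip_address():
    # Hypothetical helper: resolve this host's primary IP address
    return socket.gethostbyname(socket.gethostname())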
Example #2
def test_from_application_id(skein_client, conda_env):
    with YarnCluster(
            environment=conda_env,
            worker_memory="512 MiB",
            scheduler_memory="512 MiB",
            name="test-from-application-id",
            skein_client=skein_client,
    ) as cluster:

        # Connect to the application with the application id
        cluster2 = YarnCluster.from_application_id(cluster.app_id,
                                                   skein_client)

        cluster2.scale(1)

        start = time.time()
        while len(cluster2.workers()) != 1:
            time.sleep(0.1)
            assert time.time() < start + 30, "timeout cluster.scale(1)"

        del cluster2

        # Cluster is still running, finalizer not run in cluster2
        assert len(cluster.workers()) == 1

    check_is_shutdown(skein_client, cluster.app_id)
Example #3
def test_from_current(skein_client, conda_env, monkeypatch, tmpdir):
    # Not running in a container
    with pytest.raises(ValueError) as exc:
        YarnCluster.from_current()
    assert str(exc.value) == "Not running inside a container"

    with YarnCluster(
            environment=conda_env,
            worker_memory="512 MiB",
            scheduler_memory="512 MiB",
            name="test-from-current",
            skein_client=skein_client,
    ) as cluster:

        # Patch environment so it looks like a container
        container_id = "container_1526134340424_0012_01_000005"
        cont_dir = tmpdir.mkdir(container_id)
        with open(str(cont_dir.join(".skein.crt")), "wb") as fil:
            fil.write(skein_client.security._get_bytes("cert"))
        with open(str(cont_dir.join(".skein.pem")), "wb") as fil:
            fil.write(skein_client.security._get_bytes("key"))

        for key, val in [
            ("SKEIN_APPLICATION_ID", cluster.app_id),
            ("CONTAINER_ID", container_id),
            ("SKEIN_APPMASTER_ADDRESS", cluster.application_client.address),
            ("LOCAL_DIRS", str(tmpdir)),
        ]:
            monkeypatch.setenv(key, val)

        import skein.core

        monkeypatch.setattr(skein.core, "properties", skein.core.Properties())

        cluster2 = YarnCluster.from_current()
        assert cluster2.app_id == cluster.app_id
        assert cluster2.scheduler_address == cluster.scheduler_address

        # Smoketest method
        cluster2.scale(1)

        start = time.time()
        while len(cluster2.workers()) != 1:
            time.sleep(0.1)
            assert time.time() < start + 30, "timeout cluster.scale(1)"

        del cluster2

        # Cluster is still running, finalizer not run in cluster2
        assert len(cluster.workers()) == 1

    check_is_shutdown(skein_client, cluster.app_id)
Example #4
def test_dask_yarn():
    try:
        from dask_yarn import YarnCluster
    except ImportError:
        return

    # Validate dask_yarn configuration
    cluster = YarnCluster()
    client = Client(cluster)

    cluster.scale(4)
    x = da.sum(np.ones(5))
    x.compute()
Example #5
def test_from_specification_errors():
    bad_spec = skein.ApplicationSpec.from_yaml("""
        name: bad_spec
        services:
          bad:
            resources:
              memory: 1 GiB
              vcores: 1
            script: exit 1
        """)
    with pytest.raises(ValueError):
        YarnCluster.from_specification(bad_spec)

    with pytest.raises(TypeError):
        YarnCluster.from_specification(object())
Example #6
def test_widget_and_html_reprs(skein_client, conda_env):
    pytest.importorskip("ipywidgets")

    with YarnCluster(
            environment=conda_env,
            deploy_mode="local",
            worker_memory="256 MiB",
            name="test-widget",
            skein_client=skein_client,
    ) as cluster:
        # Smoke test widget
        cluster._widget()

        # Test non-widget html repr
        assert cluster.app_id in cluster._repr_html_()

        assert "0" in cluster._widget_status()

        # Scale up and wait
        cluster.scale(1)
        start = time.time()
        while len(cluster._observed) != 1:
            time.sleep(0.1)
            assert time.time() < start + 30, "timeout cluster.scale(1)"

        assert "1" in cluster._widget_status()

        # Scale down
        cluster.scale(1)

    check_is_shutdown(skein_client, cluster.app_id)
Example #7
def test_logs(conda_env, skein_client):
    with YarnCluster(
            environment=conda_env,
            deploy_mode="local",
            worker_memory="256 MiB",
            name="test-widget",
            skein_client=skein_client,
    ) as cluster:
        cluster.scale(2)
        start = time.time()
        while len(cluster._observed) != 2:
            time.sleep(0.1)
            assert time.time() < start + 30, "timeout cluster.scale(2)"

        logs = cluster.logs()
        assert len(logs) == 3

        logs = cluster.logs(scheduler=True, workers=False)
        assert len(logs) == 1

        logs = cluster.logs(scheduler=False, workers=False)
        assert len(logs) == 0

        logs = cluster.logs(scheduler=False, workers=True)
        assert len(logs) == 2

    check_is_shutdown(skein_client, cluster.app_id)
Example #8
def test_basic(skein_client, conda_env):
    with YarnCluster(environment=conda_env,
                     worker_memory='512 MiB',
                     scheduler_memory='512 MiB',
                     name='test-basic',
                     skein_client=skein_client) as cluster:
        # Smoketest repr
        repr(cluster)

        # Scale up
        cluster.scale(2)

        with Client(cluster) as client:
            future = client.submit(inc, 10)
            assert future.result() == 11
            client.get_versions(check=True)

        # Check that 2 workers exist
        start = time.time()
        while len(cluster.workers()) != 2:
            time.sleep(0.1)
            assert time.time() < start + 5, "timeout cluster.scale(2)"

        # Scale down
        cluster.scale(1)

        start = time.time()
        while len(cluster.workers()) != 1:
            time.sleep(0.1)
            assert time.time() < start + 5, "timeout cluster.scale(1)"

    check_is_shutdown(skein_client, cluster.app_id)
Example #9
def test_basic(deploy_mode, skein_client, conda_env):
    with YarnCluster(
            environment=conda_env,
            deploy_mode=deploy_mode,
            worker_memory="512 MiB",
            scheduler_memory="512 MiB",
            name="test-basic",
            skein_client=skein_client,
            dashboard_address=":8787",
            port=8786,
            worker_options={"resources": {
                "FOO": "BAZ"
            }},
            worker_class="dask.distributed.Nanny",
    ) as cluster:
        # Smoketest repr
        repr(cluster)

        if bokeh_installed:
            assert cluster.dashboard_link is not None
            if deploy_mode == "local":
                assert ":8787" in cluster.dashboard_link

        if deploy_mode == "local":
            assert ":8786" in cluster.scheduler_address

        # Scale up
        cluster.scale(2)

        with Client(cluster) as client:
            future = client.submit(inc, 10)
            assert future.result() == 11
            client.get_versions(check=True)
            resource_tags = client.run(
                lambda dask_worker: dask_worker.total_resources)
            assert {"FOO": "BAZ"} in list(resource_tags.values())

        # Check that 2 workers exist
        start = time.time()
        while len(cluster.workers()) != 2:
            time.sleep(0.1)
            assert time.time() < start + 30, "timeout cluster.scale(2)"

        # Scale down
        cluster.scale(1)

        start = time.time()
        while len(cluster.workers()) != 1:
            time.sleep(0.1)
            assert time.time() < start + 30, "timeout cluster.scale(1)"

    check_is_shutdown(skein_client, cluster.app_id)
Example #10
def do_postprocessing(s3_bucket, s3_bucket_prefix):

    fs = S3FileSystem()
    with fs.open(f'{s3_bucket}/{s3_bucket_prefix}/config.json', 'r') as f:
        cfg = json.load(f)

    ec2 = boto3.client('ec2')

    with open('/mnt/var/lib/info/job-flow.json', 'r') as f:
        job_flow_info = json.load(f)

    for instance_group in job_flow_info['instanceGroups']:
        if instance_group['instanceRole'].lower() == 'core':
            instance_type = instance_group['instanceType']
            instance_count = instance_group['requestedInstanceCount']

    instance_info = ec2.describe_instance_types(InstanceTypes=[instance_type])

    dask_worker_vcores = cfg['aws'].get('emr', {}).get('dask_worker_vcores', 2)
    instance_memory = instance_info['InstanceTypes'][0]['MemoryInfo'][
        'SizeInMiB']
    instance_ncpus = instance_info['InstanceTypes'][0]['VCpuInfo'][
        'DefaultVCpus']
    n_dask_workers = instance_count * instance_ncpus // dask_worker_vcores
    worker_memory = round(instance_memory / instance_ncpus *
                          dask_worker_vcores * 0.95)

    cluster = YarnCluster(deploy_mode='local',
                          worker_vcores=dask_worker_vcores,
                          worker_memory='{} MiB'.format(worker_memory),
                          n_workers=n_dask_workers)

    client = Client(cluster)  # noqa: F841

    results_s3_loc = f'{s3_bucket}/{s3_bucket_prefix}/results'

    combine_results(fs, results_s3_loc, cfg)

    aws_conf = cfg.get('postprocessing', {}).get('aws', {})
    if 'athena' in aws_conf:
        tbl_prefix = s3_bucket_prefix.split('/')[-1]
        if not tbl_prefix:
            tbl_prefix = cfg['aws']['job_identifier']
        create_athena_tables(aws_conf, tbl_prefix, s3_bucket,
                             f'{s3_bucket_prefix}/results/parquet')

    keep_individual_timeseries = cfg.get('postprocessing',
                                         {}).get('keep_individual_timeseries',
                                                 False)
    remove_intermediate_files(fs, results_s3_loc, keep_individual_timeseries)
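
As a worked example of the sizing arithmetic above, assuming four m5.2xlarge core nodes (8 vCPUs and 32768 MiB of memory each) and a dask_worker_vcores setting of 2:

# Worked example of the sizing logic above; the instance figures assume m5.2xlarge core nodes
instance_count, instance_ncpus, instance_memory = 4, 8, 32768
dask_worker_vcores = 2
n_dask_workers = instance_count * instance_ncpus // dask_worker_vcores               # 16 workers
worker_memory = round(instance_memory / instance_ncpus * dask_worker_vcores * 0.95)  # 7782 MiB per worker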
Example #11
def test_from_specification(skein_client, conda_env, tmpdir, loop):
    spec = _make_specification(environment=conda_env,
                               worker_memory='512 MB',
                               scheduler_memory='512 MB',
                               name=APPNAME)
    fn = os.path.join(str(tmpdir), 'spec.yaml')
    with open(fn, 'w') as f:
        f.write(spec.to_yaml())

    with YarnCluster.from_specification(fn, skein_client=skein_client) as cluster:
        with Client(cluster, loop=loop):
            pass

    check_is_shutdown(skein_client, cluster.app_id)
Example #12
def test_from_specification(skein_client, conda_env, tmpdir):
    spec = _make_specification(
        environment=conda_env,
        worker_memory="512 MiB",
        scheduler_memory="512 MiB",
        name="dask-yarn-test-from-specification",
    )
    fn = os.path.join(str(tmpdir), "spec.yaml")
    with open(fn, "w") as f:
        f.write(spec.to_yaml())

    with YarnCluster.from_specification(fn,
                                        skein_client=skein_client) as cluster:
        with Client(cluster):
            pass

    check_is_shutdown(skein_client, cluster.app_id)
Example #13
def test_basic(deploy_mode, skein_client, conda_env):
    with YarnCluster(
            environment=conda_env,
            deploy_mode=deploy_mode,
            worker_memory="512 MiB",
            scheduler_memory="512 MiB",
            name="test-basic",
            skein_client=skein_client,
            dashboard_address=":8787",
            port=8786,
    ) as cluster:
        # Smoketest repr
        repr(cluster)

        if bokeh_installed:
            assert cluster.dashboard_link is not None
            if deploy_mode == "local":
                assert ":8787" in cluster.dashboard_link

        if deploy_mode == "local":
            assert ":8786" in cluster.scheduler_address

        # Scale up
        cluster.scale(2)

        with Client(cluster) as client:
            future = client.submit(inc, 10)
            assert future.result() == 11
            client.get_versions(check=True)

        # Check that 2 workers exist
        start = time.time()
        while len(cluster.workers()) != 2:
            time.sleep(0.1)
            assert time.time() < start + 30, "timeout cluster.scale(2)"

        # Scale down
        cluster.scale(1)

        start = time.time()
        while len(cluster.workers()) != 1:
            time.sleep(0.1)
            assert time.time() < start + 30, "timeout cluster.scale(1)"

    check_is_shutdown(skein_client, cluster.app_id)
Example #14
def test_basic(skein_client, conda_env, loop):
    with YarnCluster(environment=conda_env,
                     worker_memory='512 MB',
                     scheduler_memory='512 MB',
                     name=APPNAME,
                     skein_client=skein_client) as cluster:
        cluster.scale(2)
        with Client(cluster, loop=loop) as client:
            future = client.submit(inc, 10)
            assert future.result() == 11

            start = time.time()
            while len(client.scheduler_info()['workers']) < 2:
                time.sleep(0.1)
                assert time.time() < start + 5

            client.get_versions(check=True)

    check_is_shutdown(skein_client, cluster.app_id)
Example #15
def test_adapt(skein_client, conda_env):
    with YarnCluster(
            environment=conda_env,
            deploy_mode="local",
            worker_memory="256 MiB",
            name="test-adapt",
            skein_client=skein_client,
    ) as cluster:

        cluster.adapt()

        with Client(cluster) as client:
            future = client.submit(lambda x: x + 1, 10)
            result = future.result()
            assert result == 11

        start = time.time()
        while cluster.workers():
            time.sleep(0.1)
            assert time.time() < start + 30, "auto-scaledown timeout"

    check_is_shutdown(skein_client, cluster.app_id)
Example #16
async def test_basic_async(deploy_mode, skein_client, conda_env):
    async with YarnCluster(
            environment=conda_env,
            deploy_mode=deploy_mode,
            worker_memory="512 MiB",
            scheduler_memory="512 MiB",
            name="test-basic-async",
            skein_client=skein_client,
            asynchronous=True,
    ) as cluster:
        # Smoketest repr
        repr(cluster)

        if bokeh_installed:
            assert cluster.dashboard_link is not None

        # Scale up
        await cluster.scale(2)

        async with Client(cluster, asynchronous=True) as client:
            result = await client.submit(inc, 10)
            assert result == 11
            await client.get_versions(check=True)

        # Check that 2 workers exist
        start = time.time()
        while len(await cluster.workers()) != 2:
            await asyncio.sleep(0.1)
            assert time.time() < start + 30, "timeout cluster.scale(2)"

        # Scale down
        await cluster.scale(1)

        start = time.time()
        while len(await cluster.workers()) != 1:
            await asyncio.sleep(0.1)
            assert time.time() < start + 30, "timeout cluster.scale(1)"

    check_is_shutdown(skein_client, cluster.app_id)
Example #17
    def execute(pipeline_context, execution_plan):
        check.inst_param(pipeline_context, 'pipeline_context',
                         SystemPipelineExecutionContext)
        check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

        dask_config = pipeline_context.executor_config

        check.param_invariant(
            isinstance(pipeline_context.executor_config, DaskConfig),
            'pipeline_context',
            'Expected executor_config to be DaskConfig got {}'.format(
                pipeline_context.executor_config),
        )

        # Checks to ensure storage is compatible with Dask configuration
        storage = pipeline_context.environment_dict.get('storage')
        check.invariant(storage.keys(),
                        'Must specify storage to use Dask execution')

        check.invariant(
            pipeline_context.instance.is_persistent,
            'Dask execution requires a persistent DagsterInstance',
        )

        # https://github.com/dagster-io/dagster/issues/2440
        check.invariant(
            pipeline_context.system_storage_def.is_persistent,
            'Cannot use in-memory storage with Dask, use filesystem, S3, or GCS',
        )

        step_levels = execution_plan.execution_step_levels()

        pipeline_name = pipeline_context.pipeline_def.name

        instance = pipeline_context.instance

        cluster_type = dask_config.cluster_type
        if cluster_type == 'local':
            from dask.distributed import LocalCluster

            cluster = LocalCluster(**dask_config.build_dict(pipeline_name))
        elif cluster_type == 'yarn':
            from dask_yarn import YarnCluster

            cluster = YarnCluster(**dask_config.build_dict(pipeline_name))
        elif cluster_type == 'ssh':
            from dask.distributed import SSHCluster

            cluster = SSHCluster(**dask_config.build_dict(pipeline_name))
        elif cluster_type == 'pbs':
            from dask_jobqueue import PBSCluster

            cluster = PBSCluster(**dask_config.build_dict(pipeline_name))
        elif cluster_type == 'kube':
            from dask_kubernetes import KubeCluster

            cluster = KubeCluster(**dask_config.build_dict(pipeline_name))
        else:
            raise ValueError(
                f"Must be providing one of the following ('local', 'yarn', 'ssh', 'pbs', 'kube') not {cluster_type}"
            )

        with dask.distributed.Client(cluster) as client:
            execution_futures = []
            execution_futures_dict = {}

            for step_level in step_levels:
                for step in step_level:
                    # We ensure correctness in sequencing by letting Dask schedule futures and
                    # awaiting dependencies within each step.
                    dependencies = []
                    for step_input in step.step_inputs:
                        for key in step_input.dependency_keys:
                            dependencies.append(execution_futures_dict[key])

                    environment_dict = dict(pipeline_context.environment_dict,
                                            execution={'in_process': {}})
                    variables = {
                        'executionParams': {
                            'selector': {
                                'name': pipeline_name
                            },
                            'runConfigData': environment_dict,
                            'mode': pipeline_context.mode_def.name,
                            'executionMetadata': {
                                'runId': pipeline_context.pipeline_run.run_id
                            },
                            'stepKeys': [step.key],
                        }
                    }

                    dask_task_name = '%s.%s' % (pipeline_name, step.key)

                    future = client.submit(
                        query_on_dask_worker,
                        pipeline_context.pipeline.
                        get_reconstructable_repository(),
                        variables,
                        dependencies,
                        instance.get_ref(),
                        key=dask_task_name,
                        resources=get_dask_resource_requirements(step.tags),
                    )

                    execution_futures.append(future)
                    execution_futures_dict[step.key] = future

            # This tells Dask to await the step executions and retrieve their results on the
            # master
            for future in dask.distributed.as_completed(execution_futures):
                for step_event in future.result():
                    check.inst(step_event, DagsterEvent)

                    yield step_event
Example #18
        tf_addrs[i] = "%s:%d" % (tf_ip, ports[tf_ip])
    tf_spec = {'ps': tf_addrs[0:nps], 'worker': tf_addrs[nps:n_tf_servers]}
    return tf_spec, dask_spec


if __name__ == "__main__":
    mdl_args_str = sys.argv[1]
    env_pack_path = sys.argv[2]
    cluster_mode = 'local'  # default cluster mode is local
    if len(sys.argv) >= 4 and sys.argv[3] == 'remote':
        cluster_mode = 'remote'

    # create dask cluster from yarn or local
    cluster = YarnCluster(environment=env_pack_path,
                          n_workers=2,
                          worker_vcores=1,
                          worker_memory='500MiB',
                          deploy_mode=cluster_mode)
    time.sleep(3)  # wait a while for cluster setup to finish

    # create spec for tf servers
    nps, nworker = 1, 1
    tf_spec, dask_spec = create_spec(cluster, nps=nps, nworker=nworker)
    print(tf_spec, dask_spec)
    # create client for cluster to submit job
    client = Client(cluster)

    # submit job
    ps_jobs = [
        client.submit(tf_job,
                      tf_spec,
Example #19
from dask_yarn import YarnCluster
from dask.distributed import Client
import dask.array as da

import numpy as np

cluster = YarnCluster()
client = Client(cluster)

x = da.sum(np.ones(5))
x.compute()
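
The quickstart above leaves the YARN application running. A minimal follow-on sketch, reusing the cluster and client objects created above, that releases the application once the work is done:

# Release the Dask workers and the YARN application when finished
client.close()
cluster.shutdown()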
Example #20
# To add a new cell, type '# %%'
# To add a new markdown cell, type '# %% [markdown]'
# %%
import dask.dataframe as dd
import pandas as pd
from datetime import datetime
import numpy as np

# Create a client for the visualization tools
from dask.distributed import Client, progress
from dask_yarn import YarnCluster

cluster = YarnCluster()
client = Client(cluster)

cluster.adapt()

client

# %% [markdown]
# # Loading the datasets
# %% [markdown]
# ## YouGov - Wearing Mask in public

# %%
start = datetime.now()

## Load the dataset
df = dd.read_csv("gs://dask-vs-koalas/wearing_face_mask_public.csv", sep=";")

## Transform the dataset: one row per date/country
Example #21
from dask_yarn import YarnCluster
from dask.distributed import Client
import dask.dataframe as dd
import gcsfs
# Create a cluster where each worker has two cores and 3 GB of memory
cluster = YarnCluster(environment='environnement.tar.gz',
                      worker_vcores=2,
                      worker_memory="3GB")
# Scale out to two such workers
cluster.scale(2)
# Connect to the cluster
client = Client(cluster)

df = dd.read_parquet('hdfs:///raw/sample.parquet', engine="pyarrow")
df = df.persist()
print(df)



# gcloud compute ssh hadoop-formation-m --project=after-yesterday-217007 -- -L 1080:hadoop-formation-m:8088 -N -n
Example #22
class DualClientFuture():
    def __init__(self,
                 local_client_n_workers,
                 local_client_threads_per_worker,
                 yarn_client_n_workers,
                 yarn_client_worker_vcores,
                 yarn_client_worker_memory,
                 verbose=False):

        host_ip = get_host_ip_address()

        self.local_cluster = LocalCluster(
            n_workers=local_client_n_workers,
            threads_per_worker=local_client_threads_per_worker,
            processes=True,
            host=host_ip)
        self.local_client = Client(address=self.local_cluster, timeout='2s')

        self.yarn_cluster = YarnCluster(
            n_workers=yarn_client_n_workers,
            worker_vcores=yarn_client_worker_vcores,
            worker_memory=yarn_client_worker_memory,
            environment="python:///usr/bin/python3")
        self.yarn_client = Client(self.yarn_cluster)

        self.wait_container_resource_alloc()

        self.local_client_n_workers = local_client_n_workers
        self.yarn_client_n_workers = yarn_client_n_workers

        self.task_counter = -1

        self.verbose = verbose

    def wait_container_resource_alloc(self):

        while True:

            waiting_containers = [
                yarn_container_obj
                for yarn_container_obj in self.yarn_cluster.workers()
                if str(yarn_container_obj.state) == 'WAITING'
            ]

            if len(waiting_containers) == 0:
                break

            time.sleep(1.0)

    def submit(self, func, *args, **kwargs):

        if self.verbose:
            print('total n workers: {}'.format(self.local_client_n_workers +
                                               self.yarn_client_n_workers))

        self.task_counter += 1

        remainder = self.task_counter % (self.local_client_n_workers +
                                         self.yarn_client_n_workers)

        # if remainder <= (self.local_client_n_workers-1):

        #     if self.verbose==True:
        #         print('remainder: {}, n_local_worker: {}, running on local'.format(remainder, self.local_client_n_workers))

        #     future = self.local_client.submit(func, *args, **kwargs)
        # else:

        #     if self.verbose==True:
        #         print('remainder: {}, n_local_worker: {}, running on remote'.format(remainder, self.local_client_n_workers))

        #     func = yarn_directory_normalizer(func)
        #     future = self.yarn_client.submit(func, None, *args, **kwargs)

        if remainder <= (self.yarn_client_n_workers - 1):

            if self.verbose:
                print('remainder: {}, n_local_worker: {}, running on remote'.
                      format(remainder, self.local_client_n_workers))

            func = yarn_directory_normalizer(func)
            future = self.yarn_client.submit(func, None, *args, **kwargs)

        else:

            if self.verbose:
                print('remainder: {}, n_local_worker: {}, running on local'.
                      format(remainder, self.local_client_n_workers))

            future = self.local_client.submit(func, *args, **kwargs)

        return future.result()

    def get_worker_ip_addresses(self):

        while True:
            yarn_container_objects = self.yarn_cluster.workers()
            if len(yarn_container_objects) == self.yarn_client_n_workers:
                break
            time.sleep(0.1)

        ip_addrs = set()

        for yarn_container_object in yarn_container_objects:
            # Derive the worker IP from the node's HTTP address, e.g.
            # 'ip-172-31-2-33.ec2.internal:8042' -> '172.31.2.33'
            # (assumes EC2-style 'ip-a-b-c-d' hostnames)
            ip_addrs.add(
                yarn_container_object.yarn_node_http_address.split('.')
                [0].replace('-', '.')[3:])

        return list(ip_addrs)

    def submit_per_node(self, func, *args, **kwargs):

        func = yarn_directory_normalizer(func)

        ip_addrs = self.get_worker_ip_addresses()

        futures = list()

        for ip_addr in ip_addrs:
            futures.append(
                self.yarn_client.submit(func,
                                        ip_addr,
                                        *args,
                                        **kwargs,
                                        workers=ip_addr))

        return self.yarn_client.gather(futures)

    def get_dashboard_link(self):

        print('local cluster: ', self.local_cluster.dashboard_link)
        print('yarn cluster:  ', self.yarn_cluster.dashboard_link)
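
A hypothetical construction sketch for the DualClientFuture class above; the parameter values are illustrative only and not taken from the original project:

# Illustrative instantiation of DualClientFuture (values are placeholders)
dual = DualClientFuture(local_client_n_workers=2,
                        local_client_threads_per_worker=1,
                        yarn_client_n_workers=4,
                        yarn_client_worker_vcores=2,
                        yarn_client_worker_memory='4GiB',
                        verbose=True)
dual.get_dashboard_link()  # prints the local and YARN dashboard links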
Example #23
import time
from dask_yarn import YarnCluster
from dask.distributed import Client

# Create a cluster
cluster = YarnCluster()

# Connect to the cluster
# client = Client(cluster)

cluster.scale(2)

time.sleep(100000)
Example #24
    def execute(self, pipeline_context, execution_plan):
        check.inst_param(pipeline_context, "pipeline_context",
                         SystemPipelineExecutionContext)
        check.inst_param(execution_plan, "execution_plan", ExecutionPlan)
        check.param_invariant(
            isinstance(pipeline_context.executor, DaskExecutor),
            "pipeline_context",
            "Expected executor to be DaskExecutor got {}".format(
                pipeline_context.executor),
        )

        check.invariant(
            pipeline_context.instance.is_persistent,
            "Dask execution requires a persistent DagsterInstance",
        )

        step_levels = execution_plan.execution_step_levels()

        pipeline_name = pipeline_context.pipeline_def.name

        instance = pipeline_context.instance

        cluster_type = self.cluster_type
        if cluster_type == "local":
            from dask.distributed import LocalCluster

            cluster = LocalCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "yarn":
            from dask_yarn import YarnCluster

            cluster = YarnCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "ssh":
            from dask.distributed import SSHCluster

            cluster = SSHCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "pbs":
            from dask_jobqueue import PBSCluster

            cluster = PBSCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "moab":
            from dask_jobqueue import MoabCluster

            cluster = MoabCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "sge":
            from dask_jobqueue import SGECluster

            cluster = SGECluster(**self.build_dict(pipeline_name))
        elif cluster_type == "lsf":
            from dask_jobqueue import LSFCluster

            cluster = LSFCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "slurm":
            from dask_jobqueue import SLURMCluster

            cluster = SLURMCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "oar":
            from dask_jobqueue import OARCluster

            cluster = OARCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "kube":
            from dask_kubernetes import KubeCluster

            cluster = KubeCluster(**self.build_dict(pipeline_name))
        else:
            raise ValueError(
                f"Must be providing one of the following ('local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube') not {cluster_type}"
            )

        with dask.distributed.Client(cluster) as client:
            execution_futures = []
            execution_futures_dict = {}

            for step_level in step_levels:
                for step in step_level:
                    # We ensure correctness in sequencing by letting Dask schedule futures and
                    # awaiting dependencies within each step.
                    dependencies = []
                    for step_input in step.step_inputs:
                        for key in step_input.dependency_keys:
                            dependencies.append(execution_futures_dict[key])

                    run_config = dict(pipeline_context.run_config,
                                      execution={"in_process": {}})
                    recon_repo = pipeline_context.pipeline.get_reconstructable_repository(
                    )

                    dask_task_name = "%s.%s" % (pipeline_name, step.key)

                    recon_pipeline = recon_repo.get_reconstructable_pipeline(
                        pipeline_name)

                    future = client.submit(
                        query_on_dask_worker,
                        dependencies,
                        recon_pipeline,
                        pipeline_context.pipeline_run,
                        run_config,
                        [step.key],
                        pipeline_context.mode_def.name,
                        instance.get_ref(),
                        key=dask_task_name,
                        resources=get_dask_resource_requirements(step.tags),
                    )

                    execution_futures.append(future)
                    execution_futures_dict[step.key] = future

            # This tells Dask to await the step executions and retrieve their results on the
            # master
            futures = dask.distributed.as_completed(execution_futures,
                                                    with_results=True)

            # Allow interrupts while waiting for the results from Dask
            for future, result in iterate_with_context(
                    raise_interrupts_immediately, futures):
                for step_event in result:
                    check.inst(step_event, DagsterEvent)
                    yield step_event
Example #25
    def execute(self, pipeline_context, execution_plan):
        check.inst_param(pipeline_context, "pipeline_context", SystemPipelineExecutionContext)
        check.inst_param(execution_plan, "execution_plan", ExecutionPlan)
        check.param_invariant(
            isinstance(pipeline_context.executor, DaskExecutor),
            "pipeline_context",
            "Expected executor to be DaskExecutor got {}".format(pipeline_context.executor),
        )

        # Checks to ensure storage is compatible with Dask configuration
        storage = pipeline_context.run_config.get("storage")
        check.invariant(storage.keys(), "Must specify storage to use Dask execution")

        check.invariant(
            pipeline_context.instance.is_persistent,
            "Dask execution requires a persistent DagsterInstance",
        )

        # https://github.com/dagster-io/dagster/issues/2440
        check.invariant(
            pipeline_context.system_storage_def.is_persistent,
            "Cannot use in-memory storage with Dask, use filesystem, S3, or GCS",
        )

        step_levels = execution_plan.execution_step_levels()

        pipeline_name = pipeline_context.pipeline_def.name

        instance = pipeline_context.instance

        cluster_type = self.cluster_type
        if cluster_type == "local":
            from dask.distributed import LocalCluster

            cluster = LocalCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "yarn":
            from dask_yarn import YarnCluster

            cluster = YarnCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "ssh":
            from dask.distributed import SSHCluster

            cluster = SSHCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "pbs":
            from dask_jobqueue import PBSCluster

            cluster = PBSCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "moab":
            from dask_jobqueue import MoabCluster

            cluster = MoabCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "sge":
            from dask_jobqueue import SGECluster

            cluster = SGECluster(**self.build_dict(pipeline_name))
        elif cluster_type == "lsf":
            from dask_jobqueue import LSFCluster

            cluster = LSFCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "slurm":
            from dask_jobqueue import SLURMCluster

            cluster = SLURMCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "oar":
            from dask_jobqueue import OARCluster

            cluster = OARCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "kube":
            from dask_kubernetes import KubeCluster

            cluster = KubeCluster(**self.build_dict(pipeline_name))
        else:
            raise ValueError(
                f"Must be providing one of the following ('local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube') not {cluster_type}"
            )

        with dask.distributed.Client(cluster) as client:
            execution_futures = []
            execution_futures_dict = {}

            for step_level in step_levels:
                for step in step_level:
                    # We ensure correctness in sequencing by letting Dask schedule futures and
                    # awaiting dependencies within each step.
                    dependencies = []
                    for step_input in step.step_inputs:
                        for key in step_input.dependency_keys:
                            dependencies.append(execution_futures_dict[key])

                    run_config = dict(pipeline_context.run_config, execution={"in_process": {}})
                    recon_repo = pipeline_context.pipeline.get_reconstructable_repository()
                    variables = {
                        "executionParams": {
                            "selector": {
                                "pipelineName": pipeline_name,
                                "repositoryName": recon_repo.get_definition().name,
                                "repositoryLocationName": "<<in_process>>",
                            },
                            "runConfigData": run_config,
                            "mode": pipeline_context.mode_def.name,
                            "executionMetadata": {"runId": pipeline_context.pipeline_run.run_id},
                            "stepKeys": [step.key],
                        }
                    }

                    dask_task_name = "%s.%s" % (pipeline_name, step.key)

                    workspace = create_in_process_ephemeral_workspace(
                        pointer=pipeline_context.pipeline.get_reconstructable_repository().pointer
                    )

                    future = client.submit(
                        query_on_dask_worker,
                        workspace,
                        variables,
                        dependencies,
                        instance.get_ref(),
                        key=dask_task_name,
                        resources=get_dask_resource_requirements(step.tags),
                    )

                    execution_futures.append(future)
                    execution_futures_dict[step.key] = future

            # This tells Dask to await the step executions and retrieve their results on the
            # master
            for future in dask.distributed.as_completed(execution_futures):
                for step_event in future.result():
                    check.inst(step_event, DagsterEvent)

                    yield step_event