Exemple #1
0
    def execute(self, pipeline_context, execution_plan):
        check.inst_param(pipeline_context, 'pipeline_context',
                         SystemPipelineExecutionContext)
        check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
        check.param_invariant(
            isinstance(pipeline_context.executor, DaskExecutor),
            'pipeline_context',
            'Expected executor to be DaskExecutor got {}'.format(
                pipeline_context.executor),
        )

        # Checks to ensure storage is compatible with Dask configuration
        storage = pipeline_context.run_config.get('storage')
        check.invariant(storage.keys(),
                        'Must specify storage to use Dask execution')

        check.invariant(
            pipeline_context.instance.is_persistent,
            'Dask execution requires a persistent DagsterInstance',
        )

        # https://github.com/dagster-io/dagster/issues/2440
        check.invariant(
            pipeline_context.system_storage_def.is_persistent,
            'Cannot use in-memory storage with Dask, use filesystem, S3, or GCS',
        )

        step_levels = execution_plan.execution_step_levels()

        pipeline_name = pipeline_context.pipeline_def.name

        instance = pipeline_context.instance

        cluster_type = self.cluster_type
        if cluster_type == 'local':
            from dask.distributed import LocalCluster

            cluster = LocalCluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'yarn':
            from dask_yarn import YarnCluster

            cluster = YarnCluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'ssh':
            from dask.distributed import SSHCluster

            cluster = SSHCluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'pbs':
            from dask_jobqueue import PBSCluster

            cluster = PBSCluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'moab':
            from dask_jobqueue import MoabCluster

            cluster = MoabCluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'sge':
            from dask_jobqueue import SGECluster

            cluster = SGECluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'lsf':
            from dask_jobqueue import LSFCluster

            cluster = LSFCluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'slurm':
            from dask_jobqueue import SLURMCluster

            cluster = SLURMCluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'oar':
            from dask_jobqueue import OARCluster

            cluster = OARCluster(**self.build_dict(pipeline_name))
        elif cluster_type == 'kube':
            from dask_kubernetes import KubeCluster

            cluster = KubeCluster(**self.build_dict(pipeline_name))
        else:
            raise ValueError(
                f"Must be providing one of the following ('local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube') not {cluster_type}"
            )

        with dask.distributed.Client(cluster) as client:
            execution_futures = []
            execution_futures_dict = {}

            for step_level in step_levels:
                for step in step_level:
                    # We ensure correctness in sequencing by letting Dask schedule futures and
                    # awaiting dependencies within each step.
                    dependencies = []
                    for step_input in step.step_inputs:
                        for key in step_input.dependency_keys:
                            dependencies.append(execution_futures_dict[key])

                    run_config = dict(pipeline_context.run_config,
                                      execution={'in_process': {}})
                    recon_repo = pipeline_context.pipeline.get_reconstructable_repository(
                    )
                    variables = {
                        'executionParams': {
                            'selector': {
                                'pipelineName': pipeline_name,
                                'repositoryName':
                                recon_repo.get_definition().name,
                                'repositoryLocationName': '<<in_process>>',
                            },
                            'runConfigData': run_config,
                            'mode': pipeline_context.mode_def.name,
                            'executionMetadata': {
                                'runId': pipeline_context.pipeline_run.run_id
                            },
                            'stepKeys': [step.key],
                        }
                    }

                    dask_task_name = '%s.%s' % (pipeline_name, step.key)

                    workspace = create_in_process_ephemeral_workspace(
                        pointer=pipeline_context.pipeline.
                        get_reconstructable_repository().pointer)

                    future = client.submit(
                        query_on_dask_worker,
                        workspace,
                        variables,
                        dependencies,
                        instance.get_ref(),
                        key=dask_task_name,
                        resources=get_dask_resource_requirements(step.tags),
                    )

                    execution_futures.append(future)
                    execution_futures_dict[step.key] = future

            # This tells Dask to awaits the step executions and retrieve their results to the
            # master
            for future in dask.distributed.as_completed(execution_futures):
                for step_event in future.result():
                    check.inst(step_event, DagsterEvent)

                    yield step_event
Exemple #2
0
    def execute(self, pipeline_context, execution_plan):
        check.inst_param(pipeline_context, "pipeline_context",
                         SystemPipelineExecutionContext)
        check.inst_param(execution_plan, "execution_plan", ExecutionPlan)
        check.param_invariant(
            isinstance(pipeline_context.executor, DaskExecutor),
            "pipeline_context",
            "Expected executor to be DaskExecutor got {}".format(
                pipeline_context.executor),
        )

        check.invariant(
            pipeline_context.instance.is_persistent,
            "Dask execution requires a persistent DagsterInstance",
        )

        step_levels = execution_plan.execution_step_levels()

        pipeline_name = pipeline_context.pipeline_def.name

        instance = pipeline_context.instance

        cluster_type = self.cluster_type
        if cluster_type == "local":
            from dask.distributed import LocalCluster

            cluster = LocalCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "yarn":
            from dask_yarn import YarnCluster

            cluster = YarnCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "ssh":
            from dask.distributed import SSHCluster

            cluster = SSHCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "pbs":
            from dask_jobqueue import PBSCluster

            cluster = PBSCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "moab":
            from dask_jobqueue import MoabCluster

            cluster = MoabCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "sge":
            from dask_jobqueue import SGECluster

            cluster = SGECluster(**self.build_dict(pipeline_name))
        elif cluster_type == "lsf":
            from dask_jobqueue import LSFCluster

            cluster = LSFCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "slurm":
            from dask_jobqueue import SLURMCluster

            cluster = SLURMCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "oar":
            from dask_jobqueue import OARCluster

            cluster = OARCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "kube":
            from dask_kubernetes import KubeCluster

            cluster = KubeCluster(**self.build_dict(pipeline_name))
        else:
            raise ValueError(
                f"Must be providing one of the following ('local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube') not {cluster_type}"
            )

        with dask.distributed.Client(cluster) as client:
            execution_futures = []
            execution_futures_dict = {}

            for step_level in step_levels:
                for step in step_level:
                    # We ensure correctness in sequencing by letting Dask schedule futures and
                    # awaiting dependencies within each step.
                    dependencies = []
                    for step_input in step.step_inputs:
                        for key in step_input.dependency_keys:
                            dependencies.append(execution_futures_dict[key])

                    run_config = dict(pipeline_context.run_config,
                                      execution={"in_process": {}})
                    recon_repo = pipeline_context.pipeline.get_reconstructable_repository(
                    )

                    dask_task_name = "%s.%s" % (pipeline_name, step.key)

                    recon_pipeline = recon_repo.get_reconstructable_pipeline(
                        pipeline_name)

                    future = client.submit(
                        query_on_dask_worker,
                        dependencies,
                        recon_pipeline,
                        pipeline_context.pipeline_run,
                        run_config,
                        [step.key],
                        pipeline_context.mode_def.name,
                        instance.get_ref(),
                        key=dask_task_name,
                        resources=get_dask_resource_requirements(step.tags),
                    )

                    execution_futures.append(future)
                    execution_futures_dict[step.key] = future

            # This tells Dask to awaits the step executions and retrieve their results to the
            # master
            futures = dask.distributed.as_completed(execution_futures,
                                                    with_results=True)

            # Allow interrupts while waiting for the results from Dask
            for future, result in iterate_with_context(
                    raise_interrupts_immediately, futures):
                for step_event in result:
                    check.inst(step_event, DagsterEvent)
                    yield step_event
def run_dask(options: dict,
        docker_username: str = None,
        docker_password: str = None,
        docker: bool = False,
        slurm_job_array: bool = False):
    try:
        if 'jobqueue' not in options:
            cluster = LocalCluster()
        else:
            jobqueue = options['jobqueue']
            gpus = options['gpus'] if 'gpus' in options else 0
            if 'slurm' in jobqueue:
                print("Requesting SLURM cluster:")
                pprint(jobqueue['slurm'])
                cluster = SLURMCluster(job_extra=[f"--gres=gpu:{gpus}"], **jobqueue['slurm']) if gpus else SLURMCluster(**jobqueue['slurm'])
            elif 'pbs' in jobqueue:
                print("Requesting PBS cluster:")
                pprint(jobqueue['pbs'])
                cluster = PBSCluster(job_extra=[f"--gres=gpu:{gpus}"], **jobqueue['pbs']) if gpus else PBSCluster(**jobqueue['pbs'])
            elif 'moab' in jobqueue:
                print("Requesting MOAB cluster:")
                pprint(jobqueue['moab'])
                cluster = MoabCluster(job_extra=[f"--gres=gpu:{gpus}"], **jobqueue['moab']) if gpus else MoabCluster(**jobqueue['moab'])
            elif 'sge' in jobqueue:
                print("Requesting SGE cluster:")
                pprint(jobqueue['sge'])
                cluster = SGECluster(job_extra=[f"--gres=gpu:{gpus}"], **jobqueue['sge']) if gpus else SGECluster(**jobqueue['sge'])
            elif 'lsf' in jobqueue:
                print("Requesting LSF cluster:")
                pprint(jobqueue['lsf'])
                cluster = LSFCluster(job_extra=[f"--gres=gpu:{gpus}"], **jobqueue['lsf']) if gpus else LSFCluster(**jobqueue['lsf'])
            elif 'oar' in jobqueue:
                print("Requesting OAR cluster:")
                pprint(jobqueue['oar'])
                cluster = OARCluster(job_extra=[f"--gres=gpu:{gpus}"], **jobqueue['oar']) if gpus else OARCluster(**jobqueue['oar'])
            else:
                raise ValueError(f"Unsupported jobqueue configuration: {jobqueue}")

            print(f"Cluster job script: {cluster.job_script()}")

        if 'output' in options and 'from' in options['output']: output_path = options['output']['from']
        else: output_path = '.'

        if 'input' not in options:
            env = options['env'] if 'env' in options else []
            params = options['parameters'] if 'parameters' in options else []
            bind_mounts = options['bind_mounts'] if 'bind_mounts' in options else []
            no_cache = options['no_cache'] if 'no_cache' in options else False
            gpus = options['gpus'] if 'gpus' in options else 0

            if 'jobqueue' in options: cluster.scale(1)
            with Client(cluster) as client:
                command = prep_command(
                    work_dir=options['workdir'],
                    image=options['image'],
                    command=options['command'],
                    env=env + [{'key': 'INDEX', 'value': 1}],
                    parameters=params + [{'key': 'OUTPUT', 'value': output_path}],
                    bind_mounts=bind_mounts,
                    no_cache=no_cache,
                    gpus=gpus,
                    docker_username=docker_username,
                    docker_password=docker_password,
                    docker=docker)

                logger.info(f"Submitting container")
                future = submit_command(client, command, options['log_file'] if 'log_file' in options else None, 3)
                future.result()
                if future.status != 'finished':
                    logger.error(f"Container failed: {future.exception}")
                else:
                    logger.info(f"Container completed")
        elif options['input']['kind'] == InputKind.DIRECTORY:
            input_path = options['input']['path']
            env = options['env'] if 'env' in options else []
            params = options['parameters'] if 'parameters' in options else []
            bind_mounts = options['bind_mounts'] if 'bind_mounts' in options else []
            no_cache = options['no_cache'] if 'no_cache' in options else False
            gpus = options['gpus'] if 'gpus' in options else 0

            if 'jobqueue' in options: cluster.scale(1)
            with Client(cluster) as client:
                command = prep_command(
                    work_dir=options['workdir'],
                    image=options['image'],
                    command=options['command'],
                    env=env + [{'key': 'INDEX', 'value': 1}],
                    parameters=params + [{'key': 'INPUT', 'value': input_path}, {'key': 'OUTPUT', 'value': output_path}],
                    bind_mounts=bind_mounts,
                    no_cache=no_cache,
                    gpus=gpus,
                    docker_username=docker_username,
                    docker_password=docker_password,
                    docker=docker)

                logger.info(f"Submitting container for directory '{input_path}'")
                future = submit_command(client, command, options['log_file'] if 'log_file' in options else None, 3)
                future.result()
                if future.status != 'finished':
                    logger.error(f"Container failed for directory '{input_path}': {future.exception}")
                else:
                    logger.info(f"Container completed for directory '{input_path}'")
        elif options['input']['kind'] == InputKind.FILES:
            input_path = options['input']['path']
            if slurm_job_array:
                files = os.listdir(input_path)
                file_id = int(os.environ.get('SLURM_ARRAY_TASK_ID'))
                current_file = files[file_id]

                env = options['env'] if 'env' in options else []
                params = options['parameters'] if 'parameters' in options else []
                patterns = options['input']['patterns'] if 'patterns' in options['input'] else []
                bind_mounts = options['bind_mounts'] if 'bind_mounts' in options else []
                no_cache = options['no_cache'] if 'no_cache' in options else False
                gpus = options['gpus'] if 'gpus' in options else 0

                if 'jobqueue' in options: cluster.scale(1)
                with Client(cluster) as client:
                    command = prep_command(
                        work_dir=options['workdir'],
                        image=options['image'],
                        command=options['command'],
                        env=env + [{'key': 'INDEX', 'value': file_id}] + [{'key': 'PATTERNS', 'value': ','.join(patterns)}],
                        parameters=params + [{'key': 'INPUT', 'value': join(input_path, current_file)}, {'key': 'OUTPUT', 'value': output_path}],
                        bind_mounts=bind_mounts,
                        no_cache=no_cache,
                        gpus=gpus,
                        docker_username=docker_username,
                        docker_password=docker_password,
                        docker=docker)

                    logger.info(f"Submitting container for file '{input_path}'")
                    future = submit_command(client, command, options['log_file'] if 'log_file' in options else None, 3)
                    future.result()
                    if future.status != 'finished':
                        logger.error(f"Container failed for file '{input_path}': {future.exception}")
                    else:
                        logger.info(f"Container completed for file '{input_path}'")

                logger.info(f"Run succeeded")
            else:
                files = os.listdir(input_path)
                count = len(files)
                futures = []

                if 'jobqueue' not in options:
                    logger.info(f"Processing {count} files in '{input_path}'")
                else:
                    logger.info(f"Requesting {count} nodes to process {count} files in '{input_path}' with job script:\n{cluster.job_script()}")
                    cluster.scale(count)

                env = options['env'] if 'env' in options else []
                params = deepcopy(options['parameters']) if 'parameters' in options else []
                patterns = options['input']['patterns'] if 'patterns' in options['input'] else []
                bind_mounts = options['bind_mounts'] if 'bind_mounts' in options else []
                no_cache = options['no_cache'] if 'no_cache' in options else False
                gpus = options['gpus'] if 'gpus' in options else 0

                with Client(cluster) as client:
                    num_files = len(files)
                    for i, current_file in tqdm.tqdm(enumerate(files), total=num_files):
                        command = prep_command(
                            work_dir=options['workdir'],
                            image=options['image'],
                            command=options['command'],
                            env=env + [{'key': 'INDEX', 'value': i}] + [{'key': 'PATTERNS', 'value': ','.join(patterns)}],
                            parameters=params + [{'key': 'INPUT', 'value': join(input_path, current_file)}, {'key': 'OUTPUT', 'value': output_path}],
                            bind_mounts=bind_mounts,
                            no_cache=no_cache,
                            gpus=gpus,
                            docker_username=docker_username,
                            docker_password=docker_password,
                            docker=docker)

                        logger.info(f"Submitting container for file {i}")
                        futures.append(submit_command(client, command, options['log_file'] if 'log_file' in options else None, 3))

                    finished = 0
                    for future in tqdm.tqdm(as_completed(futures), total=num_files):
                        finished += 1
                        if future.status != 'finished':
                            logger.error(f"Container failed for file {finished}: {future.exception}")
                        else:
                            logger.info(f"Container completed for file {finished}")
        elif options['input']['kind'] == InputKind.FILE:
            input_path = options['input']['path']
            env = options['env'] if 'env' in options else []
            params = options['parameters'] if 'parameters' in options else []
            patterns = options['input']['patterns'] if 'patterns' in options['input'] else []
            bind_mounts = options['bind_mounts'] if 'bind_mounts' in options else []
            no_cache = options['no_cache'] if 'no_cache' in options else False
            gpus = options['gpus'] if 'gpus' in options else 0

            if 'jobqueue' in options: cluster.scale(1)
            with Client(cluster) as client:
                command = prep_command(
                    work_dir=options['workdir'],
                    image=options['image'],
                    command=options['command'],
                    env=env + [{'key': 'INDEX', 'value': 1}] + [{'key': 'PATTERNS', 'value': ','.join(patterns)}],
                    parameters=params + [{'key': 'INPUT', 'value': input_path}, {'key': 'OUTPUT', 'value': output_path}],
                    bind_mounts=bind_mounts,
                    no_cache=no_cache,
                    gpus=gpus,
                    docker_username=docker_username,
                    docker_password=docker_password,
                    docker=docker)

                logger.info(f"Submitting container for file 1")
                future = submit_command(client, command, options['log_file'] if 'log_file' in options else None, 3)
                future.result()
                if future.status != 'finished':
                    logger.error(f"Container failed for file 1")
                    logger.error(future.exception)
                else:
                    logger.info(f"Container completed for file 1")

        logger.info(f"Run succeeded")
    except:
        logger.error(f"Run failed: {traceback.format_exc()}")
        raise