Example #1
    def execute(pipeline_context, execution_plan):
        check.inst_param(pipeline_context, 'pipeline_context',
                         SystemPipelineExecutionContext)
        check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

        check.param_invariant(
            isinstance(pipeline_context.executor_config, CeleryConfig),
            'pipeline_context',
            'Expected executor_config to be CeleryConfig got {}'.format(
                pipeline_context.executor_config),
        )

        celery_config = pipeline_context.executor_config

        storage = pipeline_context.environment_dict.get('storage')

        if (celery_config.broker and not is_local_uri(celery_config.broker)
            ) or (celery_config.backend
                  and not is_local_uri(celery_config.backend)):
            check.invariant(
                storage.get('s3') or storage.get('gcs'),
                'Must use S3 or GCS storage with non-local Celery broker: {broker} '
                'and backend: {backend}'.format(broker=celery_config.broker,
                                                backend=celery_config.backend),
            )
        else:
            check.invariant(
                not storage.get('in_memory'),
                'Cannot use in-memory storage with Celery, use filesystem, S3, or GCS',
            )

        pipeline_name = pipeline_context.pipeline_def.name

        handle_dict = pipeline_context.execution_target_handle.to_dict()

        instance_ref_dict = pipeline_context.instance.get_ref().to_dict()

        environment_dict = dict(pipeline_context.environment_dict,
                                execution={'in_process': {}})

        mode = pipeline_context.mode_def.name

        run_id = pipeline_context.pipeline_run.run_id

        app = make_app(celery_config)

        task_signatures = {}  # Dict[step_key, celery.Signature]
        apply_kwargs = defaultdict(dict)  # Dict[step_key, Dict[str, Any]]

        priority_for_step = lambda step: (-1 * int(
            step.tags.get('dagster-celery/priority', task_default_priority)))
        priority_for_key = lambda step_key: (-1 * apply_kwargs[step_key][
            'priority'])
        _warn_on_priority_misuse(pipeline_context, execution_plan)

        for step_key in execution_plan.step_keys_to_execute:
            step = execution_plan.get_step_by_key(step_key)
            priority = int(
                step.tags.get('dagster-celery/priority',
                              task_default_priority))
            queue = step.tags.get('dagster-celery/queue', task_default_queue)
            task = create_task(app)

            variables = {
                'executionParams': {
                    'selector': {
                        'name': pipeline_name
                    },
                    'environmentConfigData': environment_dict,
                    'mode': mode,
                    'executionMetadata': {
                        'runId': run_id
                    },
                    'stepKeys': [step_key],
                }
            }
            task_signatures[step_key] = task.si(handle_dict, variables,
                                                instance_ref_dict)
            apply_kwargs[step_key] = {
                'priority': priority,
                'queue': queue,
                'routing_key': '{queue}.execute_query'.format(queue=queue),
            }

        step_results = {}  # Dict[step_key, celery.AsyncResult]
        step_success = {}
        step_errors = {}
        completed_steps = set({})  # Set[step_key]
        active_execution = execution_plan.start(sort_key_fn=priority_for_step)
        stopping = False

        while (not active_execution.is_complete
               and not stopping) or step_results:

            results_to_pop = []
            for step_key, result in sorted(
                    step_results.items(),
                    key=lambda x: priority_for_key(x[0])):
                if result.ready():
                    try:
                        step_events = result.get()
                    except Exception:  # pylint: disable=broad-except
                        # We will want to do more to handle the exception here, e.g. subclass Task.
                        # Certainly yield an engine or pipeline event.
                        step_events = []
                        step_errors[
                            step_key] = serializable_error_info_from_exc_info(
                                sys.exc_info())
                        stopping = True
                    for step_event in step_events:
                        event = deserialize_json_to_dagster_namedtuple(
                            step_event)
                        yield event
                        if event.is_step_success:
                            step_success[step_key] = True
                        elif event.is_step_failure:
                            step_success[step_key] = False

                    results_to_pop.append(step_key)
                    completed_steps.add(step_key)

            for step_key in results_to_pop:
                if step_key in step_results:
                    del step_results[step_key]
                    was_success = step_success.get(step_key)
                    if was_success is True:
                        active_execution.mark_success(step_key)
                    elif was_success is False:
                        active_execution.mark_failed(step_key)
                    else:
                        # check errors list?
                        pipeline_context.log.error(
                            'Step {key} finished without success or failure event, assuming failure.'
                            .format(key=step_key))
                        active_execution.mark_failed(step_key)

            # process skips from failures or uncovered inputs
            for event in active_execution.skipped_step_events_iterator(
                    pipeline_context):
                yield event

            # don't add any new steps if we are stopping
            if stopping:
                continue

            # This is a slight refinement. If we have n workers idle and schedule m > n steps for
            # execution, the first n steps will be picked up by the idle workers in the order in
            # which they are scheduled (and the following m-n steps will be executed in priority
            # order, provided that it takes longer to execute a step than to schedule it). The test
            # case has m >> n to exhibit this behavior in the absence of this sort step.
            for step in active_execution.get_steps_to_execute():
                try:
                    step_results[step.key] = task_signatures[
                        step.key].apply_async(**apply_kwargs[step.key])
                except Exception:
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        'Encountered error during celery task submission.',
                        event_specific_data=EngineEventData.engine_error(
                            serializable_error_info_from_exc_info(
                                sys.exc_info()), ),
                    )
                    raise

            time.sleep(TICK_SECONDS)

        if step_errors:
            raise DagsterSubprocessError(
                'During celery execution errors occurred in workers:\n{error_list}'
                .format(error_list='\n'.join([
                    '[{step}]: {err}'.format(step=key, err=err.to_string())
                    for key, err in step_errors.items()
                ])),
                subprocess_error_infos=list(step_errors.values()),
            )
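
A note on the priority lambdas above: active_execution sorts candidate steps in ascending order of the sort key, so the value is negated to make higher 'dagster-celery/priority' tags run first. A minimal sketch with toy dicts (not Dagster step objects; the default priority of 5 is assumed purely for illustration):

task_default_priority = 5  # assumed value, for illustration only

toy_steps = [
    {'key': 'a', 'tags': {}},                                # falls back to the default
    {'key': 'b', 'tags': {'dagster-celery/priority': '9'}},  # highest priority
    {'key': 'c', 'tags': {'dagster-celery/priority': '1'}},  # lowest priority
]

def sort_key(step):
    # Negate so that an ascending sort puts the highest-priority step first.
    return -1 * int(step['tags'].get('dagster-celery/priority', task_default_priority))

print([step['key'] for step in sorted(toy_steps, key=sort_key)])  # ['b', 'a', 'c']
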
Example #2
    def execute(pipeline_context, execution_plan):
        from .tasks import make_app

        check.inst_param(pipeline_context, 'pipeline_context',
                         SystemPipelineExecutionContext)
        check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

        check.param_invariant(
            isinstance(pipeline_context.executor_config, CeleryConfig),
            'pipeline_context',
            'Expected executor_config to be CeleryConfig got {}'.format(
                pipeline_context.executor_config),
        )

        celery_config = pipeline_context.executor_config

        storage = pipeline_context.environment_dict.get('storage')

        if (celery_config.broker and not is_local_uri(celery_config.broker)
            ) or (celery_config.backend
                  and not is_local_uri(celery_config.backend)):
            check.invariant(
                storage.get('s3') or storage.get('gcs'),
                'Must use S3 or GCS storage with non-local Celery broker: {broker} '
                'and backend: {backend}'.format(broker=celery_config.broker,
                                                backend=celery_config.backend),
            )
        else:
            check.invariant(
                not storage.get('in_memory'),
                'Cannot use in-memory storage with Celery, use filesystem, S3, or GCS',
            )

        app = make_app(celery_config)

        priority_for_step = lambda step: (-1 * int(
            step.tags.get('dagster-celery/priority', task_default_priority)
        ) + -1 * _get_run_priority(pipeline_context))
        priority_for_key = lambda step_key: (priority_for_step(
            execution_plan.get_step_by_key(step_key)))
        _warn_on_priority_misuse(pipeline_context, execution_plan)

        step_results = {}  # Dict[step_key, celery.AsyncResult]
        step_errors = {}
        completed_steps = set({})  # Set[step_key]
        active_execution = execution_plan.start(
            retries=pipeline_context.executor_config.retries,
            sort_key_fn=priority_for_step)
        stopping = False

        while (not active_execution.is_complete
               and not stopping) or step_results:

            results_to_pop = []
            for step_key, result in sorted(
                    step_results.items(),
                    key=lambda x: priority_for_key(x[0])):
                if result.ready():
                    try:
                        step_events = result.get()
                    except Exception:  # pylint: disable=broad-except
                        # We will want to do more to handle the exception here, e.g. subclass Task.
                        # Certainly yield an engine or pipeline event.
                        step_events = []
                        step_errors[
                            step_key] = serializable_error_info_from_exc_info(
                                sys.exc_info())
                        stopping = True
                    for step_event in step_events:
                        event = deserialize_json_to_dagster_namedtuple(
                            step_event)
                        yield event
                        active_execution.handle_event(event)

                    results_to_pop.append(step_key)
                    completed_steps.add(step_key)

            for step_key in results_to_pop:
                if step_key in step_results:
                    del step_results[step_key]
                    active_execution.verify_complete(pipeline_context,
                                                     step_key)

            # process skips from failures or uncovered inputs
            for event in active_execution.skipped_step_events_iterator(
                    pipeline_context):
                yield event

            # don't add any new steps if we are stopping
            if stopping:
                continue

            # This is a slight refinement. If we have n workers idle and schedule m > n steps for
            # execution, the first n steps will be picked up by the idle workers in the order in
            # which they are scheduled (and the following m-n steps will be executed in priority
            # order, provided that it takes longer to execute a step than to schedule it). The test
            # case has m >> n to exhibit this behavior in the absence of this sort step.
            for step in active_execution.get_steps_to_execute():
                try:
                    queue = step.tags.get('dagster-celery/queue',
                                          task_default_queue)
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        'Submitting celery task for step "{step_key}" to queue "{queue}".'
                        .format(step_key=step.key, queue=queue),
                        EngineEventData(marker_start=DELEGATE_MARKER),
                        step_key=step.key,
                    )
                    step_results[step.key] = _submit_task(
                        app, pipeline_context, step, queue)
                except Exception:
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        'Encountered error during celery task submission.',
                        event_specific_data=EngineEventData.engine_error(
                            serializable_error_info_from_exc_info(
                                sys.exc_info()), ),
                    )
                    raise

            time.sleep(TICK_SECONDS)

        if step_errors:
            raise DagsterSubprocessError(
                'During celery execution errors occurred in workers:\n{error_list}'
                .format(error_list='\n'.join([
                    '[{step}]: {err}'.format(step=key, err=err.to_string())
                    for key, err in step_errors.items()
                ])),
                subprocess_error_infos=list(step_errors.values()),
            )
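
Compared with Example #1, the sort key here also folds in a run-level priority from _get_run_priority (not shown). Since that value is the same for every step of a run, it acts as a uniform offset: it does not change the relative ordering of steps inside the run, but it lets the combined number rank work across runs when it is also used as the Celery task priority at submission time (an assumption here, since _submit_task is not shown). A small stand-in sketch:

def combined_sort_key(step_tags, run_priority, default_step_priority=5):
    # Both terms are negated, mirroring priority_for_step above, so an ascending
    # sort favors higher priorities; run_priority is constant within a run.
    step_priority = int(step_tags.get('dagster-celery/priority', default_step_priority))
    return -1 * step_priority + -1 * run_priority

# A low-priority step of a high-priority run still outranks steps of a normal run:
print(combined_sort_key({'dagster-celery/priority': '1'}, run_priority=10))  # -11
print(combined_sort_key({'dagster-celery/priority': '9'}, run_priority=10))  # -19
print(combined_sort_key({'dagster-celery/priority': '9'}, run_priority=0))   # -9
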
Example #3
def test_is_local_uri():
    assert is_local_uri('localhost')
    assert is_local_uri('localhost:80')
    assert is_local_uri('localhost:8080')
    assert is_local_uri('127.0.0.1')
    assert is_local_uri('127.0.0.1:80')
    assert is_local_uri('127.0.0.1:8080')
    assert is_local_uri('0.0.0.0')
    assert is_local_uri('0.0.0.0:80')
    assert is_local_uri('0.0.0.0:8080')

    assert is_local_uri('rpc://')

    assert not is_local_uri('elementl.com')
    assert not is_local_uri('192.0.0.1')
    assert not is_local_uri('http://elementl.com:8080')
    assert not is_local_uri('tcp://elementl.com')
    assert not is_local_uri('hdfs://mycluster.internal')

    assert not is_local_uri('bad.bad')
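
These assertions pin down the expected behavior, but the implementation is not shown. A minimal sketch that satisfies exactly the cases above (not Dagster's actual is_local_uri, which may accept more URI shapes, e.g. credentials in the netloc):

import re

LOCAL_HOSTS = {'localhost', '127.0.0.1', '0.0.0.0'}

def is_local_uri(address):
    # Celery's rpc:// result backend is treated as local.
    if address.startswith('rpc://'):
        return True
    # Drop any scheme (http://, tcp://, hdfs://, ...), then any path and port,
    # and compare the remaining host against known local names.
    host = re.sub(r'^\w+://', '', address)
    host = host.split('/', 1)[0].split(':', 1)[0]
    return host in LOCAL_HOSTS
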
Example #4
    def execute(pipeline_context, execution_plan):
        check.inst_param(pipeline_context, 'pipeline_context',
                         SystemPipelineExecutionContext)
        check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

        dask_config = pipeline_context.executor_config

        check.param_invariant(
            isinstance(pipeline_context.executor_config, DaskConfig),
            'pipeline_context',
            'Expected executor_config to be DaskConfig got {}'.format(
                pipeline_context.executor_config),
        )

        # Checks to ensure storage is compatible with Dask configuration
        storage = pipeline_context.environment_dict.get('storage')
        check.invariant(storage.keys(),
                        'Must specify storage to use Dask execution')

        check.invariant(
            pipeline_context.instance.is_persistent,
            'Dask execution requires a persistent DagsterInstance',
        )

        # Remote
        if dask_config.address and not is_local_uri(dask_config.address):
            check.invariant(
                storage.get('s3') or storage.get('gcs'),
                'Must use S3 or GCS storage with non-local Dask address {dask_address}'
                .format(dask_address=dask_config.address),
            )
        # Local
        else:
            check.invariant(
                not storage.get('in_memory'),
                'Cannot use in-memory storage with Dask, use filesystem, S3, or GCS',
            )

        step_levels = execution_plan.execution_step_levels()

        pipeline_name = pipeline_context.pipeline_def.name

        instance = pipeline_context.instance

        with dask.distributed.Client(
                **dask_config.build_dict(pipeline_name)) as client:
            execution_futures = []
            execution_futures_dict = {}

            for step_level in step_levels:
                for step in step_level:
                    # We ensure correctness in sequencing by letting Dask schedule futures and
                    # awaiting dependencies within each step.
                    dependencies = []
                    for step_input in step.step_inputs:
                        for key in step_input.dependency_keys:
                            dependencies.append(execution_futures_dict[key])

                    environment_dict = dict(pipeline_context.environment_dict,
                                            execution={'in_process': {}})
                    variables = {
                        'executionParams': {
                            'selector': {
                                'name': pipeline_name
                            },
                            'environmentConfigData': environment_dict,
                            'mode': pipeline_context.mode_def.name,
                            'executionMetadata': {
                                'runId': pipeline_context.pipeline_run.run_id
                            },
                            'stepKeys': [step.key],
                        }
                    }

                    dask_task_name = '%s.%s' % (pipeline_name, step.key)

                    future = client.submit(
                        query_on_dask_worker,
                        pipeline_context.execution_target_handle,
                        variables,
                        dependencies,
                        instance.get_ref(),
                        key=dask_task_name,
                        resources=get_dask_resource_requirements(step.tags),
                    )

                    execution_futures.append(future)
                    execution_futures_dict[step.key] = future

            # This tells Dask to await the step executions and retrieve their results
            # back to the master
            for future in dask.distributed.as_completed(execution_futures):
                for step_event in future.result():
                    check.inst(step_event, DagsterEvent)

                    yield step_event
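
The sequencing comment in the loop above relies on a general dask.distributed behavior: futures passed inside the arguments of client.submit (here, the dependencies list) are resolved by the scheduler before the task runs, so a step only starts once its upstream steps have finished. A toy sketch of that pattern, using plain functions rather than query_on_dask_worker:

from dask.distributed import Client

def produce(x):
    return x * 2

def consume(value, upstream):
    # upstream arrives as the already-computed results of the upstream futures;
    # waiting for them is what enforces the ordering.
    return value + sum(upstream)

if __name__ == '__main__':
    with Client(processes=False) as client:
        upstream_futures = [client.submit(produce, i) for i in range(3)]
        downstream = client.submit(consume, 10, upstream_futures)
        print(downstream.result())  # 10 + (0 + 2 + 4) = 16
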
Example #5
def test_is_local_uri():
    assert is_local_uri("localhost")
    assert is_local_uri("localhost:80")
    assert is_local_uri("localhost:8080")
    assert is_local_uri("127.0.0.1")
    assert is_local_uri("127.0.0.1:80")
    assert is_local_uri("127.0.0.1:8080")
    assert is_local_uri("0.0.0.0")
    assert is_local_uri("0.0.0.0:80")
    assert is_local_uri("0.0.0.0:8080")

    assert is_local_uri("rpc://")

    assert not is_local_uri("elementl.com")
    assert not is_local_uri("192.0.0.1")
    assert not is_local_uri("http://elementl.com:8080")
    assert not is_local_uri("tcp://elementl.com")
    assert not is_local_uri("hdfs://mycluster.internal")

    assert not is_local_uri("bad.bad")