def _dagster_compute_log_manager(base_dir):
    config = dagster_instance_config(base_dir)
    compute_log_base = os.path.join(base_dir, 'storage')
    if config and config.get('compute_logs'):
        if 'module' in config['compute_logs'] and 'class' in config['compute_logs']:
            from dagster.core.storage.compute_log_manager import ComputeLogManager

            try:
                module = __import__(config['compute_logs']['module'])
                klass = getattr(module, config['compute_logs']['class'])
                check.subclass_param(klass, 'compute_log_manager', ComputeLogManager)
                kwargs = config['compute_logs'].get('config', {})
                compute_log_manager = klass(compute_log_base, **kwargs)
                check.inst_param(compute_log_manager, 'compute_log_manager', ComputeLogManager)
                return compute_log_manager
            except Exception:
                raise DagsterInvariantViolationError(
                    'Invalid dagster config in `{config_yaml_filename}`. Expecting `module`, '
                    '`class`, and `config`, returning a valid instance of '
                    '`ComputeLogManager`'.format(
                        config_yaml_filename=DAGSTER_CONFIG_YAML_FILENAME
                    )
                )

    from dagster.core.storage.local_compute_log_manager import LocalComputeLogManager

    return LocalComputeLogManager(compute_log_base)

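# Illustrative sketch only: the shape of parsed instance config that the `compute_logs`
# branch above expects. The module, class, and config values shown are hypothetical
# placeholders; the nested `config` dict is passed to the manager class as kwargs.
example_instance_config = {
    'compute_logs': {
        'module': 'my_logging_module',
        'class': 'MyComputeLogManager',
        'config': {'prefix': 'compute-logs'},
    }
}
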
def test_subclass_param():
    class Super(object):
        pass

    class Sub(Super):
        pass

    class Alone(object):
        pass

    assert check.subclass_param(Sub, "foo", Super)

    with pytest.raises(CheckError):
        assert check.subclass_param(Alone, "foo", Super)

    with pytest.raises(CheckError):
        assert check.subclass_param("value", "foo", Super)

    assert check.opt_subclass_param(Sub, "foo", Super)
    assert check.opt_subclass_param(None, "foo", Super) is None

    with pytest.raises(CheckError):
        assert check.opt_subclass_param(Alone, "foo", Super)

    with pytest.raises(CheckError):
        assert check.opt_subclass_param("value", "foo", Super)

def user_code_error_boundary(error_cls, msg, **kwargs):
    '''
    Wraps the execution of user-space code in an error boundary. This places a uniform
    policy around any user code invoked by the framework. This ensures that all user
    errors are wrapped in DagsterUserCodeExecutionError, and that the original stack
    trace of the user error is preserved, so that it can be reported without confusing
    framework code in the stack trace, if a tool author wishes to do so. This has been
    especially helpful in a notebooking context.
    '''
    check.str_param(msg, 'msg')
    check.subclass_param(error_cls, 'error_cls', DagsterUserCodeExecutionError)

    try:
        yield
    except Exception as e:  # pylint: disable=W0703
        if isinstance(e, DagsterError):
            # The system has thrown an error that is part of the user-framework contract
            raise e
        else:
            # An exception has been thrown by user code and computation should cease
            # with the error reported further up the stack
            raise_from(
                error_cls(msg, user_exception=e, original_exc_info=sys.exc_info(), **kwargs), e
            )

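# Illustrative usage sketch (not from the snippets above): `MyToolExecutionError` is a
# hypothetical subclass of DagsterUserCodeExecutionError, and `user_supplied_fn` stands in
# for arbitrary user code handed to the framework. Any non-Dagster exception raised inside
# the block is re-raised wrapped in MyToolExecutionError with the original traceback kept.
def _invoke_user_fn(user_supplied_fn):
    with user_code_error_boundary(
        MyToolExecutionError, 'Error occurred while invoking user-supplied function'
    ):
        return user_supplied_fn()
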
def user_code_context_manager(user_fn, error_cls, msg):
    '''Wraps the output of a user-provided function that may yield or return a value and
    returns a generator that asserts it only yields a single value.
    '''
    check.callable_param(user_fn, 'user_fn')
    check.subclass_param(error_cls, 'error_cls', DagsterUserCodeExecutionError)

    with user_code_error_boundary(error_cls, msg):
        thing_or_gen = user_fn()
        gen = _ensure_gen(thing_or_gen)

        try:
            thing = next(gen)
        except StopIteration:
            check.failed('Must yield one item. You did not yield anything.')

        yield thing

        stopped = False

        try:
            next(gen)
        except StopIteration:
            stopped = True

        check.invariant(stopped, 'Must yield one item. Yielded more than one item')

def make_airflow_dag_for_operator(
    recon_repo,
    pipeline_name,
    operator,
    run_config=None,
    mode=None,
    dag_id=None,
    dag_description=None,
    dag_kwargs=None,
    op_kwargs=None,
):
    """Construct an Airflow DAG corresponding to a given Dagster pipeline and custom operator.

    `Custom operator template <https://github.com/dagster-io/dagster/blob/master/python_modules/dagster-test/dagster_test/dagster_airflow/custom_operator.py>`_

    Tasks in the resulting DAG will execute the Dagster logic they encapsulate, run by the given
    Operator :py:class:`BaseOperator <airflow.models.BaseOperator>`. If you are looking for a
    containerized solution to provide better isolation, see instead
    :py:func:`make_airflow_dag_containerized`.

    This function should be invoked in an Airflow DAG definition file, such as that created by an
    invocation of the dagster-airflow scaffold CLI tool.

    Args:
        recon_repo (:class:`dagster.ReconstructableRepository`): reference to a Dagster
            RepositoryDefinition that can be reconstructed in another process
        pipeline_name (str): The name of the pipeline definition.
        operator (type): The operator to use. Must be a class that inherits from
            :py:class:`BaseOperator <airflow.models.BaseOperator>`
        run_config (Optional[dict]): The environment config, if any, with which to compile
            the pipeline to an execution plan, as a Python dict.
        mode (Optional[str]): The mode in which to execute the pipeline.
        instance (Optional[DagsterInstance]): The Dagster instance to use to execute the pipeline.
        dag_id (Optional[str]): The id to use for the compiled Airflow DAG (passed through to
            :py:class:`DAG <airflow:airflow.models.DAG>`).
        dag_description (Optional[str]): The description to use for the compiled Airflow DAG
            (passed through to :py:class:`DAG <airflow:airflow.models.DAG>`)
        dag_kwargs (Optional[dict]): Any additional kwargs to pass to the Airflow
            :py:class:`DAG <airflow:airflow.models.DAG>` constructor, including ``default_args``.
        op_kwargs (Optional[dict]): Any additional kwargs to pass to the underlying Airflow
            operator.

    Returns:
        (airflow.models.DAG, List[airflow.models.BaseOperator]): The generated Airflow DAG, and a
        list of its constituent tasks.
    """
    check.subclass_param(operator, "operator", BaseOperator)

    return _make_airflow_dag(
        recon_repo=recon_repo,
        pipeline_name=pipeline_name,
        run_config=run_config,
        mode=mode,
        dag_id=dag_id,
        dag_description=dag_description,
        dag_kwargs=dag_kwargs,
        op_kwargs=op_kwargs,
        operator=operator,
    )

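# Illustrative call sketch for an Airflow DAG definition file. The module path, repository
# function name, and MyCustomOperator are hypothetical placeholders, and it assumes
# ReconstructableRepository.for_module is available in the dagster version in use.
from dagster import ReconstructableRepository

dag, tasks = make_airflow_dag_for_operator(
    recon_repo=ReconstructableRepository.for_module('my_package.repo', 'my_repository'),
    pipeline_name='my_pipeline',
    operator=MyCustomOperator,  # hypothetical subclass of airflow.models.BaseOperator
)
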
def register_type(self, type_to_register, type_storage_plugin):
    check.inst_param(type_to_register, 'type_to_register', RuntimeType)
    check.subclass_param(type_storage_plugin, 'type_storage_plugin', TypeStoragePlugin)
    check.invariant(
        type_to_register.name is not None,
        'Cannot register a type storage plugin for an anonymous type',
    )
    self.TYPE_STORAGE_PLUGIN_REGISTRY[type_to_register.name] = type_storage_plugin

def register_type(self, type_to_register, type_storage_plugin):
    from dagster.core.types.dagster_type import DagsterType

    check.inst_param(type_to_register, 'type_to_register', DagsterType)
    check.subclass_param(type_storage_plugin, 'type_storage_plugin', TypeStoragePlugin)
    check.invariant(
        type_to_register.name is not None,
        'Cannot register a type storage plugin for an anonymous type',
    )
    self._registry[type_to_register.name] = type_storage_plugin

def user_code_error_boundary(error_cls, msg_fn, control_flow_exceptions=None, **kwargs):
    """
    Wraps the execution of user-space code in an error boundary. This places a uniform
    policy around any user code invoked by the framework. This ensures that all user
    errors are wrapped in an exception derived from DagsterUserCodeExecutionError, and that
    the original stack trace of the user error is preserved, so that it can be reported
    without confusing framework code in the stack trace, if a tool author wishes to do so.
    This has been especially helpful in a notebooking context.

    Examples:

    .. code-block:: python

        with user_code_error_boundary(
            # Pass a class that inherits from DagsterUserCodeExecutionError
            DagstermillExecutionError,
            # Pass a function that produces a message
            lambda: 'Error occurred during the execution of Dagstermill solid '
            '{solid_name}: {notebook_path}'.format(
                solid_name=name, notebook_path=notebook_path
            ),
        ):
            call_user_provided_function()
    """
    check.callable_param(msg_fn, "msg_fn")
    check.subclass_param(error_cls, "error_cls", DagsterUserCodeExecutionError)
    control_flow_exceptions = tuple(
        check.opt_list_param(control_flow_exceptions, "control_flow_exceptions")
    )

    with raise_execution_interrupts():
        try:
            yield
        except control_flow_exceptions as cf:
            # A control flow exception has occurred and should be propagated
            raise cf
        except DagsterError as de:
            # The system has thrown an error that is part of the user-framework contract
            raise de
        except Exception as e:  # pylint: disable=W0703
            # An exception has been thrown by user code and computation should cease
            # with the error reported further up the stack
            raise_from(
                error_cls(msg_fn(), user_exception=e, original_exc_info=sys.exc_info(), **kwargs),
                e,
            )

def __init__(self, scheduler_type, schedule_defs, artifacts_dir, repository_name):
    from .storage import FilesystemScheduleStorage

    check.subclass_param(scheduler_type, 'scheduler_type', Scheduler)
    check.list_param(schedule_defs, 'schedule_defs', ScheduleDefinition)
    check.str_param(artifacts_dir, 'artifacts_dir')
    check.str_param(repository_name, 'repository_name')

    self._Scheduler = scheduler_type
    self._artifacts_dir = artifacts_dir
    self._schedule_defs = schedule_defs
    self._schedule_storage = FilesystemScheduleStorage(
        artifacts_dir, repository_name=repository_name
    )

def whitelist_for_serdes(serializer: Union[Type, Type["Serializer"]]):
    """
    Decorator to whitelist a named tuple or enum to be serializable.

    @whitelist_for_serdes
    class
    """
    if inspect.isclass(serializer) and not issubclass(serializer, Serializer):
        return _whitelist_for_serdes(whitelist_map=_WHITELIST_MAP, serializer=None)(serializer)
    else:
        check.subclass_param(serializer, "serializer", Serializer)
        serializer = cast(Type[Serializer], serializer)
        return _whitelist_for_serdes(whitelist_map=_WHITELIST_MAP, serializer=serializer)

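# Illustrative usage sketch (MyRecord is a hypothetical example, not from the snippets
# above): decorating a NamedTuple subclass directly whitelists it for serdes serialization;
# passing a Serializer subclass instead returns a configured decorator.
from typing import NamedTuple


@whitelist_for_serdes
class MyRecord(NamedTuple):
    name: str
    value: int
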
def test_subclass_param():
    class Super(object):
        pass

    class Sub(Super):
        pass

    class Alone(object):
        pass

    assert check.subclass_param(Sub, 'foo', Super)

    with pytest.raises(CheckError):
        assert check.subclass_param(Alone, 'foo', Super)

    with pytest.raises(CheckError):
        assert check.subclass_param('value', 'foo', Super)

def user_code_error_boundary(error_cls, msg_fn, log_manager=None, **kwargs):
    """
    Wraps the execution of user-space code in an error boundary. This places a uniform
    policy around any user code invoked by the framework. This ensures that all user
    errors are wrapped in an exception derived from DagsterUserCodeExecutionError, and that
    the original stack trace of the user error is preserved, so that it can be reported
    without confusing framework code in the stack trace, if a tool author wishes to do so.

    Examples:

    .. code-block:: python

        with user_code_error_boundary(
            # Pass a class that inherits from DagsterUserCodeExecutionError
            DagsterExecutionStepExecutionError,
            # Pass a function that produces a message
            lambda: "Error occurred during step execution",
        ):
            call_user_provided_function()
    """
    check.callable_param(msg_fn, "msg_fn")
    check.subclass_param(error_cls, "error_cls", DagsterUserCodeExecutionError)

    with raise_execution_interrupts():
        if log_manager:
            log_manager.begin_python_log_capture()
        try:
            yield
        except DagsterError as de:
            # The system has thrown an error that is part of the user-framework contract
            raise de
        except Exception as e:  # pylint: disable=W0703
            # An exception has been thrown by user code and computation should cease
            # with the error reported further up the stack
            raise error_cls(
                msg_fn(), user_exception=e, original_exc_info=sys.exc_info(), **kwargs
            ) from e
        finally:
            if log_manager:
                log_manager.end_python_log_capture()

def rehydrate(self, **constructor_kwargs):
    from dagster.core.errors import DagsterInvalidConfigError
    from dagster.core.types.evaluator import evaluate_config

    module = importlib.import_module(self.module_name)
    klass = getattr(module, self.class_name)

    check.subclass_param(
        klass,
        'class {class_name} in module {module_name}'.format(
            class_name=self.class_name, module_name=self.module_name
        ),
        ConfigurableClass,
    )

    config_dict = yaml.load(self.config_yaml)
    result = evaluate_config(klass.config_type().inst(), config_dict)
    if not result.success:
        raise DagsterInvalidConfigError(None, result.errors, config_dict)

    constructor_kwargs['inst_data'] = self
    return klass.from_config_value(result.value, **constructor_kwargs)

def whitelist_for_serdes(
    __cls: Optional[Type] = None, *, serializer: Optional[Type["Serializer"]] = None
):
    """
    Decorator to whitelist a named tuple or enum to be serializable.

    @whitelist_for_serdes
    class
    """
    if __cls is not None:  # decorator invoked directly on class
        check.class_param(__cls, "__cls")
        return _whitelist_for_serdes(whitelist_map=_WHITELIST_MAP, serializer=None)(__cls)
    else:  # decorator passed params
        check.subclass_param(serializer, "serializer", Serializer)
        serializer = cast(Type[Serializer], serializer)
        return _whitelist_for_serdes(whitelist_map=_WHITELIST_MAP, serializer=serializer)

def rehydrate(self, **constructor_kwargs):
    from dagster.core.errors import DagsterInvalidConfigError
    from dagster.core.types.evaluator import evaluate_config

    try:
        module = importlib.import_module(self.module_name)
    except seven.ModuleNotFoundError:
        check.invariant(
            False,
            'Couldn\'t import module {module_name} when attempting to rehydrate the '
            'configurable class {configurable_class}'.format(
                module_name=self.module_name,
                configurable_class=self.module_name + '.' + self.class_name,
            ),
        )
    try:
        klass = getattr(module, self.class_name)
    except AttributeError:
        check.invariant(
            False,
            'Couldn\'t find class {class_name} in module when attempting to rehydrate the '
            'configurable class {configurable_class}'.format(
                class_name=self.class_name,
                configurable_class=self.module_name + '.' + self.class_name,
            ),
        )

    check.subclass_param(
        klass,
        'class {class_name} in module {module_name}'.format(
            class_name=self.class_name, module_name=self.module_name
        ),
        ConfigurableClass,
    )

    config_dict = yaml.load(self.config_yaml)
    result = evaluate_config(klass.config_type().inst(), config_dict)
    if not result.success:
        raise DagsterInvalidConfigError(None, result.errors, config_dict)

    constructor_kwargs['inst_data'] = self
    return klass.from_config_value(result.value, **constructor_kwargs)

def solid_execution_error_boundary(error_cls, msg_fn, step_context, **kwargs):
    """
    A specialization of user_code_error_boundary for the steps involved in executing a solid.
    This variant supports the control flow exceptions RetryRequested and Failure as well
    as respecting the RetryPolicy if present.
    """
    from dagster.core.execution.context.system import StepExecutionContext

    check.callable_param(msg_fn, "msg_fn")
    check.subclass_param(error_cls, "error_cls", DagsterUserCodeExecutionError)
    check.inst_param(step_context, "step_context", StepExecutionContext)

    with raise_execution_interrupts():
        try:
            yield
        except (RetryRequested, Failure) as cf:
            # A control flow exception has occurred and should be propagated
            raise cf
        except DagsterError as de:
            # The system has thrown an error that is part of the user-framework contract
            raise de
        except Exception as e:  # pylint: disable=W0703
            # An exception has been thrown by user code and computation should cease
            # with the error reported further up the stack
            policy = step_context.solid_retry_policy
            if policy:
                # could check exc against a whitelist of exceptions
                raise RetryRequested(
                    max_retries=policy.max_retries,
                    # could support an enum of "delay curves" which use delay and
                    # step_context.previous_attempt_count to calculate wait time
                    seconds_to_wait=policy.delay,
                ) from e

            raise error_cls(
                msg_fn(),
                user_exception=e,
                original_exc_info=sys.exc_info(),
                **kwargs,
            ) from e

def _make_airflow_dag(
    handle,
    pipeline_name,
    environment_dict=None,
    mode=None,
    dag_id=None,
    dag_description=None,
    dag_kwargs=None,
    op_kwargs=None,
    operator=DagsterPythonOperator,
):
    check.inst_param(handle, 'handle', ExecutionTargetHandle)
    environment_dict = check.opt_dict_param(environment_dict, 'environment_dict', key_type=str)
    mode = check.opt_str_param(mode, 'mode')

    # Only used for Airflow; internally we continue to use pipeline.name
    dag_id = check.opt_str_param(dag_id, 'dag_id', _rename_for_airflow(pipeline_name))

    dag_description = check.opt_str_param(
        dag_description, 'dag_description', _make_dag_description(pipeline_name)
    )
    check.subclass_param(operator, 'operator', DagsterOperator)

    # black 18.9b0 doesn't support py27-compatible formatting of the below invocation (omitting
    # the trailing comma after **check.opt_dict_param...) -- black 19.3b0 supports multiple python
    # versions, but currently doesn't know what to do with from __future__ import print_function --
    # see https://github.com/ambv/black/issues/768
    # fmt: off
    dag_kwargs = dict(
        {'default_args': DEFAULT_ARGS},
        **check.opt_dict_param(dag_kwargs, 'dag_kwargs', key_type=str)
    )
    # fmt: on

    op_kwargs = check.opt_dict_param(op_kwargs, 'op_kwargs', key_type=str)

    dag = DAG(dag_id=dag_id, description=dag_description, **dag_kwargs)

    pipeline = handle.build_pipeline_definition()

    if mode is None:
        mode = pipeline.get_default_mode_name()

    execution_plan = create_execution_plan(pipeline, environment_dict, mode=mode)

    tasks = {}

    coalesced_plan = coalesce_execution_steps(execution_plan)

    for solid_name, solid_steps in coalesced_plan.items():
        step_keys = [step.key for step in solid_steps]

        task = operator.operator_for_solid(
            handle=handle,
            pipeline_name=pipeline_name,
            environment_dict=environment_dict,
            mode=mode,
            solid_name=solid_name,
            step_keys=step_keys,
            dag=dag,
            dag_id=dag_id,
            op_kwargs=op_kwargs,
        )

        tasks[solid_name] = task

        for solid_step in solid_steps:
            for step_input in solid_step.step_inputs:
                prev_solid_name = execution_plan.get_step_by_key(
                    step_input.prev_output_handle.step_key
                ).solid_name
                if solid_name != prev_solid_name:
                    tasks[prev_solid_name].set_downstream(task)

    return (dag, [tasks[solid_name] for solid_name in coalesced_plan.keys()])

def __init__(self, scheduler_type):
    self.scheduler_type = check.subclass_param(scheduler_type, 'scheduler_type', Scheduler)

def solid_execution_error_boundary(error_cls, msg_fn, step_context, **kwargs):
    """
    A specialization of user_code_error_boundary for the steps involved in executing a solid.
    This variant supports the control flow exceptions RetryRequested and Failure as well
    as respecting the RetryPolicy if present.
    """
    from dagster.core.execution.context.system import StepExecutionContext

    check.callable_param(msg_fn, "msg_fn")
    check.subclass_param(error_cls, "error_cls", DagsterUserCodeExecutionError)
    check.inst_param(step_context, "step_context", StepExecutionContext)

    with raise_execution_interrupts():
        step_context.log.begin_python_log_capture()
        retry_policy = step_context.solid_retry_policy

        try:
            yield
        except DagsterError as de:
            # The system has thrown an error that is part of the user-framework contract
            raise de
        except Exception as e:  # pylint: disable=W0703
            # An exception has been thrown by user code and computation should cease
            # with the error reported further up the stack

            # A directly thrown RetryRequested escalates before evaluating the retry policy.
            if isinstance(e, RetryRequested):
                raise e

            if retry_policy:
                raise RetryRequested(
                    max_retries=retry_policy.max_retries,
                    seconds_to_wait=retry_policy.calculate_delay(
                        step_context.previous_attempt_count + 1
                    ),
                ) from e

            # Failure exceptions get re-thrown without wrapping
            if isinstance(e, Failure):
                raise e

            # Otherwise wrap the user exception with context
            raise error_cls(
                msg_fn(),
                user_exception=e,
                original_exc_info=sys.exc_info(),
                **kwargs,
            ) from e
        except (DagsterExecutionInterruptedError, KeyboardInterrupt) as ie:
            # respect retry policy when interrupts occur
            if retry_policy:
                raise RetryRequested(
                    max_retries=retry_policy.max_retries,
                    seconds_to_wait=retry_policy.calculate_delay(
                        step_context.previous_attempt_count + 1
                    ),
                ) from ie
            else:
                raise ie
        finally:
            step_context.log.end_python_log_capture()

def _make_airflow_dag(
    recon_repo,
    pipeline_name,
    run_config=None,
    mode=None,
    instance=None,
    dag_id=None,
    dag_description=None,
    dag_kwargs=None,
    op_kwargs=None,
    operator=DagsterPythonOperator,
):
    check.inst_param(recon_repo, "recon_repo", ReconstructableRepository)
    check.str_param(pipeline_name, "pipeline_name")
    run_config = check.opt_dict_param(run_config, "run_config", key_type=str)
    mode = check.opt_str_param(mode, "mode")
    # Default to use the (persistent) system temp directory rather than a seven.TemporaryDirectory,
    # which would not be consistent between Airflow task invocations.
    instance = (
        check.inst_param(instance, "instance", DagsterInstance)
        if instance
        else DagsterInstance.get(fallback_storage=seven.get_system_temp_directory())
    )

    # Only used for Airflow; internally we continue to use pipeline.name
    dag_id = check.opt_str_param(dag_id, "dag_id", _rename_for_airflow(pipeline_name))

    dag_description = check.opt_str_param(
        dag_description, "dag_description", _make_dag_description(pipeline_name)
    )
    check.subclass_param(operator, "operator", BaseOperator)

    dag_kwargs = dict(
        {"default_args": DEFAULT_ARGS},
        **check.opt_dict_param(dag_kwargs, "dag_kwargs", key_type=str)
    )

    op_kwargs = check.opt_dict_param(op_kwargs, "op_kwargs", key_type=str)

    dag = DAG(dag_id=dag_id, description=dag_description, **dag_kwargs)
    pipeline = recon_repo.get_definition().get_pipeline(pipeline_name)

    if mode is None:
        mode = pipeline.get_default_mode_name()

    execution_plan = create_execution_plan(pipeline, run_config, mode=mode)

    tasks = {}

    coalesced_plan = coalesce_execution_steps(execution_plan)

    for solid_handle, solid_steps in coalesced_plan.items():
        step_keys = [step.key for step in solid_steps]

        operator_parameters = DagsterOperatorParameters(
            recon_repo=recon_repo,
            pipeline_name=pipeline_name,
            run_config=run_config,
            mode=mode,
            task_id=solid_handle,
            step_keys=step_keys,
            dag=dag,
            instance_ref=instance.get_ref(),
            op_kwargs=op_kwargs,
            pipeline_snapshot=pipeline.get_pipeline_snapshot(),
            execution_plan_snapshot=snapshot_from_execution_plan(
                execution_plan, pipeline_snapshot_id=pipeline.get_pipeline_snapshot_id()
            ),
        )
        task = operator(operator_parameters)

        tasks[solid_handle] = task

        for solid_step in solid_steps:
            for step_input in solid_step.step_inputs:
                for key in step_input.dependency_keys:
                    prev_solid_handle = execution_plan.get_step_by_key(key).solid_handle.to_string()
                    if solid_handle != prev_solid_handle:
                        tasks[prev_solid_handle].set_downstream(task)

    return (dag, [tasks[solid_handle] for solid_handle in coalesced_plan.keys()])

def __whitelist_for_persistence(klass):
    check.subclass_param(klass, "klass", Persistable)
    whitelist_map["persistence"][klass.__name__] = klass
    return klass

def get_object(cls, object_store, context, runtime_type, paths):
    check.subclass_param(object_store, 'object_store', ObjectStore)
    return object_store.get_object(context, runtime_type, paths)

def get_object(cls, intermediate_store, context, runtime_type, paths):
    from .intermediate_store import IntermediateStore

    check.subclass_param(intermediate_store, 'intermediate_store', IntermediateStore)
    return intermediate_store.get_object(context, runtime_type, paths)

def _make_airflow_dag(
    handle,
    pipeline_name,
    environment_dict=None,
    mode=None,
    instance=None,
    dag_id=None,
    dag_description=None,
    dag_kwargs=None,
    op_kwargs=None,
    operator=DagsterPythonOperator,
):
    check.inst_param(handle, 'handle', ExecutionTargetHandle)
    check.str_param(pipeline_name, 'pipeline_name')
    environment_dict = check.opt_dict_param(environment_dict, 'environment_dict', key_type=str)
    mode = check.opt_str_param(mode, 'mode')
    # Default to use the (persistent) system temp directory rather than a seven.TemporaryDirectory,
    # which would not be consistent between Airflow task invocations.
    instance = (
        check.inst_param(instance, 'instance', DagsterInstance)
        if instance
        else DagsterInstance.get(fallback_storage=seven.get_system_temp_directory())
    )

    # Only used for Airflow; internally we continue to use pipeline.name
    dag_id = check.opt_str_param(dag_id, 'dag_id', _rename_for_airflow(pipeline_name))

    dag_description = check.opt_str_param(
        dag_description, 'dag_description', _make_dag_description(pipeline_name)
    )
    check.subclass_param(operator, 'operator', BaseOperator)

    # black 18.9b0 doesn't support py27-compatible formatting of the below invocation (omitting
    # the trailing comma after **check.opt_dict_param...) -- black 19.3b0 supports multiple python
    # versions, but currently doesn't know what to do with from __future__ import print_function --
    # see https://github.com/ambv/black/issues/768
    # fmt: off
    dag_kwargs = dict(
        {'default_args': DEFAULT_ARGS},
        **check.opt_dict_param(dag_kwargs, 'dag_kwargs', key_type=str)
    )
    # fmt: on

    op_kwargs = check.opt_dict_param(op_kwargs, 'op_kwargs', key_type=str)

    dag = DAG(dag_id=dag_id, description=dag_description, **dag_kwargs)

    pipeline = handle.build_pipeline_definition()

    if mode is None:
        mode = pipeline.get_default_mode_name()

    execution_plan = create_execution_plan(
        pipeline, environment_dict, run_config=RunConfig(mode=mode)
    )

    tasks = {}

    coalesced_plan = coalesce_execution_steps(execution_plan)

    for solid_handle, solid_steps in coalesced_plan.items():
        step_keys = [step.key for step in solid_steps]

        # We separately construct the Airflow operators here with the appropriate args, because if
        # Airflow gets extraneous args/kwargs it emits a warning every time it parses the DAG (and
        # future Airflow versions will mark this a failure).
        # see https://github.com/ambv/black/issues/768
        # fmt: off
        if operator == DagsterPythonOperator:
            task = operator(
                handle=handle,
                pipeline_name=pipeline_name,
                environment_dict=environment_dict,
                mode=mode,
                task_id=solid_handle,
                step_keys=step_keys,
                dag=dag,
                instance_ref=instance.get_ref(),
                **op_kwargs
            )
        else:
            task = operator(
                pipeline_name=pipeline_name,
                environment_dict=environment_dict,
                mode=mode,
                task_id=solid_handle,
                step_keys=step_keys,
                dag=dag,
                instance_ref=instance.get_ref(),
                **op_kwargs
            )
        # fmt: on

        tasks[solid_handle] = task

        for solid_step in solid_steps:
            for step_input in solid_step.step_inputs:
                for key in step_input.dependency_keys:
                    prev_solid_handle = execution_plan.get_step_by_key(key).solid_handle.to_string()
                    if solid_handle != prev_solid_handle:
                        tasks[prev_solid_handle].set_downstream(task)

    return (dag, [tasks[solid_handle] for solid_handle in coalesced_plan.keys()])

def _make_airflow_dag(
    handle,
    pipeline_name,
    environment_dict=None,
    mode=None,
    instance=None,
    dag_id=None,
    dag_description=None,
    dag_kwargs=None,
    op_kwargs=None,
    operator=DagsterPythonOperator,
):
    check.inst_param(handle, 'handle', ExecutionTargetHandle)
    check.str_param(pipeline_name, 'pipeline_name')
    environment_dict = check.opt_dict_param(environment_dict, 'environment_dict', key_type=str)
    mode = check.opt_str_param(mode, 'mode')
    # Default to use the (persistent) system temp directory rather than a seven.TemporaryDirectory,
    # which would not be consistent between Airflow task invocations.
    instance = (
        check.inst_param(instance, 'instance', DagsterInstance)
        if instance
        else DagsterInstance.get(fallback_storage=seven.get_system_temp_directory())
    )

    # Only used for Airflow; internally we continue to use pipeline.name
    dag_id = check.opt_str_param(dag_id, 'dag_id', _rename_for_airflow(pipeline_name))

    dag_description = check.opt_str_param(
        dag_description, 'dag_description', _make_dag_description(pipeline_name)
    )
    check.subclass_param(operator, 'operator', BaseOperator)

    dag_kwargs = dict(
        {'default_args': DEFAULT_ARGS},
        **check.opt_dict_param(dag_kwargs, 'dag_kwargs', key_type=str)
    )

    op_kwargs = check.opt_dict_param(op_kwargs, 'op_kwargs', key_type=str)

    dag = DAG(dag_id=dag_id, description=dag_description, **dag_kwargs)

    pipeline = handle.build_pipeline_definition()

    if mode is None:
        mode = pipeline.get_default_mode_name()

    execution_plan = create_execution_plan(
        pipeline, environment_dict, run_config=RunConfig(mode=mode)
    )

    tasks = {}

    coalesced_plan = coalesce_execution_steps(execution_plan)

    for solid_handle, solid_steps in coalesced_plan.items():
        step_keys = [step.key for step in solid_steps]

        operator_parameters = DagsterOperatorParameters(
            handle=handle,
            pipeline_name=pipeline_name,
            environment_dict=environment_dict,
            mode=mode,
            task_id=solid_handle,
            step_keys=step_keys,
            dag=dag,
            instance_ref=instance.get_ref(),
            op_kwargs=op_kwargs,
        )
        task = operator(operator_parameters)

        tasks[solid_handle] = task

        for solid_step in solid_steps:
            for step_input in solid_step.step_inputs:
                for key in step_input.dependency_keys:
                    prev_solid_handle = execution_plan.get_step_by_key(key).solid_handle.to_string()
                    if solid_handle != prev_solid_handle:
                        tasks[prev_solid_handle].set_downstream(task)

    return (dag, [tasks[solid_handle] for solid_handle in coalesced_plan.keys()])

def _make_airflow_dag(
    pipeline,
    env_config=None,
    dag_id=None,
    dag_description=None,
    dag_kwargs=None,
    op_kwargs=None,
    operator=DagsterPythonOperator,
):
    check.inst_param(pipeline, 'pipeline', PipelineDefinition)
    env_config = check.opt_dict_param(env_config, 'env_config', key_type=str)

    pipeline_name = pipeline.name
    dag_id = check.opt_str_param(dag_id, 'dag_id', pipeline.name)
    dag_description = check.opt_str_param(
        dag_description, 'dag_description', _make_dag_description(pipeline_name, env_config)
    )
    check.subclass_param(operator, 'operator', DagsterOperator)

    # black 18.9b0 doesn't support py27-compatible formatting of the below invocation (omitting
    # the trailing comma after **check.opt_dict_param...) -- black 19.3b0 supports multiple python
    # versions, but currently doesn't know what to do with from __future__ import print_function --
    # see https://github.com/ambv/black/issues/768
    # fmt: off
    dag_kwargs = dict(
        {'default_args': DEFAULT_ARGS},
        **check.opt_dict_param(dag_kwargs, 'dag_kwargs', key_type=str)
    )
    # fmt: on

    op_kwargs = check.opt_dict_param(op_kwargs, 'op_kwargs', key_type=str)

    dag = DAG(dag_id=dag_id, description=dag_description, **dag_kwargs)

    execution_plan = create_execution_plan(pipeline, env_config)

    tasks = {}

    coalesced_plan = coalesce_execution_steps(execution_plan)

    for solid_name, solid_steps in coalesced_plan.items():
        step_keys = [step.key for step in solid_steps]

        task = operator.operator_for_solid(
            pipeline=pipeline,
            env_config=env_config,
            solid_name=solid_name,
            step_keys=step_keys,
            dag=dag,
            dag_id=dag_id,
            op_kwargs=op_kwargs,
        )

        tasks[solid_name] = task

        for solid_step in solid_steps:
            for step_input in solid_step.step_inputs:
                prev_solid_name = execution_plan.get_step_by_key(
                    step_input.prev_output_handle.step_key
                ).tags['solid']
                if solid_name != prev_solid_name:
                    tasks[prev_solid_name].set_downstream(task)

    return (dag, [tasks[solid_name] for solid_name in coalesced_plan.keys()])

def __whitelist_for_persistence(klass):
    check.subclass_param(klass, 'klass', Persistable)
    whitelist_map['persistence'][klass.__name__] = klass
    return klass