Example #1
def merge_yamls(file_list):
    check.list_param(file_list, 'file_list', of_type=str)
    merged = {}
    for yaml_file in file_list:
        merged = dict_merge(load_yaml_from_path(yaml_file) or {}, merged)
    return merged
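
A minimal usage sketch (the file names are hypothetical); merge_yamls loads each path and folds the resulting dicts into a single configuration dict:

    run_config = merge_yamls(['base_config.yaml', 'env_overrides.yaml'])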
Example #2
 def __init__(self, *args, **kwargs):
     self.step_keys = check.list_param(kwargs.pop('step_keys'), 'step_keys',
                                       str)
     super(DagsterExecutionStepNotFoundError,
           self).__init__(*args, **kwargs)
Example #3
    def __init__(
            self,
            solid_defs,
            name=None,
            description=None,
            dependencies=None,
            mode_defs=None,
            preset_defs=None,
            _parent_pipeline_def=None,  # https://github.com/dagster-io/dagster/issues/2115
    ):
        self._name = check.opt_str_param(name, 'name', '<<unnamed>>')
        self._description = check.opt_str_param(description, 'description')

        mode_definitions = check.opt_list_param(mode_defs,
                                                'mode_defs',
                                                of_type=ModeDefinition)

        if not mode_definitions:
            mode_definitions = [ModeDefinition()]

        self._mode_definitions = mode_definitions

        self._current_level_solid_defs = check.list_param(
            _check_solids_arg(self._name, solid_defs),
            'solid_defs',
            of_type=ISolidDefinition)

        seen_modes = set()
        for mode_def in mode_definitions:
            if mode_def.name in seen_modes:
                raise DagsterInvalidDefinitionError((
                    'Two modes seen with the name "{mode_name}" in "{pipeline_name}". '
                    'Modes must have unique names.').format(
                        mode_name=mode_def.name, pipeline_name=self._name))
            seen_modes.add(mode_def.name)

        self._dependencies = validate_dependency_dict(dependencies)

        dependency_structure, solid_dict = create_execution_structure(
            self._current_level_solid_defs,
            self._dependencies,
            container_definition=None)

        self._solid_dict = solid_dict
        self._dependency_structure = dependency_structure

        self._runtime_type_dict = construct_dagster_type_dictionary(
            self._current_level_solid_defs)

        self._preset_defs = check.opt_list_param(preset_defs, 'preset_defs',
                                                 PresetDefinition)
        self._preset_dict = {}
        for preset in self._preset_defs:
            if preset.name in self._preset_dict:
                raise DagsterInvalidDefinitionError((
                    'Two PresetDefinitions seen with the name "{name}" in "{pipeline_name}". '
                    'PresetDefinitions must have unique names.').format(
                        name=preset.name, pipeline_name=self._name))
            if preset.mode not in seen_modes:
                raise DagsterInvalidDefinitionError(
                    ('PresetDefinition "{name}" in "{pipeline_name}" '
                     'references mode "{mode}" which is not defined.').format(
                         name=preset.name,
                         pipeline_name=self._name,
                         mode=preset.mode))
            self._preset_dict[preset.name] = preset

        # Validate solid resource dependencies
        _validate_resource_dependencies(self._mode_definitions,
                                        self._current_level_solid_defs)

        # Validate unsatisfied inputs can be materialized from config
        _validate_inputs(self._dependency_structure, self._solid_dict)

        self._all_solid_defs = _build_all_solid_defs(
            self._current_level_solid_defs)
        self._parent_pipeline_def = check.opt_inst_param(
            _parent_pipeline_def, '_parent_pipeline_def', PipelineDefinition)
        self._cached_enviroment_schemas = {}
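
A hedged misuse sketch of the mode-uniqueness check above; `my_solid` stands in for any solid definition and is an assumption here. Two ModeDefinitions sharing a name trip the seen_modes check and raise DagsterInvalidDefinitionError:

    PipelineDefinition(
        solid_defs=[my_solid],
        name='dup_mode_pipeline',
        mode_defs=[ModeDefinition(name='local'), ModeDefinition(name='local')],
    )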
Example #4
def await_pg_notifications(
    conn_string,
    channels=None,
    timeout=5.0,
    yield_on_timeout=False,
    handle_signals=None,
    exit_event=None,
):
    """Subscribe to PostgreSQL notifications, and handle them
    in infinite-loop style.
    On an actual message, returns the notification (with .pid,
    .channel, and .payload attributes).
    If you've enabled 'yield_on_timeout', yields None on timeout.
    If you've enabled 'handle_keyboardinterrupt', yields False on
    interrupt.
    """

    check.str_param(conn_string, 'conn_string')
    channels = None if channels is None else check.list_param(channels, 'channels', of_type=str)
    check.float_param(timeout, 'timeout')
    check.bool_param(yield_on_timeout, 'yield_on_timeout')

    conn = get_conn(conn_string)

    if channels:
        start_listening(conn, channels)

    signals_to_handle = handle_signals or []
    original_handlers = {}

    try:
        if signals_to_handle:
            original_handlers = {s: signal.signal(s, _empty_handler) for s in signals_to_handle}
            wakeup = get_wakeup_fd()
            listen_on = [conn, wakeup]
        else:
            listen_on = [conn]
            wakeup = None

        while not (exit_event and exit_event.is_set()):
            try:
                r, w, x = select.select(listen_on, [], [], max(0, timeout))
                if (r, w, x) == ([], [], []):
                    if yield_on_timeout:
                        yield None

                if wakeup is not None and wakeup in r:
                    signal_byte = os.read(wakeup, 1)
                    signal_int = int.from_bytes(signal_byte, sys.byteorder)
                    yield signal_int

                if conn in r:
                    conn.poll()

                    notify_list = []
                    while conn.notifies:
                        notify_list.append(conn.notifies.pop())

                    for notif in notify_list:
                        yield notif

            except select.error as e:
                e_num, _e_message = e  # pylint: disable=unpacking-non-sequence
                if e_num == errno.EINTR:
                    pass
                else:
                    raise
    finally:
        conn.close()
        for s in signals_to_handle or []:
            if s in original_handlers:
                # Commenting out to get pylint to pass
                # https://github.com/dagster-io/dagster/issues/2510
                # signal_name = construct_signals(s).name
                signal.signal(s, original_handlers[s])
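
A minimal usage sketch for the generator above (the connection string and channel names are hypothetical); it yields notifications as they arrive and None on each timeout:

    for notification in await_pg_notifications(
        'postgresql://user:password@localhost:5432/dagster',
        channels=['run_events'],
        timeout=10.0,
        yield_on_timeout=True,
    ):
        if notification is None:
            continue  # timed out without a message; keep listening
        print(notification.channel, notification.payload)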
Example #5
 def __init__(self, manager_fn, marks):
     self.manager_fn = check.callable_param(manager_fn, 'manager_fn')
     self.marks = check.list_param(marks, 'marks')
Example #6
def validate_solid_fn(
    decorator_name, fn_name, compute_fn, input_defs, expected_positionals=None, exclude_nothing=True
):
    check.str_param(decorator_name, 'decorator_name')
    check.str_param(fn_name, 'fn_name')
    check.callable_param(compute_fn, 'compute_fn')
    check.list_param(input_defs, 'input_defs', of_type=InputDefinition)
    expected_positionals = check.opt_list_param(
        expected_positionals, 'expected_positionals', of_type=str
    )
    if exclude_nothing:
        names = set(
            inp.name for inp in input_defs if not inp.dagster_type.kind == DagsterTypeKind.NOTHING
        )
        nothing_names = set(
            inp.name for inp in input_defs if inp.dagster_type.kind == DagsterTypeKind.NOTHING
        )
    else:
        names = set(inp.name for inp in input_defs)
        nothing_names = set()

    # Currently being super strict about naming. Might be a good idea to relax. Starting strict.
    fn_positionals, input_args = split_function_parameters(compute_fn, expected_positionals)

    # Validate Positional Parameters
    missing_positional = validate_decorated_fn_positionals(fn_positionals, expected_positionals)
    if missing_positional:
        raise DagsterInvalidDefinitionError(
            "{decorator_name} '{solid_name}' decorated function does not have required positional "
            "parameter '{missing_param}'. Solid functions should only have keyword arguments "
            "that match input names and a first positional parameter named 'context'.".format(
                decorator_name=decorator_name, solid_name=fn_name, missing_param=missing_positional
            )
        )

    # Validate non positional parameters
    invalid_function_info = validate_decorated_fn_input_args(names, input_args)
    if invalid_function_info:
        if invalid_function_info.error_type == InvalidDecoratedFunctionInfo.TYPES['vararg']:
            raise DagsterInvalidDefinitionError(
                "{decorator_name} '{solid_name}' decorated function has positional vararg parameter "
                "'{param}'. Solid functions should only have keyword arguments that match "
                "input names and a first positional parameter named 'context'.".format(
                    decorator_name=decorator_name,
                    solid_name=fn_name,
                    param=invalid_function_info.param,
                )
            )
        elif invalid_function_info.error_type == InvalidDecoratedFunctionInfo.TYPES['missing_name']:
            if invalid_function_info.param in nothing_names:
                raise DagsterInvalidDefinitionError(
                    "{decorator_name} '{solid_name}' decorated function has parameter '{param}' that is "
                    "one of the solid input_defs of type 'Nothing' which should not be included since "
                    "no data will be passed for it. ".format(
                        decorator_name=decorator_name,
                        solid_name=fn_name,
                        param=invalid_function_info.param,
                    )
                )
            else:
                raise DagsterInvalidDefinitionError(
                    "{decorator_name} '{solid_name}' decorated function has parameter '{param}' that is not "
                    "one of the solid input_defs. Solid functions should only have keyword arguments "
                    "that match input names and a first positional parameter named 'context'.".format(
                        decorator_name=decorator_name,
                        solid_name=fn_name,
                        param=invalid_function_info.param,
                    )
                )
        elif invalid_function_info.error_type == InvalidDecoratedFunctionInfo.TYPES['extra']:
            undeclared_inputs_printed = ", '".join(invalid_function_info.missing_names)
            raise DagsterInvalidDefinitionError(
                "{decorator_name} '{solid_name}' decorated function does not have parameter(s) "
                "'{undeclared_inputs_printed}', which are in solid's input_defs. Solid functions "
                "should only have keyword arguments that match input names and a first positional "
                "parameter named 'context'.".format(
                    decorator_name=decorator_name,
                    solid_name=fn_name,
                    undeclared_inputs_printed=undeclared_inputs_printed,
                )
            )

    return positional_arg_name_list(input_args)
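
A hypothetical misuse sketch: with no InputDefinitions declared, the keyword parameter 'extra_arg' matches nothing, so the 'missing_name' branch above raises DagsterInvalidDefinitionError:

    def bad_compute(context, extra_arg):
        return extra_arg

    validate_solid_fn('@solid', 'bad_compute', bad_compute, input_defs=[],
                      expected_positionals=['context'])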
Example #7
    def build_memoized_plan(self, step_keys_to_execute, addresses):
        """Using cached outputs from previous runs, create a new execution plan.

        For steps where values have been cached, addresses are provided so that at runtime, those
        steps do not need to re-execute.

        Args:
            step_keys_to_execute (List[String]): A list of execution step keys to actually run in this
                execution plan.
            addresses (Dict[(str, StepOutputHandle), str]): A dictionary mapping pipeline name and
                step output handle to an "address", which the intermediate storage can use to
                retrieve the value for this step output.
        Returns:
            ExecutionPlan: An execution plan where addresses have been provided to steps such that
                the intermediate storage layer can retrieve the addresses instead of searching for
                the output from within the current run.
        """
        check.list_param(step_keys_to_execute,
                         "step_keys_to_execute",
                         of_type=str)
        check.dict_param(addresses, "addresses")
        pipeline_name = self.pipeline_def.name
        memoized_plan_step_dict = self.step_dict.copy()
        for step_key in step_keys_to_execute:
            step_inputs = []
            step = memoized_plan_step_dict[step_key]
            for step_input in step.step_inputs:
                if step_input.is_from_output:
                    address_dict = {
                        source_handle:
                        addresses[(pipeline_name, source_handle)]
                        for source_handle in step_input.source_handles
                        if (pipeline_name, source_handle) in addresses
                    }
                    reconstructed_step_input = StepInput(
                        step_input.name,
                        dagster_type=step_input.dagster_type,
                        source_type=step_input.source_type,
                        source_handles=step_input.source_handles,
                        config_data=None,
                        addresses=address_dict,
                    )
                    step_inputs.append(reconstructed_step_input)
                else:
                    step_inputs.append(step_input)

            memoized_step = ExecutionStep(
                pipeline_name=step.pipeline_name,
                key_suffix=step.key_suffix,
                step_inputs=step_inputs,
                step_outputs=step.step_outputs,
                compute_fn=step.compute_fn,
                kind=step.kind,
                solid_handle=step.solid_handle,
                solid=step.solid,
                logging_tags=step.logging_tags,
            )
            memoized_plan_step_dict[step_key] = memoized_step

        return ExecutionPlan(
            self.pipeline,
            memoized_plan_step_dict,
            self.deps,
            self.artifacts_persisted,
            step_keys_to_execute,
        )
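
A hedged sketch of the expected addresses shape (all names and values are hypothetical, and `plan` stands in for an instance of the class this method belongs to); keys pair the pipeline name with a step output handle, and values are the storage addresses to reuse:

    addresses = {
        ('my_pipeline', StepOutputHandle('solid_a.compute', 'result')): 's3://bucket/runs/123/solid_a',
    }
    memoized_plan = plan.build_memoized_plan(
        step_keys_to_execute=['solid_b.compute'],
        addresses=addresses,
    )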
Example #8
 def __new__(cls, field_names):
     return super(FieldsNotDefinedErrorData, cls).__new__(
         cls, check.list_param(field_names, 'field_names', of_type=str))
Example #9
 def __new__(cls, dagster_type, incoming_fields):
     check.param_invariant(dagster_type.is_selector, 'dagster_type')
     return super(SelectorTypeErrorData, cls).__new__(
         cls, dagster_type,
         check.list_param(incoming_fields, 'incoming_fields', of_type=str))
Example #10
def check_events_for_failures(events):
    check.list_param(events, "events", of_type=DagsterEvent)
    for event in events:
        if event.event_type_value == "STEP_FAILURE":
            raise AirflowException("step failed with error: %s" %
                                   event.event_specific_data.error.to_string())
Example #11
 def __init__(self, partition_keys: List[str]):
     check.list_param(partition_keys, "partition_keys", of_type=str)
     self._partitions = [Partition(key) for key in partition_keys]
Example #12
 def __init__(self, tuple_types, **kwargs):
     self.tuple_types = check.list_param(tuple_types,
                                         'tuple_types',
                                         of_type=ConfigType)
     super(ConfigTuple, self).__init__(kind=ConfigTypeKind.TUPLE, **kwargs)
Example #13
 def __init__(self, dagster_types):
     self._dagster_types = check.list_param(dagster_types,
                                            "dagster_types",
                                            of_type=DagsterType)
Example #14
    def __init__(self, root_run_id, runs):
        check.str_param(root_run_id, "root_run_id")
        check.list_param(runs, "runs", DauphinPipelineRun)

        super(DauphinRunGroup, self).__init__(rootRunId=root_run_id, runs=runs)
Example #15
def create_execution_structure(solid_defs, dependencies_dict, graph_definition):
    """This builder takes the dependencies dictionary specified during creation of the
    PipelineDefinition object and builds (1) the execution structure and (2) a solid dependency
    dictionary.

    For example, for the following dependencies:

    dep_dict = {
            SolidInvocation('giver'): {},
            SolidInvocation('sleeper', alias='sleeper_1'): {
                'units': DependencyDefinition('giver', 'out_1')
            },
            SolidInvocation('sleeper', alias='sleeper_2'): {
                'units': DependencyDefinition('giver', 'out_2')
            },
            SolidInvocation('sleeper', alias='sleeper_3'): {
                'units': DependencyDefinition('giver', 'out_3')
            },
            SolidInvocation('sleeper', alias='sleeper_4'): {
                'units': DependencyDefinition('giver', 'out_4')
            },
            SolidInvocation('total'): {
                'in_1': DependencyDefinition('sleeper_1', 'total'),
                'in_2': DependencyDefinition('sleeper_2', 'total'),
                'in_3': DependencyDefinition('sleeper_3', 'total'),
                'in_4': DependencyDefinition('sleeper_4', 'total'),
            },
        },

    This will create:

    pipeline_solid_dict = {
        'giver': <dagster.core.definitions.dependency.Solid object>,
        'sleeper_1': <dagster.core.definitions.dependency.Solid object>,
        'sleeper_2': <dagster.core.definitions.dependency.Solid object>,
        'sleeper_3': <dagster.core.definitions.dependency.Solid object>,
        'sleeper_4': <dagster.core.definitions.dependency.Solid object>,
        'total': <dagster.core.definitions.dependency.Solid object>
    }

    as well as a dagster.core.definitions.dependency.DependencyStructure object.
    """
    from .solid import NodeDefinition
    from .graph import GraphDefinition

    check.list_param(solid_defs, "solid_defs", of_type=NodeDefinition)
    check.dict_param(
        dependencies_dict,
        "dependencies_dict",
        key_type=six.string_types + (SolidInvocation,),
        value_type=dict,
    )
    # graph_definition is None in the context of a pipeline
    check.inst_param(graph_definition, "graph_definition", GraphDefinition)

    # Same as dep_dict but with SolidInvocation replaced by alias string
    aliased_dependencies_dict = {}

    # Keep track of solid name -> all aliases used and alias -> name
    name_to_aliases = defaultdict(set)
    alias_to_solid_instance = {}
    alias_to_name = {}

    for solid_key, input_dep_dict in dependencies_dict.items():
        # We allow deps of the form dependencies={'foo': DependencyDefinition('bar')}
        # Here, we replace 'foo' with SolidInvocation('foo')
        if not isinstance(solid_key, SolidInvocation):
            solid_key = SolidInvocation(solid_key)

        alias = solid_key.alias or solid_key.name

        name_to_aliases[solid_key.name].add(alias)
        alias_to_solid_instance[alias] = solid_key
        alias_to_name[alias] = solid_key.name
        aliased_dependencies_dict[alias] = input_dep_dict

    pipeline_solid_dict = _build_pipeline_solid_dict(
        solid_defs, name_to_aliases, alias_to_solid_instance, graph_definition
    )

    _validate_dependencies(aliased_dependencies_dict, pipeline_solid_dict, alias_to_name)

    dependency_structure = DependencyStructure.from_definitions(
        pipeline_solid_dict, aliased_dependencies_dict
    )

    return dependency_structure, pipeline_solid_dict
Example #16
 def __new__(cls, config_type, entries):
     return super(EvaluationStack, cls).__new__(
         cls,
         check.inst_param(config_type, 'config_type', ConfigType),
         check.list_param(entries, 'entries', of_type=EvaluationStackEntry),
     )
Example #17
def _create_solid_compute_wrapper(fn, input_defs, output_defs):
    check.callable_param(fn, 'fn')
    check.list_param(input_defs, 'input_defs', of_type=InputDefinition)
    check.list_param(output_defs, 'output_defs', of_type=OutputDefinition)

    input_names = [
        input_def.name
        for input_def in input_defs
        if not input_def.dagster_type.kind == DagsterTypeKind.NOTHING
    ]

    @wraps(fn)
    def compute(context, input_defs):
        kwargs = {}
        for input_name in input_names:
            kwargs[input_name] = input_defs[input_name]

        result = fn(context, **kwargs)

        if inspect.isgenerator(result):
            for item in result:
                yield item
        else:
            if isinstance(result, (Materialization, ExpectationResult)):
                raise DagsterInvariantViolationError(
                    (
                        'Error in solid {solid_name}: If you are returning a Materialization '
                        'or an ExpectationResult from solid you must yield them to avoid '
                        'ambiguity with an implied result from returning a value.'.format(
                            solid_name=context.solid.name
                        )
                    )
                )

            if isinstance(result, Output):
                yield result
            elif len(output_defs) == 1:
                yield Output(value=result, output_name=output_defs[0].name)
            elif result is not None:
                if not output_defs:
                    raise DagsterInvariantViolationError(
                        (
                            'Error in solid {solid_name}: Unexpectedly returned output {result} '
                            'of type {type_}. Solid is explicitly defined to return no '
                            'results.'
                        ).format(solid_name=context.solid.name, result=result, type_=type(result))
                    )

                raise DagsterInvariantViolationError(
                    (
                        'Error in solid {solid_name}: Solid unexpectedly returned '
                        'output {result} of type {type_}. Should '
                        'be a generator, containing or yielding '
                        '{n_results} results: {{{expected_results}}}.'
                    ).format(
                        solid_name=context.solid.name,
                        result=result,
                        type_=type(result),
                        n_results=len(output_defs),
                        expected_results=', '.join(
                            [
                                '\'{result_name}\': {dagster_type}'.format(
                                    result_name=output_def.name,
                                    dagster_type=output_def.dagster_type,
                                )
                                for output_def in output_defs
                            ]
                        ),
                    )
                )

    return compute
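
Two hypothetical compute bodies illustrating the shapes the wrapper above accepts: a bare return value is wrapped into the single declared output, while a generator must yield Output objects explicitly:

    def add_one(context, num):
        return num + 1  # wrapped as Output(num + 1, output_defs[0].name)

    def add_one_explicit(context, num):
        yield Output(num + 1, output_name='result')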
Example #18
def validate_solid_fn(
    decorator_name, fn_name, compute_fn, input_defs, expected_positionals=None, exclude_nothing=True
):
    check.str_param(decorator_name, 'decorator_name')
    check.str_param(fn_name, 'fn_name')
    check.callable_param(compute_fn, 'compute_fn')
    check.list_param(input_defs, 'input_defs', of_type=InputDefinition)
    expected_positionals = check.opt_list_param(
        expected_positionals, 'expected_positionals', of_type=(str, tuple)
    )
    if exclude_nothing:
        names = set(inp.name for inp in input_defs if not inp.runtime_type.is_nothing)
        nothing_names = set(inp.name for inp in input_defs if inp.runtime_type.is_nothing)
    else:
        names = set(inp.name for inp in input_defs)
        nothing_names = set()

    # Currently being super strict about naming. Might be a good idea to relax. Starting strict.
    try:
        _validate_decorated_fn(compute_fn, names, expected_positionals)
    except FunctionValidationError as e:
        if e.error_type == FunctionValidationError.TYPES['vararg']:
            raise DagsterInvalidDefinitionError(
                "{decorator_name} '{solid_name}' decorated function has positional vararg parameter "
                "'{e.param}'. Solid functions should only have keyword arguments that match "
                "input names and a first positional parameter named 'context'.".format(
                    decorator_name=decorator_name, solid_name=fn_name, e=e
                )
            )
        elif e.error_type == FunctionValidationError.TYPES['missing_name']:
            if e.param in nothing_names:
                raise DagsterInvalidDefinitionError(
                    "{decorator_name} '{solid_name}' decorated function has parameter '{e.param}' that is "
                    "one of the solid input_defs of type 'Nothing' which should not be included since "
                    "no data will be passed for it. ".format(
                        decorator_name=decorator_name, solid_name=fn_name, e=e
                    )
                )
            else:
                raise DagsterInvalidDefinitionError(
                    "{decorator_name} '{solid_name}' decorated function has parameter '{e.param}' that is not "
                    "one of the solid input_defs. Solid functions should only have keyword arguments "
                    "that match input names and a first positional parameter named 'context'.".format(
                        decorator_name=decorator_name, solid_name=fn_name, e=e
                    )
                )
        elif e.error_type == FunctionValidationError.TYPES['missing_positional']:
            raise DagsterInvalidDefinitionError(
                "{decorator_name} '{solid_name}' decorated function does not have required positional "
                "parameter '{e.param}'. Solid functions should only have keyword arguments "
                "that match input names and a first positional parameter named 'context'.".format(
                    decorator_name=decorator_name, solid_name=fn_name, e=e
                )
            )
        elif e.error_type == FunctionValidationError.TYPES['extra']:
            undeclared_inputs_printed = ", '".join(e.missing_names)
            raise DagsterInvalidDefinitionError(
                "{decorator_name} '{solid_name}' decorated function does not have parameter(s) "
                "'{undeclared_inputs_printed}', which are in solid's input_defs. Solid functions "
                "should only have keyword arguments that match input names and a first positional "
                "parameter named 'context'.".format(
                    decorator_name=decorator_name,
                    solid_name=fn_name,
                    undeclared_inputs_printed=undeclared_inputs_printed,
                )
            )
        else:
            raise e
Example #19
    def __init__(self,
                 solids,
                 name=None,
                 description=None,
                 context_definitions=None,
                 dependencies=None):
        '''
        Args:
            solids (List[SolidDefinition]): Solids in the pipeline
            name (str): Name. This is optional, mostly for situations that require ephemeral
                pipeline definitions for fast scaffolding or testing.
            description (str): Description of the pipeline.
            context_definitions (Dict[str, PipelineContextDefinition]): See class description.
            dependencies (Dict[str, Dict[str, DependencyDefinition]]): See class description.
        '''
        self.name = check.opt_str_param(name, 'name', '<<unnamed>>')
        self.description = check.opt_str_param(description, 'description')

        check.list_param(solids, 'solids')

        if context_definitions is None:
            context_definitions = default_pipeline_context_definitions()

        self.context_definitions = check.dict_param(
            context_definitions,
            'context_definitions',
            key_type=str,
            value_type=PipelineContextDefinition,
        )

        self.dependencies = check.opt_two_dim_dict_param(
            dependencies,
            'dependencies',
            key_type=six.string_types + (SolidInstance, ),
            value_type=DependencyDefinition,
        )

        dependency_structure, pipeline_solid_dict = create_execution_structure(
            solids, self.dependencies)

        self._solid_dict = pipeline_solid_dict
        self.dependency_structure = dependency_structure

        self.environment_cls = define_environment_cls(
            EnvironmentClassCreationData(
                self.name,
                list(self._solid_dict.values()),
                context_definitions,
                dependency_structure,
            ))
        self.environment_type = self.environment_cls.inst()

        self.context_cls = define_context_cls(self)
        self.context_type = self.context_cls.inst()

        (
            self._config_type_dict_by_name,
            self._config_type_dict_by_key,
        ) = construct_config_type_dictionary(solids, self.context_definitions,
                                             self.environment_type)

        self._runtime_type_dict = construct_runtime_type_dictionary(solids)
Example #20
    def execute_queries(self,
                        queries,
                        fetch_results=False,
                        cursor_factory=None,
                        error_callback=None):
        '''Synchronously execute a list of queries against Redshift. Will return a list of lists of
        rows, where each row is a tuple of values, e.g. ['SELECT 1', 'SELECT 1'] will return
        [[(1,)], [(1,)]].

        Args:
            queries (List[str]): The queries to execute.
            fetch_results (Optional[bool]): Whether to return the results of executing the query.
                Defaults to False, in which case the query will be executed without retrieving the
                results.
            cursor_factory (Optional[:py:class:`psycopg2.extensions.cursor`]): An alternative
                cursor_factory; defaults to None. Will be used when constructing the cursor.
            error_callback (Optional[Callable[[Exception, Cursor, DagsterLogManager], None]]): A
                callback function, invoked when an exception is encountered during query execution;
                this is intended to support executing additional queries to provide diagnostic
                information, e.g. by querying ``stl_load_errors`` using ``pg_last_copy_id()``. If no
                function is provided, exceptions during query execution will be raised directly.

        Returns:
            Optional[List[List[Tuple[Any, ...]]]]: Results of the query, as a list of lists of
                tuples, when fetch_results is set. Otherwise returns None.
        '''
        check.list_param(queries, 'queries', of_type=str)
        check.bool_param(fetch_results, 'fetch_results')
        check.opt_subclass_param(cursor_factory, 'cursor_factory',
                                 psycopg2.extensions.cursor)
        check.opt_callable_param(error_callback, 'error_callback')

        results = []
        with self._get_conn() as conn:
            with self._get_cursor(conn,
                                  cursor_factory=cursor_factory) as cursor:
                for query in queries:
                    six.ensure_str(query)

                    try:
                        self.log.info(
                            'Executing query \'{query}\''.format(query=query))
                        cursor.execute(query)

                        if fetch_results and cursor.rowcount > 0:
                            results.append(cursor.fetchall())
                        else:
                            results.append([])
                            self.log.info('Empty result from query')

                    except Exception as e:  # pylint: disable=broad-except
                        # If autocommit is disabled or not set (it is disabled by default), Redshift
                        # will be in the middle of a transaction at exception time, and because of
                        # the failure the current transaction will not accept any further queries.
                        #
                        # This conn.commit() call closes the open transaction before handing off
                        # control to the error callback, so that the user can issue additional
                        # queries. Notably, for e.g. pg_last_copy_id() to work, it requires you to
                        # use the same conn/cursor, so you have to do this conn.commit() to ensure
                        # things are in a usable state in the error callback.
                        if not self.autocommit:
                            conn.commit()

                        if error_callback is not None:
                            error_callback(e, cursor, self.log)
                        else:
                            raise

        if fetch_results:
            return results
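
A minimal usage sketch (the table name is hypothetical), assuming `redshift` is an instance of this resource class; with fetch_results=True it returns one list of row tuples per query:

    results = redshift.execute_queries(
        ['SELECT COUNT(*) FROM my_table', 'SELECT 1'],
        fetch_results=True,
    )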
Example #21
def construct_dagster_k8s_job(
    job_config,
    args,
    job_name,
    user_defined_k8s_config=None,
    pod_name=None,
    component=None,
    labels=None,
    env_vars=None,
):
    """Constructs a Kubernetes Job object for a dagster-graphql invocation.

    Args:
        job_config (DagsterK8sJobConfig): Job configuration to use for constructing the Kubernetes
            Job object.
        args (List[str]): CLI arguments to use with dagster-graphql in this Job.
        job_name (str): The name of the Job. Note that this name must be <= 63 characters in length.
        user_defined_k8s_config (Optional[UserDefinedDagsterK8sConfig]): Additional raw Kubernetes
            configuration to layer onto the constructed Job, Pod template, and Container. Defaults
            to an empty UserDefinedDagsterK8sConfig.
        pod_name (str, optional): The name of the Pod. Note that this name must be <= 63 characters
            in length. Defaults to "<job_name>-pod".
        component (str, optional): The name of the component, used to provide the Job label
            app.kubernetes.io/component. Defaults to None.
        labels (Optional[Dict[str, str]]): Additional labels to attach to k8s jobs and pod templates.
            Long label values may be truncated.
        env_vars (Optional[Dict[str, str]]): Additional environment variables to add to the K8s Container.

    Returns:
        kubernetes.client.V1Job: A Kubernetes Job object.
    """
    check.inst_param(job_config, "job_config", DagsterK8sJobConfig)
    check.list_param(args, "args", of_type=str)
    check.str_param(job_name, "job_name")
    user_defined_k8s_config = check.opt_inst_param(
        user_defined_k8s_config,
        "user_defined_k8s_config",
        UserDefinedDagsterK8sConfig,
        default=UserDefinedDagsterK8sConfig(),
    )

    pod_name = check.opt_str_param(pod_name, "pod_name", default=job_name + "-pod")
    check.opt_str_param(component, "component")
    check.opt_dict_param(env_vars, "env_vars", key_type=str, value_type=str)
    check.opt_dict_param(labels, "labels", key_type=str, value_type=str)

    check.invariant(
        len(job_name) <= MAX_K8S_NAME_LEN,
        "job_name is %d in length; Kubernetes Jobs cannot be longer than %d characters."
        % (len(job_name), MAX_K8S_NAME_LEN),
    )

    check.invariant(
        len(pod_name) <= MAX_K8S_NAME_LEN,
        "job_name is %d in length; Kubernetes Pods cannot be longer than %d characters."
        % (len(pod_name), MAX_K8S_NAME_LEN),
    )

    # See: https://kubernetes.io/docs/concepts/overview/working-with-objects/common-labels/
    k8s_common_labels = {
        "app.kubernetes.io/name": "dagster",
        "app.kubernetes.io/instance": "dagster",
        "app.kubernetes.io/version": dagster_version,
        "app.kubernetes.io/part-of": "dagster",
    }

    if component:
        k8s_common_labels["app.kubernetes.io/component"] = component

    additional_labels = {
        # Truncate too long label values to fit into 63-characters limit.
        # https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#syntax-and-character-set
        k: v[:63]
        for k, v in (labels or {}).items()
    }
    dagster_labels = merge_dicts(k8s_common_labels, additional_labels)

    env = [kubernetes.client.V1EnvVar(name="DAGSTER_HOME", value=job_config.dagster_home)]
    if job_config.postgres_password_secret:
        env.append(
            kubernetes.client.V1EnvVar(
                name=DAGSTER_PG_PASSWORD_ENV_VAR,
                value_from=kubernetes.client.V1EnvVarSource(
                    secret_key_ref=kubernetes.client.V1SecretKeySelector(
                        name=job_config.postgres_password_secret, key=DAGSTER_PG_PASSWORD_SECRET_KEY
                    )
                ),
            )
        )

    additional_k8s_env_vars = []
    if env_vars:
        for key, value in env_vars.items():
            additional_k8s_env_vars.append(kubernetes.client.V1EnvVar(name=key, value=value))

    user_defined_k8s_env_vars = user_defined_k8s_config.container_config.pop("env", [])
    for env_var in user_defined_k8s_env_vars:
        additional_k8s_env_vars.append(
            k8s_model_from_dict(kubernetes.client.models.V1EnvVar, env_var)
        )

    user_defined_k8s_env_from = user_defined_k8s_config.container_config.pop("env_from", [])
    additional_k8s_env_from = []
    for env_from in user_defined_k8s_env_from:
        config_map_ref_args = env_from.get("config_map_ref")
        config_map_ref = (
            kubernetes.client.V1ConfigMapEnvSource(**config_map_ref_args)
            if config_map_ref_args
            else None
        )
        secret_ref_args = env_from.get("secret_ref")
        secret_ref = (
            kubernetes.client.V1SecretEnvSource(**secret_ref_args) if secret_ref_args else None
        )

        additional_k8s_env_from.append(
            kubernetes.client.V1EnvFromSource(
                config_map_ref=config_map_ref, prefix=env_from.get("prefix"), secret_ref=secret_ref
            )
        )

    volume_mounts = [
        kubernetes.client.V1VolumeMount(
            name="dagster-instance",
            mount_path="{dagster_home}/dagster.yaml".format(dagster_home=job_config.dagster_home),
            sub_path="dagster.yaml",
        )
    ] + [
        k8s_model_from_dict(kubernetes.client.models.V1VolumeMount, mount)
        for mount in job_config.volume_mounts
    ]

    job_image = user_defined_k8s_config.container_config.pop("image", job_config.job_image)

    user_defined_k8s_volume_mounts = user_defined_k8s_config.container_config.pop(
        "volume_mounts", []
    )
    for volume_mount in user_defined_k8s_volume_mounts:
        volume_mounts.append(
            k8s_model_from_dict(kubernetes.client.models.V1VolumeMount, volume_mount)
        )

    job_container = kubernetes.client.V1Container(
        name="dagster",
        image=job_image,
        args=args,
        image_pull_policy=job_config.image_pull_policy,
        env=env + job_config.env + additional_k8s_env_vars,
        env_from=job_config.env_from_sources + additional_k8s_env_from,
        volume_mounts=volume_mounts,
        **user_defined_k8s_config.container_config,
    )

    user_defined_volumes = user_defined_k8s_config.pod_spec_config.pop("volumes", [])

    volumes = [
        kubernetes.client.V1Volume(
            name="dagster-instance",
            config_map=kubernetes.client.V1ConfigMapVolumeSource(
                name=job_config.instance_config_map
            ),
        )
    ]

    for volume in job_config.volumes + user_defined_volumes:
        new_volume = k8s_model_from_dict(
            kubernetes.client.models.V1Volume,
            volume,
        )
        volumes.append(new_volume)

    # If the user has defined custom labels, remove them from the pod_template_spec_metadata
    # key and merge them with the dagster labels
    user_defined_pod_template_labels = user_defined_k8s_config.pod_template_spec_metadata.pop(
        "labels", {}
    )

    service_account_name = user_defined_k8s_config.pod_spec_config.pop(
        "service_account_name", job_config.service_account_name
    )

    template = kubernetes.client.V1PodTemplateSpec(
        metadata=kubernetes.client.V1ObjectMeta(
            name=pod_name,
            labels=merge_dicts(dagster_labels, user_defined_pod_template_labels),
            **user_defined_k8s_config.pod_template_spec_metadata,
        ),
        spec=kubernetes.client.V1PodSpec(
            image_pull_secrets=[
                kubernetes.client.V1LocalObjectReference(name=x["name"])
                for x in job_config.image_pull_secrets
            ],
            service_account_name=service_account_name,
            restart_policy="Never",
            containers=[job_container],
            volumes=volumes,
            **user_defined_k8s_config.pod_spec_config,
        ),
    )

    job_spec_config = merge_dicts(
        DEFAULT_JOB_SPEC_CONFIG,
        user_defined_k8s_config.job_spec_config,
    )

    job = kubernetes.client.V1Job(
        api_version="batch/v1",
        kind="Job",
        metadata=kubernetes.client.V1ObjectMeta(
            name=job_name, labels=dagster_labels, **user_defined_k8s_config.job_metadata
        ),
        spec=kubernetes.client.V1JobSpec(
            template=template,
            **job_spec_config,
        ),
        **user_defined_k8s_config.job_config,
    )
    return job
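
A hedged usage sketch, assuming `job_config` is an already-built DagsterK8sJobConfig and `input_json` is a serialized ExecuteStepArgs payload as in Example #22; the resulting object is submitted with the same API call used there:

    job = construct_dagster_k8s_job(
        job_config,
        args=['api', 'execute_step_with_structured_logs', input_json],
        job_name='dagster-job-abc123',
        component='step_worker',
        labels={'team': 'data-platform'},
    )
    kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace='dagster')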
Example #22
    def _execute_step_k8s_job(
        self,
        instance_ref_dict,
        step_keys,
        run_config,
        mode,
        repo_name,
        repo_location_name,
        run_id,
        job_config_dict,
        job_namespace,
        load_incluster_config,
        retries_dict,
        pipeline_origin_packed,
        user_defined_k8s_config_dict=None,
        kubeconfig_file=None,
    ):
        """Run step execution in a K8s job pod.
        """

        check.dict_param(instance_ref_dict, "instance_ref_dict")
        check.list_param(step_keys, "step_keys", of_type=str)
        check.invariant(
            len(step_keys) == 1,
            "Celery K8s task executor can only execute 1 step at a time")
        check.dict_param(run_config, "run_config")
        check.str_param(mode, "mode")
        check.str_param(repo_name, "repo_name")
        check.str_param(repo_location_name, "repo_location_name")
        check.str_param(run_id, "run_id")

        # Celery will serialize this as a list
        job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
        check.inst_param(job_config, "job_config", DagsterK8sJobConfig)
        check.str_param(job_namespace, "job_namespace")

        check.bool_param(load_incluster_config, "load_incluster_config")
        check.dict_param(retries_dict, "retries_dict")

        pipeline_origin = unpack_value(
            check.dict_param(
                pipeline_origin_packed,
                "pipeline_origin_packed")  # TODO: make part of args
        )
        check.inst(pipeline_origin, PipelineOrigin)

        user_defined_k8s_config = UserDefinedDagsterK8sConfig.from_dict(
            user_defined_k8s_config_dict)
        check.opt_inst_param(
            user_defined_k8s_config,
            "user_defined_k8s_config",
            UserDefinedDagsterK8sConfig,
        )
        check.opt_str_param(kubeconfig_file, "kubeconfig_file")

        # For when launched via DinD or running the cluster
        if load_incluster_config:
            kubernetes.config.load_incluster_config()
        else:
            kubernetes.config.load_kube_config(kubeconfig_file)

        instance_ref = InstanceRef.from_dict(instance_ref_dict)
        instance = DagsterInstance.from_ref(instance_ref)
        pipeline_run = instance.get_run_by_id(run_id)

        check.invariant(pipeline_run, "Could not load run {}".format(run_id))
        step_key = step_keys[0]

        celery_worker_name = self.request.hostname
        celery_pod_name = os.environ.get("HOSTNAME")
        instance.report_engine_event(
            "Task for step {step_key} picked up by Celery".format(
                step_key=step_key),
            pipeline_run,
            EngineEventData([
                EventMetadataEntry.text(celery_worker_name,
                                        "Celery worker name"),
                EventMetadataEntry.text(celery_pod_name,
                                        "Celery worker Kubernetes Pod name"),
            ]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )

        if pipeline_run.status != PipelineRunStatus.STARTED:
            instance.report_engine_event(
                "Not scheduling step because pipeline run status is not STARTED",
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(step_key, "Step key"),
                ]),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            return []

        # Ensure we stay below k8s name length limits
        k8s_name_key = get_k8s_job_name(run_id, step_key)

        retries = Retries.from_config(retries_dict)

        if retries.get_attempt_count(step_key):
            attempt_number = retries.get_attempt_count(step_key)
            job_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)
            pod_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)
        else:
            job_name = "dagster-job-%s" % (k8s_name_key)
            pod_name = "dagster-job-%s" % (k8s_name_key)

        input_json = serialize_dagster_namedtuple(
            ExecuteStepArgs(
                pipeline_origin=pipeline_origin,
                pipeline_run_id=run_id,
                instance_ref=None,
                mode=mode,
                step_keys_to_execute=step_keys,
                run_config=run_config,
                retries_dict=retries_dict,
            ))
        command = ["dagster"]
        args = ["api", "execute_step_with_structured_logs", input_json]

        job = construct_dagster_k8s_job(job_config, command, args, job_name,
                                        user_defined_k8s_config, pod_name)

        # Running list of events generated from this task execution
        events = []

        # Post event for starting execution
        job_name = job.metadata.name
        engine_event = instance.report_engine_event(
            "Executing step {} in Kubernetes job {}".format(
                step_key, job_name),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, "Step key"),
                    EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                    EventMetadataEntry.text(pod_name, "Kubernetes Pod name"),
                    EventMetadataEntry.text(job_config.job_image, "Job image"),
                    EventMetadataEntry.text(job_config.image_pull_policy,
                                            "Image pull policy"),
                    EventMetadataEntry.text(str(job_config.image_pull_secrets),
                                            "Image pull secrets"),
                    EventMetadataEntry.text(
                        str(job_config.service_account_name),
                        "Service account name"),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryK8sJobExecutor,
            # validated above that step_keys is length 1, and it is not possible to use ETH or
            # execution plan in this function (Celery K8s workers should not have access to user code)
            step_key=step_key,
        )
        events.append(engine_event)
        try:
            kubernetes.client.BatchV1Api().create_namespaced_job(
                body=job, namespace=job_namespace)
        except kubernetes.client.rest.ApiException as e:
            if e.reason == "Conflict":
                # There is an existing job with the same name so do not proceed.
                instance.report_engine_event(
                    "Did not create Kubernetes job {} for step {} since job name already "
                    "exists, exiting.".format(job_name, step_key),
                    pipeline_run,
                    EngineEventData(
                        [
                            EventMetadataEntry.text(step_key, "Step key"),
                            EventMetadataEntry.text(job_name,
                                                    "Kubernetes Job name"),
                            EventMetadataEntry.text(pod_name,
                                                    "Kubernetes Pod name"),
                        ],
                        marker_end=DELEGATE_MARKER,
                    ),
                    CeleryK8sJobExecutor,
                    step_key=step_key,
                )
            else:
                instance.report_engine_event(
                    "Encountered unexpected error while creating Kubernetes job {} for step {}, "
                    "exiting.".format(job_name, step_key),
                    pipeline_run,
                    EngineEventData(
                        [
                            EventMetadataEntry.text(step_key, "Step key"),
                        ],
                        error=serializable_error_info_from_exc_info(
                            sys.exc_info()),
                    ),
                    CeleryK8sJobExecutor,
                    step_key=step_key,
                )
            return []

        try:
            wait_for_job_success(
                job_name=job_name,
                namespace=job_namespace,
                instance=instance,
                run_id=run_id,
            )
        except (DagsterK8sError, DagsterK8sTimeoutError) as err:
            step_failure_event = construct_step_failure_event_and_handle(
                pipeline_run, step_key, err, instance=instance)
            events.append(step_failure_event)
        except DagsterK8sPipelineStatusException:
            instance.report_engine_event(
                "Terminating Kubernetes Job because pipeline run status is not STARTED",
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(step_key, "Step key"),
                    EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                    EventMetadataEntry.text(job_namespace,
                                            "Kubernetes Job namespace"),
                ]),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            delete_job(job_name=job_name, namespace=job_namespace)
            return []
        except (
                DagsterK8sUnrecoverableAPIError,
                DagsterK8sAPIRetryLimitExceeded,
                # We shouldn't see unwrapped APIExceptions anymore, as they should all be wrapped in
                # a retry boundary. We still catch it here just in case we missed one so that we can
                # report it to the event log
                kubernetes.client.rest.ApiException,
        ) as err:
            instance.report_engine_event(
                "Encountered unexpected error while waiting on Kubernetes job {} for step {}, "
                "exiting.".format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, "Step key"),
                    ],
                    error=serializable_error_info_from_exc_info(
                        sys.exc_info()),
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            return []

        try:
            pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)
        except kubernetes.client.rest.ApiException as e:
            instance.report_engine_event(
                "Encountered unexpected error retreiving Pods for Kubernetes job {} for step {}, "
                "exiting.".format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, "Step key"),
                    ],
                    error=serializable_error_info_from_exc_info(
                        sys.exc_info()),
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            return []

        # Post engine event for log retrieval
        engine_event = instance.report_engine_event(
            "Retrieving logs from Kubernetes Job pods",
            pipeline_run,
            EngineEventData(
                [EventMetadataEntry.text("\n".join(pod_names), "Pod names")]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        events.append(engine_event)

        logs = []
        for pod_name in pod_names:
            try:
                raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)
                logs += raw_logs.split("\n")
            except kubernetes.client.rest.ApiException as e:
                instance.report_engine_event(
                    "Encountered unexpected error while fetching pod logs for Kubernetes job {}, "
                    "Pod name {} for step {}. Will attempt to continue with other pods."
                    .format(job_name, pod_name, step_key),
                    pipeline_run,
                    EngineEventData(
                        [
                            EventMetadataEntry.text(step_key, "Step key"),
                        ],
                        error=serializable_error_info_from_exc_info(
                            sys.exc_info()),
                    ),
                    CeleryK8sJobExecutor,
                    step_key=step_key,
                )

        events += filter_dagster_events_from_pod_logs(logs)
        serialized_events = [
            serialize_dagster_namedtuple(event) for event in events
        ]
        return serialized_events
Example #23
 def uri_for_paths(self, paths, protocol=None):
     check.list_param(paths, 'paths', of_type=str)
     check.param_invariant(len(paths) > 0, 'paths')
     key = self.key_for_paths(paths)
     return self.object_store.uri_for_key(key, protocol)
Example #24
 def _iterate_errors_at_level(self, levels):
     check.list_param(levels, 'levels', of_type=str)
     for error in self.errors:
         if error.stack.levels == levels:
             yield error
Example #25
def iterate_metadata_entries(metadata_entries):
    from ..schema.logs.events import (
        GrapheneEventFloatMetadataEntry,
        GrapheneEventIntMetadataEntry,
        GrapheneEventJsonMetadataEntry,
        GrapheneEventMarkdownMetadataEntry,
        GrapheneEventPathMetadataEntry,
        GrapheneEventPythonArtifactMetadataEntry,
        GrapheneEventTextMetadataEntry,
        GrapheneEventUrlMetadataEntry,
        GrapheneEventPipelineRunMetadataEntry,
        GrapheneEventAssetMetadataEntry,
    )

    check.list_param(metadata_entries,
                     "metadata_entries",
                     of_type=EventMetadataEntry)
    for metadata_entry in metadata_entries:
        if isinstance(metadata_entry.entry_data, PathMetadataEntryData):
            yield GrapheneEventPathMetadataEntry(
                label=metadata_entry.label,
                description=metadata_entry.description,
                path=metadata_entry.entry_data.path,
            )
        elif isinstance(metadata_entry.entry_data, JsonMetadataEntryData):
            yield GrapheneEventJsonMetadataEntry(
                label=metadata_entry.label,
                description=metadata_entry.description,
                jsonString=seven.json.dumps(metadata_entry.entry_data.data),
            )
        elif isinstance(metadata_entry.entry_data, TextMetadataEntryData):
            yield GrapheneEventTextMetadataEntry(
                label=metadata_entry.label,
                description=metadata_entry.description,
                text=metadata_entry.entry_data.text,
            )
        elif isinstance(metadata_entry.entry_data, UrlMetadataEntryData):
            yield GrapheneEventUrlMetadataEntry(
                label=metadata_entry.label,
                description=metadata_entry.description,
                url=metadata_entry.entry_data.url,
            )
        elif isinstance(metadata_entry.entry_data, MarkdownMetadataEntryData):
            yield GrapheneEventMarkdownMetadataEntry(
                label=metadata_entry.label,
                description=metadata_entry.description,
                md_str=metadata_entry.entry_data.md_str,
            )
        elif isinstance(metadata_entry.entry_data,
                        PythonArtifactMetadataEntryData):
            yield GrapheneEventPythonArtifactMetadataEntry(
                label=metadata_entry.label,
                description=metadata_entry.description,
                module=metadata_entry.entry_data.module,
                name=metadata_entry.entry_data.name,
            )
        elif isinstance(metadata_entry.entry_data, FloatMetadataEntryData):
            float_val = metadata_entry.entry_data.value

            # coerce NaN to null
            if isnan(float_val):
                float_val = None

            yield GrapheneEventFloatMetadataEntry(
                label=metadata_entry.label,
                description=metadata_entry.description,
                floatValue=float_val,
            )
        elif isinstance(metadata_entry.entry_data, IntMetadataEntryData):
            # coerce > 32 bit ints to null
            int_val = None
            if MIN_INT <= metadata_entry.entry_data.value <= MAX_INT:
                int_val = metadata_entry.entry_data.value

            yield GrapheneEventIntMetadataEntry(
                label=metadata_entry.label,
                description=metadata_entry.description,
                intValue=int_val,
                # make string representation available to allow for > 32bit int
                intRepr=str(metadata_entry.entry_data.value),
            )
        elif isinstance(metadata_entry.entry_data,
                        DagsterPipelineRunMetadataEntryData):
            yield GrapheneEventPipelineRunMetadataEntry(
                label=metadata_entry.label,
                description=metadata_entry.description,
                runId=metadata_entry.entry_data.run_id,
            )
        elif isinstance(metadata_entry.entry_data,
                        DagsterAssetMetadataEntryData):
            yield GrapheneEventAssetMetadataEntry(
                label=metadata_entry.label,
                description=metadata_entry.description,
                assetKey=metadata_entry.entry_data.asset_key,
            )
        else:
            # skip rest for now
            check.not_implemented(
                "{} unsupported metadata entry for now".format(
                    type(metadata_entry.entry_data)))
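A rough usage sketch for the generator above, assuming EventMetadataEntry is importable from the top-level dagster package (as it was in the Dagster versions these examples come from); the labels and values are made up.

from dagster import EventMetadataEntry

entries = [
    EventMetadataEntry.text("hello", "greeting"),
    EventMetadataEntry.url("https://dagster.io", "homepage"),
    EventMetadataEntry.float(3.14, "pi_ish"),
]

# Each entry is mapped to the matching Graphene type for the GraphQL layer.
for graphene_entry in iterate_metadata_entries(entries):
    print(type(graphene_entry).__name__, graphene_entry.label)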
Example #26
0
 def get_db_prefix(path, legacy=False):
     check.list_param(path, "path", of_type=str)
     if legacy:
         return ASSET_KEY_STRUCTURED_DELIMITER.join(path)
     return seven.json.dumps(
         path)[:-2]  # strip trailing '"]' from json string
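The JSON-slicing trick is easier to see with a concrete value. A small sketch using the standard library json (seven.json is a thin compatibility wrapper around it); stripping the trailing '"]' leaves a string that is a prefix of the serialization of any longer path, which is presumably what makes prefix-style database queries on asset keys work.

import json

path = ["my_asset", "daily"]
prefix = json.dumps(path)[:-2]          # '["my_asset", "daily'
child = json.dumps(["my_asset", "daily", "2020-01-01"])

# Both the path itself and any path nested under it serialize to strings
# that start with the computed prefix.
assert json.dumps(path).startswith(prefix)
assert child.startswith(prefix)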
Example #27
0
    def store_events(self, new_events):
        check.list_param(new_events, 'new_events', of_type=EventRecord)

        with self._log_storage_lock:
            self._log_sequence = self._log_sequence.extend(new_events)
Example #28
0
    def __new__(
        cls,
        name=None,
        resource_defs=None,
        logger_defs=None,
        system_storage_defs=None,
        executor_defs=None,
        description=None,
        intermediate_storage_defs=None,
    ):
        from dagster.core.storage.system_storage import (
            default_system_storage_defs,
            default_intermediate_storage_defs,
        )

        from .system_storage import SystemStorageDefinition
        from .intermediate_storage import IntermediateStorageDefinition

        if system_storage_defs is not None and intermediate_storage_defs is None:
            warnings.warn(
                "system_storage_defs are deprecated and will be removed in 0.10.0 "
                "and should be replaced with "
                "intermediate_storage_defs for intermediates and resource_defs for files"
            )

        check.opt_dict_param(resource_defs,
                             "resource_defs",
                             key_type=str,
                             value_type=ResourceDefinition)
        if resource_defs and "asset_store" in resource_defs:
            resource_defs_with_defaults = resource_defs
        else:
            from dagster.core.storage.asset_store import mem_asset_store

            resource_defs_with_defaults = merge_dicts(
                {"asset_store": mem_asset_store}, resource_defs or {})

        return super(ModeDefinition, cls).__new__(
            cls,
            name=check_valid_name(name) if name else DEFAULT_MODE_NAME,
            resource_defs=resource_defs_with_defaults,
            loggers=(check.opt_dict_param(logger_defs,
                                          "logger_defs",
                                          key_type=str,
                                          value_type=LoggerDefinition)
                     or default_loggers()),
            system_storage_defs=check.list_param(
                system_storage_defs
                if system_storage_defs else default_system_storage_defs,
                "system_storage_defs",
                of_type=SystemStorageDefinition,
            ),
            intermediate_storage_defs=check.list_param(
                intermediate_storage_defs if intermediate_storage_defs else
                default_intermediate_storage_defs,
                "intermediate_storage_defs",
                of_type=IntermediateStorageDefinition,
            ),
            executor_defs=check.list_param(
                executor_defs if executor_defs else default_executors,
                "executor_defs",
                of_type=ExecutorDefinition,
            ),
            description=check.opt_str_param(description, "description"),
        )
Example #29
0
 def __init__(self, manager_fn, marks):
     self.manager_fn = check.callable_param(manager_fn, "manager_fn")
     self.marks = check.list_param(marks, "marks")
Example #30
0
def construct_dagster_graphql_k8s_job(job_config, args, job_name, pod_name=None, component=None):
    '''Constructs a Kubernetes Job object for a dagster-graphql invocation.

    Args:
        job_config (DagsterK8sJobConfig): Job configuration to use for constructing the Kubernetes
            Job object.
        args (List[str]): CLI arguments to use with dagster-graphql in this Job.
        job_name (str): The name of the Job. Note that this name must be <= 63 characters in length.
        pod_name (str, optional): The name of the Pod. Note that this name must be <= 63 characters
            in length. Defaults to "<job_name>-pod".
        component (str, optional): The name of the component, used to provide the Job label
            app.kubernetes.io/component. Defaults to None.

    Returns:
        kubernetes.client.V1Job: A Kubernetes Job object.
    '''
    check.inst_param(job_config, 'job_config', DagsterK8sJobConfig)
    check.list_param(args, 'args', of_type=str)
    check.str_param(job_name, 'job_name')
    pod_name = check.opt_str_param(pod_name, 'pod_name', default=job_name + '-pod')
    check.opt_str_param(component, 'component')

    check.invariant(
        len(job_name) <= MAX_K8S_NAME_LEN,
        'job_name is %d in length; Kubernetes Jobs cannot be longer than %d characters.'
        % (len(job_name), MAX_K8S_NAME_LEN),
    )

    check.invariant(
        len(pod_name) <= MAX_K8S_NAME_LEN,
        'pod_name is %d in length; Kubernetes Pods cannot be longer than %d characters.'
        % (len(pod_name), MAX_K8S_NAME_LEN),
    )

    # See: https://kubernetes.io/docs/concepts/overview/working-with-objects/common-labels/
    dagster_labels = {
        'app.kubernetes.io/name': 'dagster',
        'app.kubernetes.io/instance': 'dagster',
        'app.kubernetes.io/version': dagster_version,
        'app.kubernetes.io/part-of': 'dagster',
    }

    if component:
        dagster_labels['app.kubernetes.io/component'] = component

    job_container = kubernetes.client.V1Container(
        name=job_name,
        image=job_config.job_image,
        command=['dagster-graphql'],
        args=args,
        image_pull_policy=job_config.image_pull_policy,
        env=[
            kubernetes.client.V1EnvVar(name='DAGSTER_HOME', value=job_config.dagster_home),
            kubernetes.client.V1EnvVar(
                name=DAGSTER_PG_PASSWORD_ENV_VAR,
                value_from=kubernetes.client.V1EnvVarSource(
                    secret_key_ref=kubernetes.client.V1SecretKeySelector(
                        name=job_config.postgres_password_secret, key=DAGSTER_PG_PASSWORD_SECRET_KEY
                    )
                ),
            ),
        ],
        env_from=job_config.env_from_sources,
        volume_mounts=[
            kubernetes.client.V1VolumeMount(
                name='dagster-instance',
                mount_path='{dagster_home}/dagster.yaml'.format(
                    dagster_home=job_config.dagster_home
                ),
                sub_path='dagster.yaml',
            )
        ],
    )

    config_map_volume = kubernetes.client.V1Volume(
        name='dagster-instance',
        config_map=kubernetes.client.V1ConfigMapVolumeSource(name=job_config.instance_config_map),
    )

    template = kubernetes.client.V1PodTemplateSpec(
        metadata=kubernetes.client.V1ObjectMeta(name=pod_name, labels=dagster_labels),
        spec=kubernetes.client.V1PodSpec(
            image_pull_secrets=job_config.image_pull_secrets,
            service_account_name=job_config.service_account_name,
            restart_policy='Never',
            containers=[job_container],
            volumes=[config_map_volume],
        ),
    )

    job = kubernetes.client.V1Job(
        api_version='batch/v1',
        kind='Job',
        metadata=kubernetes.client.V1ObjectMeta(name=job_name, labels=dagster_labels),
        spec=kubernetes.client.V1JobSpec(
            template=template,
            backoff_limit=K8S_JOB_BACKOFF_LIMIT,
            ttl_seconds_after_finished=K8S_JOB_TTL_SECONDS_AFTER_FINISHED,
        ),
    )
    return job
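A hedged usage sketch for the function above. The DagsterK8sJobConfig field values are assumptions about a typical dagster-k8s setup, and the GraphQL CLI arguments are illustrative only, not the exact invocation Dagster itself constructs.

from dagster_k8s.job import DagsterK8sJobConfig

job_config = DagsterK8sJobConfig(
    job_image="my-registry/my-dagster-image:latest",       # assumption
    dagster_home="/opt/dagster/dagster_home",               # assumption
    instance_config_map="dagster-instance",                  # assumption
    postgres_password_secret="dagster-postgresql-secret",    # assumption
    image_pull_policy="IfNotPresent",
)

job = construct_dagster_graphql_k8s_job(
    job_config,
    args=["-p", "executeRunInProcess", "-v", '{"runId": "1234"}'],  # illustrative
    job_name="dagster-run-1234",
)
# `job` is a kubernetes.client.V1Job that can be submitted via the batch API.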