Example 1
def execute_script_file(shell_script_path, output_logging, log, cwd=None, env=None):
    """Execute a shell script file specified by the argument ``shell_command``. The script will be
    invoked via ``subprocess.Popen(['bash', shell_script_path], ...)``.

    In the Popen invocation, ``stdout=PIPE, stderr=STDOUT`` is used, and the combined stdout/stderr
    output is retrieved.

    Args:
        shell_script_path (str): The path of the shell script file to execute
        output_logging (str): The logging mode to use. Supports STREAM, BUFFER, and NONE.
        log (Union[logging.Logger, DagsterLogManager]): Any logger which responds to .info()
        cwd (str, optional): Working directory for the shell command to use. Defaults to the
            directory containing the shell script file.
        env (Dict[str, str], optional): Environment dictionary to pass to ``subprocess.Popen``.
            Unused by default.

    Raises:
        Exception: When an invalid output_logging is selected. Unreachable from solid-based
            invocation since the config system will check output_logging against the config
            enum.

    Returns:
        Tuple[str, int]: A tuple of the combined stdout/stderr output of running the shell script
            and the process return code.
    """
    check.str_param(shell_script_path, "shell_script_path")
    check.str_param(output_logging, "output_logging")
    cwd = check.opt_str_param(cwd, "cwd", default=os.path.dirname(shell_script_path))
    env = check.opt_dict_param(env, "env")

    def pre_exec():
        # Restore default signal disposition and invoke setsid
        for sig in ("SIGPIPE", "SIGXFZ", "SIGXFSZ"):
            if hasattr(signal, sig):
                signal.signal(getattr(signal, sig), signal.SIG_DFL)
        os.setsid()

    with open(shell_script_path, "rb") as f:
        shell_command = six.ensure_str(f.read())

    log.info("Running command:\n{command}".format(command=shell_command))

    # pylint: disable=subprocess-popen-preexec-fn
    sub_process = Popen(
        ["bash", shell_script_path],
        stdout=PIPE,
        stderr=STDOUT,
        cwd=cwd,
        env=env,
        preexec_fn=pre_exec,
    )

    # Will return the string result of reading stdout of the shell command
    output = ""

    if output_logging not in ["STREAM", "BUFFER", "NONE"]:
        raise Exception("Unrecognized output_logging %s" % output_logging)

    # Stream back logs as they are emitted
    if output_logging == "STREAM":
        for raw_line in iter(sub_process.stdout.readline, b""):
            line = six.ensure_str(raw_line)
            log.info(line.rstrip())
            output += line

    sub_process.wait()

    # Collect and buffer all logs, then emit
    if output_logging == "BUFFER":
        output = "".join(
            [six.ensure_str(raw_line) for raw_line in iter(sub_process.stdout.readline, b"")]
        )
        log.info(output)

    # no logging in this case
    elif output_logging == "NONE":
        pass

    log.info("Command exited with return code {retcode}".format(retcode=sub_process.returncode))

    return output, sub_process.returncode
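A minimal usage sketch for the function above. It assumes ``execute_script_file`` is importable (e.g. from ``dagster_shell.utils``); the temporary script and logger are illustrative.

# Usage sketch (assumes execute_script_file above is importable, e.g. from dagster_shell.utils;
# the temporary script and logger below are illustrative).
import logging
import tempfile

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("shell_example")

with tempfile.NamedTemporaryFile("w", suffix=".sh", delete=False) as f:
    f.write("echo 'hello from the script'\n")
    script_path = f.name

output, retcode = execute_script_file(script_path, "STREAM", log)
assert retcode == 0
assert "hello from the script" in output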
Example 2
 def __init__(self, reload_trigger):
     self.reload_trigger = check.opt_str_param(reload_trigger, 'reload_trigger')
Example 3
def monthly_schedule(
    pipeline_name,
    start_date,
    name=None,
    execution_day_of_month=1,
    execution_time=datetime.time(0, 0),
    tags_fn_for_date=None,
    solid_selection=None,
    mode="default",
    should_execute=None,
    environment_vars=None,
    end_date=None,
    execution_timezone=None,
):
    """Create a schedule that runs monthly.

    The decorated function will be called as the ``run_config_fn`` of the underlying
    :py:class:`~dagster.ScheduleDefinition` and should take a
    :py:class:`~dagster.ScheduleExecutionContext` as its only argument, returning the environment
    dict for the scheduled execution.

    Args:
        pipeline_name (str): The name of the pipeline to execute when the schedule runs.
        start_date (datetime.datetime): The date from which to run the schedule.
        name (Optional[str]): The name of the schedule to create.
        execution_day_of_month (int): The day of the month on which to run the schedule (must be
            between 1 and 31).
        execution_time (datetime.time): The time at which to execute the schedule.
        tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A
            function that generates tags to attach to the schedule's runs. Takes the date of the
            schedule run and returns a dictionary of tags (string key-value pairs).
        solid_selection (Optional[List[str]]): A list of solid subselection (including single
            solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``
        mode (Optional[str]): The pipeline mode in which to execute this schedule.
            (Default: 'default')
        should_execute (Optional[Callable[ScheduleExecutionContext, bool]]): A function that runs at
            schedule execution time to determine whether a schedule should execute or skip. Takes a
            :py:class:`~dagster.ScheduleExecutionContext` and returns a boolean (``True`` if the
            schedule should execute). Defaults to a function that always returns ``True``.
        environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing
            the schedule.
        end_date (Optional[datetime.datetime]): The last time to run the schedule. Defaults to the
            current time.
        execution_timezone (Optional[str]): Timezone in which the schedule should run. Only works
            with DagsterDaemonScheduler, and must be set when using that scheduler.
    """
    check.opt_str_param(name, "name")
    check.inst_param(start_date, "start_date", datetime.datetime)
    check.opt_inst_param(end_date, "end_date", datetime.datetime)
    check.opt_callable_param(tags_fn_for_date, "tags_fn_for_date")
    check.opt_nullable_list_param(solid_selection,
                                  "solid_selection",
                                  of_type=str)
    mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)
    check.opt_callable_param(should_execute, "should_execute")
    check.opt_dict_param(environment_vars,
                         "environment_vars",
                         key_type=str,
                         value_type=str)
    check.str_param(pipeline_name, "pipeline_name")
    check.int_param(execution_day_of_month, "execution_day")
    check.inst_param(execution_time, "execution_time", datetime.time)
    check.opt_str_param(execution_timezone, "execution_timezone")

    if (start_date.day != 1 or start_date.hour != 0 or start_date.minute != 0
            or start_date.second != 0):
        warnings.warn(
            "`start_date` must be at the beginning of the first day of the month for a monthly "
            "schedule. Use `execution_day_of_month` and `execution_time` to execute the schedule "
            "at a specific time within the month. For example, to run the schedule at 3AM on the "
            "23rd of each month starting in October, your schedule definition would look like:"
            """
@monthly_schedule(
    start_date=datetime.datetime(2020, 10, 1),
    execution_day_of_month=23,
    execution_time=datetime.time(3, 0)
)
def my_schedule_definition(_):
    ...
""")

    if execution_day_of_month <= 0 or execution_day_of_month > 31:
        raise DagsterInvalidDefinitionError(
            "`execution_day_of_month={}` is not valid for monthly schedule. Execution day must be "
            "between 1 and 31".format(execution_day_of_month))

    cron_schedule = "{minute} {hour} {day} * *".format(
        minute=execution_time.minute,
        hour=execution_time.hour,
        day=execution_day_of_month)

    fmt = DEFAULT_MONTHLY_FORMAT

    execution_time_to_partition_fn = (lambda d: pendulum.instance(d).replace(
        hour=0, minute=0).subtract(months=1, days=execution_day_of_month - 1))

    partition_fn = schedule_partition_range(
        start_date,
        end=end_date,
        cron_schedule=cron_schedule,
        fmt=fmt,
        timezone=execution_timezone,
        execution_time_to_partition_fn=execution_time_to_partition_fn,
    )

    def inner(fn):
        check.callable_param(fn, "fn")

        schedule_name = name or fn.__name__

        tags_fn_for_partition_value = lambda partition: {}
        if tags_fn_for_date:
            tags_fn_for_partition_value = lambda partition: tags_fn_for_date(
                partition.value)

        partition_set = PartitionSetDefinition(
            name="{}_partitions".format(schedule_name),
            pipeline_name=pipeline_name,
            partition_fn=partition_fn,
            run_config_fn_for_partition=lambda partition: fn(partition.value),
            solid_selection=solid_selection,
            tags_fn_for_partition=tags_fn_for_partition_value,
            mode=mode,
        )

        return partition_set.create_schedule_definition(
            schedule_name,
            cron_schedule,
            should_execute=should_execute,
            environment_vars=environment_vars,
            partition_selector=create_offset_partition_selector(
                execution_time_to_partition_fn=execution_time_to_partition_fn),
            execution_timezone=execution_timezone,
        )

    return inner
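A usage sketch for the decorator above; the pipeline name and run config are illustrative. Per ``run_config_fn_for_partition`` above, the decorated function receives the partition's date.

# Usage sketch; "my_pipeline" and the run config below are illustrative.
@monthly_schedule(
    pipeline_name="my_pipeline",
    start_date=datetime.datetime(2020, 10, 1),
    execution_day_of_month=23,
    execution_time=datetime.time(3, 0),
)
def my_monthly_schedule(date):
    # The partition's date is passed in; return the run config for that month's run.
    return {"solids": {"ingest": {"config": {"month": date.strftime("%Y-%m")}}}}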
Example 4
def execute_pipeline_through_queue(
    handle,
    pipeline_name,
    solid_subset,
    environment_dict,
    mode,
    run_id,
    message_queue,
    reexecution_config,
    step_keys_to_execute,
):
    """
    Execute pipeline using message queue as a transport
    """

    check.opt_str_param(mode, 'mode')

    message_queue.put(ProcessStartedSentinel(os.getpid()))

    run_config = RunConfig(
        run_id,
        mode=mode,
        event_callback=message_queue.put,
        reexecution_config=reexecution_config,
        step_keys_to_execute=step_keys_to_execute,
    )

    if 'execution' not in environment_dict or not environment_dict['execution']:
        environment_dict['execution'] = {
            'in_process': {
                'config': {
                    'raise_on_error': False
                }
            }
        }

    try:
        handle.build_repository_definition()
        pipeline_def = handle.with_pipeline_name(
            pipeline_name).build_pipeline_definition()
    except Exception:  # pylint: disable=broad-except
        repo_error = sys.exc_info()
        message_queue.put(
            MultiprocessingError(
                serializable_error_info_from_exc_info(repo_error)))
        return

    try:
        event_list = []
        for event in execute_pipeline_iterator(
                pipeline_def.build_sub_pipeline(solid_subset),
                environment_dict,
                run_config=run_config):
            # message_queue.put(event)
            event_list.append(event)
        return PipelineExecutionResult(pipeline_def, run_config.run_id,
                                       event_list, lambda: None)
    except Exception:  # pylint: disable=broad-except
        error_info = serializable_error_info_from_exc_info(sys.exc_info())
        message_queue.put(MultiprocessingError(error_info))
    finally:
        message_queue.put(MultiprocessingDone())
        message_queue.close()
Example 5
 def root_config_key_for_mode(self, mode_name):
     check.opt_str_param(mode_name, "mode_name")
     return self.get_mode_def_snap(
         mode_name if mode_name else self.get_default_mode_name()
     ).root_config_key
Example 6
 def __init__(self, solid_def, solid_name=None):
     self.solid_def = solid_def
     self.solid_name = check.opt_str_param(solid_name, 'solid_name', solid_def.name)
Example 7
def canonicalize_backcompat_args(new_val, new_arg, old_val, old_arg,
                                 breaking_version, **kwargs):
    """
    Utility for managing backwards compatibility of two related arguments.

    For example, if you had an existing function

    def is_new(old_flag):
        return not old_flag

    And you decided you wanted a new function to be:

    def is_new(new_flag):
        return new_flag

    However, you want an in-between period during which either flag is accepted. Use
    canonicalize_backcompat_args to manage that:

    def is_new(old_flag=None, new_flag=None):
        return canonicalize_backcompat_args(
            new_val=new_flag,
            new_arg='new_flag',
            old_val=old_flag,
            old_arg='old_flag',
            breaking_version='0.9.0',
            coerce_old_to_new=lambda val: not val,
        )


    In this example, if the caller sets both new_flag and old_flag, it will fail by throwing
    a CheckError. If the caller sets old_flag, it will run it through the coercion function,
    warn, and then execute.

    canonicalize_backcompat_args returns the value as if *only* new_val were specified
    """
    coerce_old_to_new = kwargs.get("coerce_old_to_new")
    additional_warn_txt = kwargs.get("additional_warn_txt")
    # stacklevel=3 punches up to the caller of canonicalize_backcompat_args
    stacklevel = kwargs.get("stacklevel", 3)

    check.str_param(new_arg, "new_arg")
    check.str_param(old_arg, "old_arg")
    check.opt_callable_param(coerce_old_to_new, "coerce_old_to_new")
    check.opt_str_param(additional_warn_txt, "additional_warn_txt")
    check.opt_int_param(stacklevel, "stacklevel")
    if new_val is not None:
        if old_val is not None:
            check.failed(
                'Do not use deprecated "{old_arg}" now that you are using "{new_arg}".'
                .format(old_arg=old_arg, new_arg=new_arg))
        return new_val
    if old_val is not None:
        warnings.warn(
            '"{old_arg}" is deprecated and will be removed in {breaking_version}, use "{new_arg}" instead.'
            .format(old_arg=old_arg,
                    new_arg=new_arg,
                    breaking_version=breaking_version) +
            ((" " + additional_warn_txt) if additional_warn_txt else ""),
            stacklevel=stacklevel,
        )
        return coerce_old_to_new(old_val) if coerce_old_to_new else old_val

    return new_val
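A behavior sketch for the ``is_new`` example in the docstring above:

# Behavior sketch for the is_new example from the docstring above.
def is_new(old_flag=None, new_flag=None):
    return canonicalize_backcompat_args(
        new_val=new_flag,
        new_arg="new_flag",
        old_val=old_flag,
        old_arg="old_flag",
        breaking_version="0.9.0",
        coerce_old_to_new=lambda val: not val,
    )

assert is_new(new_flag=True) is True    # new argument is returned as-is
assert is_new(old_flag=True) is False   # deprecated argument warns and is coerced
# is_new(old_flag=True, new_flag=True) fails with a CheckError: both arguments were supplied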
Example 8
    def __init__(
            self,
            solid_defs,
            name=None,
            description=None,
            dependencies=None,
            mode_defs=None,
            preset_defs=None,
            tags=None,
            _parent_pipeline_def=None,  # https://github.com/dagster-io/dagster/issues/2115
    ):
        self._name = check.opt_str_param(name, 'name', '<<unnamed>>')
        self._description = check.opt_str_param(description, 'description')

        mode_definitions = check.opt_list_param(mode_defs,
                                                'mode_defs',
                                                of_type=ModeDefinition)

        if not mode_definitions:
            mode_definitions = [ModeDefinition()]

        self._mode_definitions = mode_definitions

        self._current_level_solid_defs = check.list_param(
            _check_solids_arg(self._name, solid_defs),
            'solid_defs',
            of_type=ISolidDefinition)
        self._tags = validate_tags(tags)

        seen_modes = set()
        for mode_def in mode_definitions:
            if mode_def.name in seen_modes:
                raise DagsterInvalidDefinitionError((
                    'Two modes seen with the name "{mode_name}" in "{pipeline_name}". '
                    'Modes must have unique names.').format(
                        mode_name=mode_def.name, pipeline_name=self._name))
            seen_modes.add(mode_def.name)

        self._dependencies = validate_dependency_dict(dependencies)

        dependency_structure, solid_dict = create_execution_structure(
            self._current_level_solid_defs,
            self._dependencies,
            container_definition=None)

        self._solid_dict = solid_dict
        self._dependency_structure = dependency_structure

        # eager toposort solids to detect cycles
        self.solids_in_topological_order = self._solids_in_topological_order()

        self._dagster_type_dict = construct_dagster_type_dictionary(
            self._current_level_solid_defs)

        self._preset_defs = check.opt_list_param(preset_defs, 'preset_defs',
                                                 PresetDefinition)
        self._preset_dict = {}
        for preset in self._preset_defs:
            if preset.name in self._preset_dict:
                raise DagsterInvalidDefinitionError((
                    'Two PresetDefinitions seen with the name "{name}" in "{pipeline_name}". '
                    'PresetDefinitions must have unique names.').format(
                        name=preset.name, pipeline_name=self._name))
            if preset.mode not in seen_modes:
                raise DagsterInvalidDefinitionError(
                    ('PresetDefinition "{name}" in "{pipeline_name}" '
                     'references mode "{mode}" which is not defined.').format(
                         name=preset.name,
                         pipeline_name=self._name,
                         mode=preset.mode))
            self._preset_dict[preset.name] = preset

        # Validate solid resource dependencies
        _validate_resource_dependencies(self._mode_definitions,
                                        self._current_level_solid_defs)

        # Validate unsatisfied inputs can be materialized from config
        _validate_inputs(self._dependency_structure, self._solid_dict)

        self._all_solid_defs = _build_all_solid_defs(
            self._current_level_solid_defs)
        self._parent_pipeline_def = check.opt_inst_param(
            _parent_pipeline_def, '_parent_pipeline_def', PipelineDefinition)
        self._cached_run_config_schemas = {}
        self._cached_external_pipeline = None
Example 9
    def __init__(
        self,
        instance_config_map,
        dagster_home,
        postgres_password_secret,
        load_incluster_config=True,
        kubeconfig_file=None,
        broker=None,
        backend=None,
        include=None,
        config_source=None,
        retries=None,
        inst_data=None,
        k8s_client_batch_api=None,
        env_config_maps=None,
        env_secrets=None,
        volume_mounts=None,
        volumes=None,
        service_account_name=None,
        image_pull_policy=None,
        image_pull_secrets=None,
        labels=None,
        fail_pod_on_run_failure=None,
    ):
        self._inst_data = check.opt_inst_param(inst_data, "inst_data",
                                               ConfigurableClassData)

        if load_incluster_config:
            check.invariant(
                kubeconfig_file is None,
                "`kubeconfig_file` is set but `load_incluster_config` is True.",
            )
            kubernetes.config.load_incluster_config()
        else:
            check.opt_str_param(kubeconfig_file, "kubeconfig_file")
            kubernetes.config.load_kube_config(kubeconfig_file)

        self._fixed_batch_api = k8s_client_batch_api

        self.instance_config_map = check.str_param(instance_config_map,
                                                   "instance_config_map")
        self.dagster_home = check.str_param(dagster_home, "dagster_home")
        self.postgres_password_secret = check.str_param(
            postgres_password_secret, "postgres_password_secret")
        self.broker = check.opt_str_param(broker, "broker")
        self.backend = check.opt_str_param(backend, "backend")
        self.include = check.opt_list_param(include, "include")
        self.config_source = check.opt_dict_param(config_source,
                                                  "config_source")

        retries = check.opt_dict_param(retries, "retries") or {"enabled": {}}
        self.retries = RetryMode.from_config(retries)

        self._env_config_maps = check.opt_list_param(env_config_maps,
                                                     "env_config_maps",
                                                     of_type=str)
        self._env_secrets = check.opt_list_param(env_secrets,
                                                 "env_secrets",
                                                 of_type=str)

        self._volume_mounts = check.opt_list_param(volume_mounts,
                                                   "volume_mounts")
        self._volumes = check.opt_list_param(volumes, "volumes")

        self._service_account_name = check.opt_str_param(
            service_account_name, "service_account_name")
        self._image_pull_policy = check.opt_str_param(image_pull_policy,
                                                      "image_pull_policy",
                                                      "IfNotPresent")
        self._image_pull_secrets = check.opt_list_param(image_pull_secrets,
                                                        "image_pull_secrets",
                                                        of_type=dict)
        self._labels = check.opt_dict_param(labels,
                                            "labels",
                                            key_type=str,
                                            value_type=str)
        self._fail_pod_on_run_failure = check.opt_bool_param(
            fail_pod_on_run_failure, "fail_pod_on_run_failure")

        super().__init__()
Example 10
 def uri_for_key(self, key, protocol=None):
     check.str_param(key, "key")
     protocol = check.opt_str_param(protocol, "protocol", default="gs://")
     return protocol + self.bucket + "/" + key
Example 11
 def __new__(cls, run_id, tags):
     return super(ExecutionMetadata, cls).__new__(
         cls,
         check.opt_str_param(run_id, 'run_id'),
         check.dict_param(tags, 'tags', key_type=str, value_type=str),
     )
Example 12
def _make_airflow_dag(
    handle,
    pipeline_name,
    environment_dict=None,
    mode=None,
    dag_id=None,
    dag_description=None,
    dag_kwargs=None,
    op_kwargs=None,
    operator=DagsterPythonOperator,
):
    check.inst_param(handle, 'handle', ExecutionTargetHandle)
    check.str_param(pipeline_name, 'pipeline_name')
    environment_dict = check.opt_dict_param(environment_dict,
                                            'environment_dict',
                                            key_type=str)
    mode = check.opt_str_param(mode, 'mode')

    # Only used for Airflow; internally we continue to use pipeline.name
    dag_id = check.opt_str_param(dag_id, 'dag_id',
                                 _rename_for_airflow(pipeline_name))

    dag_description = check.opt_str_param(dag_description, 'dag_description',
                                          _make_dag_description(pipeline_name))
    check.subclass_param(operator, 'operator', BaseOperator)

    # black 18.9b0 doesn't support py27-compatible formatting of the below invocation (omitting
    # the trailing comma after **check.opt_dict_param...) -- black 19.3b0 supports multiple python
    # versions, but currently doesn't know what to do with from __future__ import print_function --
    # see https://github.com/ambv/black/issues/768
    # fmt: off
    dag_kwargs = dict({'default_args': DEFAULT_ARGS},
                      **check.opt_dict_param(dag_kwargs,
                                             'dag_kwargs',
                                             key_type=str))
    # fmt: on

    op_kwargs = check.opt_dict_param(op_kwargs, 'op_kwargs', key_type=str)

    dag = DAG(dag_id=dag_id, description=dag_description, **dag_kwargs)

    pipeline = handle.build_pipeline_definition()

    if mode is None:
        mode = pipeline.get_default_mode_name()

    execution_plan = create_execution_plan(pipeline,
                                           environment_dict,
                                           run_config=RunConfig(mode=mode))

    tasks = {}

    coalesced_plan = coalesce_execution_steps(execution_plan)

    for solid_handle, solid_steps in coalesced_plan.items():

        step_keys = [step.key for step in solid_steps]

        # We separately construct the Airflow operators here with the appropriate args, because if
        # Airflow gets extraneous args/kwargs it emits a warning every time it parses the DAG (and
        # future Airflow versions will mark this a failure).
        # see https://github.com/ambv/black/issues/768
        # fmt: off
        if operator == DagsterPythonOperator:
            task = operator(handle=handle,
                            pipeline_name=pipeline_name,
                            environment_dict=environment_dict,
                            mode=mode,
                            task_id=solid_handle,
                            step_keys=step_keys,
                            dag=dag,
                            **op_kwargs)
        else:
            task = operator(pipeline_name=pipeline_name,
                            environment_dict=environment_dict,
                            mode=mode,
                            task_id=solid_handle,
                            step_keys=step_keys,
                            dag=dag,
                            **op_kwargs)
        # fmt: on

        tasks[solid_handle] = task

        for solid_step in solid_steps:
            for step_input in solid_step.step_inputs:
                for key in step_input.dependency_keys:
                    prev_solid_handle = execution_plan.get_step_by_key(
                        key).solid_handle.to_string()
                    if solid_handle != prev_solid_handle:
                        tasks[prev_solid_handle].set_downstream(task)

    return (dag,
            [tasks[solid_handle] for solid_handle in coalesced_plan.keys()])
Example 13
    def __init__(
        self,
        type_check_fn,
        key=None,
        name=None,
        is_builtin=False,
        description=None,
        input_hydration_config=None,
        output_materialization_config=None,
        serialization_strategy=None,
        auto_plugins=None,
    ):
        check.opt_str_param(key, 'key')
        check.opt_str_param(name, 'name')

        check.invariant(not (name is None and key is None),
                        'Must set key or name')

        if name is None:
            check.param_invariant(
                bool(key),
                'key',
                'If name is not provided, must provide key.',
            )
            self.key, self.name = key, name
        elif key is None:
            check.param_invariant(
                bool(name),
                'name',
                'If key is not provided, must provide name.',
            )
            self.key, self.name = name, name
        else:
            check.invariant(key and name)
            self.key, self.name = key, name

        self.description = check.opt_str_param(description, 'description')
        self.input_hydration_config = check.opt_inst_param(
            input_hydration_config, 'input_hydration_config',
            InputHydrationConfig)
        self.output_materialization_config = check.opt_inst_param(
            output_materialization_config,
            'output_materialization_config',
            OutputMaterializationConfig,
        )
        self.serialization_strategy = check.opt_inst_param(
            serialization_strategy,
            'serialization_strategy',
            SerializationStrategy,
            PickleSerializationStrategy(),
        )

        self._type_check_fn = check.callable_param(type_check_fn,
                                                   'type_check_fn')

        auto_plugins = check.opt_list_param(auto_plugins,
                                            'auto_plugins',
                                            of_type=type)

        check.param_invariant(
            all(
                issubclass(auto_plugin_type, TypeStoragePlugin)
                for auto_plugin_type in auto_plugins),
            'auto_plugins',
        )

        self.auto_plugins = auto_plugins

        self.is_builtin = check.bool_param(is_builtin, 'is_builtin')
        check.invariant(
            self.display_name is not None,
            'All types must have a valid display name, got None for key {}'.
            format(key),
        )
Example 14
def define_python_dagster_type(
    python_type,
    name=None,
    description=None,
    input_hydration_config=None,
    output_materialization_config=None,
    serialization_strategy=None,
    auto_plugins=None,
):
    '''Core machinery for defining a Dagster type corresponding to an existing python type.

    Users should generally use the :py:func:`@dagster_type` decorator or :py:func:`as_dagster_type`,
    both of which defer to this function.

    Args:
        python_type (cls): The python type to wrap as a Dagster type.
        name (Optional[str]): Name of the new Dagster type. If ``None``, the name (``__name__``) of
            the ``python_type`` will be used.
        description (Optional[str]): A user-readable description of the type.
        input_hydration_config (Optional[InputHydrationConfig]): An instance of a class constructed
            using the :py:func:`@input_hydration_config <dagster.InputHydrationConfig>` decorator
            that can map config data to a value of this type.
        output_materialization_config (Optional[OutputMaterializationConfig]): An instance of a class
            constructed using the
            :py:func:`@output_materialization_config <dagster.output_materialization_config>`
            decorator that can persist values of this type.
        serialization_strategy (Optional[SerializationStrategy]): An instance of a class that
            inherits from :py:class:`SerializationStrategy`. The default strategy for serializing
            this value when automatically persisting it between execution steps. You should set
            this value if the ordinary serialization machinery (e.g., pickle) will not be adequate
            for this type.
        auto_plugins (Optional[List[TypeStoragePlugin]]): If types must be serialized differently
            depending on the storage being used for intermediates, they should specify this
            argument. In these cases the serialization_strategy argument is not sufficient because
            serialization requires specialized API calls, e.g. to call an S3 API directly instead
            of using a generic file object. See ``dagster_pyspark.DataFrame`` for an example.
    '''

    check.type_param(python_type, 'python_type')
    check.opt_str_param(name, 'name', python_type.__name__)
    check.opt_str_param(description, 'description')
    check.opt_inst_param(input_hydration_config, 'input_hydration_config',
                         InputHydrationConfig)
    check.opt_inst_param(output_materialization_config,
                         'output_materialization_config',
                         OutputMaterializationConfig)
    check.opt_inst_param(
        serialization_strategy,
        'serialization_strategy',
        SerializationStrategy,
        default=PickleSerializationStrategy(),
    )

    auto_plugins = check.opt_list_param(auto_plugins,
                                        'auto_plugins',
                                        of_type=type)
    check.param_invariant(
        all(
            issubclass(auto_plugin_type, TypeStoragePlugin)
            for auto_plugin_type in auto_plugins),
        'auto_plugins',
    )

    return PythonObjectType(
        python_type=python_type,
        name=name,
        description=description,
        input_hydration_config=input_hydration_config,
        output_materialization_config=output_materialization_config,
        serialization_strategy=serialization_strategy,
        auto_plugins=auto_plugins,
    )
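A usage sketch for the function above; the wrapped class and names are illustrative.

# Usage sketch; the wrapped class and names below are illustrative.
class MyTable:
    def __init__(self, rows):
        self.rows = rows

MyTableDagsterType = define_python_dagster_type(
    python_type=MyTable,
    name='MyTable',
    description='An in-memory table wrapped as a Dagster type.',
)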
Example 15
 def __init__(self, config_value, python_value=None, description=None):
     self.config_value = check.str_param(config_value, 'config_value')
     self.python_value = config_value if python_value is None else python_value
     self.description = check.opt_str_param(description, 'description')
Example 16
def hourly_schedule(
    pipeline_name,
    start_date,
    name=None,
    execution_time=datetime.time(0, 0),
    tags_fn_for_date=None,
    solid_selection=None,
    mode="default",
    should_execute=None,
    environment_vars=None,
    end_date=None,
):
    """Create a schedule that runs hourly.

    The decorated function will be called as the ``run_config_fn`` of the underlying
    :py:class:`~dagster.ScheduleDefinition` and should take a
    :py:class:`~dagster.ScheduleExecutionContext` as its only argument, returning the environment
    dict for the scheduled execution.

    Args:
        pipeline_name (str): The name of the pipeline to execute when the schedule runs.
        start_date (datetime.datetime): The date from which to run the schedule.
        name (Optional[str]): The name of the schedule to create. By default, this will be the name
            of the decorated function.
        execution_time (datetime.time): The time at which to execute the schedule. Only the minutes
            component will be respected -- the hour should be 0, and will be ignored if it is not 0.
        tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A
            function that generates tags to attach to the schedule's runs. Takes the date of the
            schedule run and returns a dictionary of tags (string key-value pairs).
        solid_selection (Optional[List[str]]): A list of solid subselection (including single
            solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``
        mode (Optional[str]): The pipeline mode in which to execute this schedule.
            (Default: 'default')
        should_execute (Optional[Callable[ScheduleExecutionContext, bool]]): A function that runs at
            schedule execution time to determine whether a schedule should execute or skip. Takes a
            :py:class:`~dagster.ScheduleExecutionContext` and returns a boolean (``True`` if the
            schedule should execute). Defaults to a function that always returns ``True``.
        environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing
            the schedule.
        end_date (Optional[datetime.datetime]): The last time to run the schedule. Defaults to the
            current time.
    """
    check.opt_str_param(name, "name")
    check.inst_param(start_date, "start_date", datetime.datetime)
    check.opt_inst_param(end_date, "end_date", datetime.datetime)
    check.opt_callable_param(tags_fn_for_date, "tags_fn_for_date")
    check.opt_nullable_list_param(solid_selection,
                                  "solid_selection",
                                  of_type=str)
    mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)
    check.opt_callable_param(should_execute, "should_execute")
    check.opt_dict_param(environment_vars,
                         "environment_vars",
                         key_type=str,
                         value_type=str)
    check.str_param(pipeline_name, "pipeline_name")
    check.inst_param(execution_time, "execution_time", datetime.time)

    if execution_time.hour != 0:
        warnings.warn(
            "Hourly schedule {schedule_name} created with:\n"
            "\tschedule_time=datetime.time(hour={hour}, minute={minute}, ...). "
            "Since this is an hourly schedule, the hour parameter will be ignored and the schedule "
            "will run on the {minute} mark for the previous hour interval. Replace "
            "datetime.time(hour={hour}, minute={minute}, ...) with "
            "datetime.time(minute={minute}, ...) to fix this warning.".format(
                schedule_name=name, hour=execution_time.hour, minute=execution_time.minute))

    cron_schedule = "{minute} * * * *".format(minute=execution_time.minute)

    partition_fn = date_partition_range(start_date,
                                        end=end_date,
                                        delta=datetime.timedelta(hours=1),
                                        fmt="%Y-%m-%d-%H:%M")

    def inner(fn):
        check.callable_param(fn, "fn")

        schedule_name = name or fn.__name__

        tags_fn_for_partition_value = lambda partition: {}
        if tags_fn_for_date:
            tags_fn_for_partition_value = lambda partition: tags_fn_for_date(
                partition.value)

        partition_set = PartitionSetDefinition(
            name="{}_partitions".format(schedule_name),
            pipeline_name=pipeline_name,
            partition_fn=partition_fn,
            run_config_fn_for_partition=lambda partition: fn(partition.value),
            solid_selection=solid_selection,
            tags_fn_for_partition=tags_fn_for_partition_value,
            mode=mode,
        )

        return partition_set.create_schedule_definition(
            schedule_name,
            cron_schedule,
            should_execute=should_execute,
            environment_vars=environment_vars,
        )

    return inner
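A usage sketch for the decorator above; the pipeline name and run config are illustrative.

# Usage sketch; "my_pipeline" and the run config below are illustrative.
@hourly_schedule(
    pipeline_name="my_pipeline",
    start_date=datetime.datetime(2020, 10, 1),
    execution_time=datetime.time(0, 30),  # run at minute 30 of every hour
)
def my_hourly_schedule(date):
    # The partition's date is passed in; return the run config for that hour's run.
    return {"solids": {"ingest": {"config": {"hour": date.strftime("%Y-%m-%d-%H:%M")}}}}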
Example 17
def create_dagster_pandas_dataframe_type(
    name,
    description=None,
    columns=None,
    event_metadata_fn=None,
    dataframe_constraints=None,
    input_hydration_config=None,
    output_materialization_config=None,
):
    """
    Constructs a custom pandas dataframe dagster type.

    Args:
        name (str): Name of the dagster pandas type.
        description (Optional[str]): A markdown-formatted string, displayed in tooling.
        columns (Optional[List[PandasColumn]]): A list of :py:class:`~dagster.PandasColumn` objects
            which express dataframe column schemas and constraints.
        event_metadata_fn (Optional[func]): A callable which takes your dataframe and returns a list of EventMetadata
            which allow you to express things like summary statistics during runtime.
        dataframe_constraints (Optional[List[DataFrameConstraint]]): A list of objects that inherit from
            :py:class:`~dagster.DataFrameConstraint`. This allows you to express dataframe-level constraints.
        input_hydration_config (Optional[InputHydrationConfig]): An instance of a class that
            inherits from :py:class:`~dagster.InputHydrationConfig`. If None, we will default
            to using the `dataframe_input_schema` input_hydration_config.
        output_materialization_config (Optional[OutputMaterializationConfig]): An instance of a class
            that inherits from :py:class:`~dagster.OutputMaterializationConfig`. If None, we will
            default to using the `dataframe_output_schema` output_materialization_config.
    """
    # We allow for the plugging in of input_hydration_config/output_materialization_configs so that
    # users can hydrate and persist their custom dataframes via configuration in their own way if the default
    # configs don't suffice. This is purely optional.
    check.str_param(name, 'name')
    event_metadata_fn = check.opt_callable_param(event_metadata_fn,
                                                 'event_metadata_fn')
    description = create_dagster_pandas_dataframe_description(
        check.opt_str_param(description, 'description', default=''),
        check.opt_list_param(columns, 'columns', of_type=PandasColumn),
    )

    def _dagster_type_check(_, value):
        if not isinstance(value, pd.DataFrame):
            return TypeCheck(
                success=False,
                description='Must be a pandas.DataFrame. Got value of type {type_name}.'.format(
                    type_name=type(value).__name__),
            )

        try:
            validate_constraints(value,
                                 pandas_columns=columns,
                                 dataframe_constraints=dataframe_constraints)
        except ConstraintViolationException as e:
            return TypeCheck(success=False, description=str(e))

        return TypeCheck(
            success=True,
            metadata_entries=_execute_summary_stats(
                name, value, event_metadata_fn) if event_metadata_fn else None,
        )

    return DagsterType(
        name=name,
        type_check_fn=_dagster_type_check,
        input_hydration_config=input_hydration_config
        if input_hydration_config else dataframe_input_schema,
        output_materialization_config=output_materialization_config
        if output_materialization_config else dataframe_output_schema,
        description=description,
    )
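A usage sketch for the factory above. The column names are illustrative, and the ``PandasColumn`` helpers (``integer_column``, ``exists``) are assumed from the dagster_pandas API.

# Usage sketch; column names are illustrative, and the PandasColumn helpers
# (integer_column, exists) are assumed from the dagster_pandas API.
TripDataFrame = create_dagster_pandas_dataframe_type(
    name='TripDataFrame',
    description='Trip records with a bike id and an amount paid.',
    columns=[
        PandasColumn.integer_column('bike_id', min_value=0),
        PandasColumn.exists('amount_paid'),
    ],
)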
Example 18
def monthly_schedule(
    pipeline_name,
    start_date,
    name=None,
    execution_day_of_month=1,
    execution_time=datetime.time(0, 0),
    tags_fn_for_date=None,
    solid_selection=None,
    mode="default",
    should_execute=None,
    environment_vars=None,
    end_date=None,
):
    """Create a schedule that runs monthly.

    The decorated function will be called as the ``run_config_fn`` of the underlying
    :py:class:`~dagster.ScheduleDefinition` and should take a
    :py:class:`~dagster.ScheduleExecutionContext` as its only argument, returning the environment
    dict for the scheduled execution.

    Args:
        pipeline_name (str): The name of the pipeline to execute when the schedule runs.
        start_date (datetime.datetime): The date from which to run the schedule.
        name (Optional[str]): The name of the schedule to create.
        execution_day_of_month (int): The day of the month on which to run the schedule (must be
            between 1 and 31).
        execution_time (datetime.time): The time at which to execute the schedule.
        tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A
            function that generates tags to attach to the schedule's runs. Takes the date of the
            schedule run and returns a dictionary of tags (string key-value pairs).
        solid_selection (Optional[List[str]]): A list of solid subselection (including single
            solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``
        mode (Optional[str]): The pipeline mode in which to execute this schedule.
            (Default: 'default')
        should_execute (Optional[Callable[ScheduleExecutionContext, bool]]): A function that runs at
            schedule execution time to determine whether a schedule should execute or skip. Takes a
            :py:class:`~dagster.ScheduleExecutionContext` and returns a boolean (``True`` if the
            schedule should execute). Defaults to a function that always returns ``True``.
        environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing
            the schedule.
        end_date (Optional[datetime.datetime]): The last time to run the schedule. Defaults to the
            current time.
    """
    check.opt_str_param(name, "name")
    check.inst_param(start_date, "start_date", datetime.datetime)
    check.opt_inst_param(end_date, "end_date", datetime.datetime)
    check.opt_callable_param(tags_fn_for_date, "tags_fn_for_date")
    check.opt_nullable_list_param(solid_selection,
                                  "solid_selection",
                                  of_type=str)
    mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)
    check.opt_callable_param(should_execute, "should_execute")
    check.opt_dict_param(environment_vars,
                         "environment_vars",
                         key_type=str,
                         value_type=str)
    check.str_param(pipeline_name, "pipeline_name")
    check.int_param(execution_day_of_month, "execution_day")
    check.inst_param(execution_time, "execution_time", datetime.time)

    if execution_day_of_month <= 0 or execution_day_of_month > 31:
        raise DagsterInvalidDefinitionError(
            "`execution_day_of_month={}` is not valid for monthly schedule. Execution day must be "
            "between 1 and 31".format(execution_day_of_month))

    cron_schedule = "{minute} {hour} {day} * *".format(
        minute=execution_time.minute,
        hour=execution_time.hour,
        day=execution_day_of_month)

    partition_fn = date_partition_range(start_date,
                                        end=end_date,
                                        delta=relativedelta(months=1),
                                        fmt="%Y-%m")

    def inner(fn):
        check.callable_param(fn, "fn")

        schedule_name = name or fn.__name__

        tags_fn_for_partition_value = lambda partition: {}
        if tags_fn_for_date:
            tags_fn_for_partition_value = lambda partition: tags_fn_for_date(
                partition.value)

        partition_set = PartitionSetDefinition(
            name="{}_partitions".format(schedule_name),
            pipeline_name=pipeline_name,
            partition_fn=partition_fn,
            run_config_fn_for_partition=lambda partition: fn(partition.value),
            solid_selection=solid_selection,
            tags_fn_for_partition=tags_fn_for_partition_value,
            mode=mode,
        )

        return partition_set.create_schedule_definition(
            schedule_name,
            cron_schedule,
            should_execute=should_execute,
            environment_vars=environment_vars,
        )

    return inner
Example 19
    def __init__(
        self,
        solid_defs,
        name=None,
        description=None,
        dependencies=None,
        mode_defs=None,
        preset_defs=None,
        selector=None,
    ):
        self._name = check.opt_str_param(name, 'name', '<<unnamed>>')
        self._description = check.opt_str_param(description, 'description')

        mode_definitions = check.opt_list_param(mode_defs,
                                                'mode_defs',
                                                of_type=ModeDefinition)

        if not mode_definitions:
            mode_definitions = [ModeDefinition()]

        self._mode_definitions = mode_definitions

        self._current_level_solid_defs = check.list_param(
            _check_solids_arg(self._name, solid_defs),
            'solid_defs',
            of_type=ISolidDefinition)

        seen_modes = set()
        for mode_def in mode_definitions:
            if mode_def.name in seen_modes:
                raise DagsterInvalidDefinitionError((
                    'Two modes seen with the name "{mode_name}" in "{pipeline_name}". '
                    'Modes must have unique names.').format(
                        mode_name=mode_def.name, pipeline_name=self._name))
            seen_modes.add(mode_def.name)

        self._dependencies = validate_dependency_dict(dependencies)

        dependency_structure, pipeline_solid_dict = create_execution_structure(
            self._current_level_solid_defs,
            self._dependencies,
            container_definition=None)

        self._solid_dict = pipeline_solid_dict
        self._dependency_structure = dependency_structure

        self._runtime_type_dict = construct_runtime_type_dictionary(
            self._current_level_solid_defs)

        self._preset_defs = check.opt_list_param(preset_defs, 'preset_defs',
                                                 PresetDefinition)
        self._preset_dict = {}
        for preset in self._preset_defs:
            if preset.name in self._preset_dict:
                raise DagsterInvalidDefinitionError((
                    'Two PresetDefinitions seen with the name "{name}" in "{pipeline_name}". '
                    'PresetDefinitions must have unique names.').format(
                        name=preset.name, pipeline_name=self._name))
            if preset.mode not in seen_modes:
                raise DagsterInvalidDefinitionError(
                    ('PresetDefinition "{name}" in "{pipeline_name}" '
                     'references mode "{mode}" which is not defined.').format(
                         name=preset.name,
                         pipeline_name=self._name,
                         mode=preset.mode))
            self._preset_dict[preset.name] = preset

        # Validate solid resource dependencies
        _validate_resource_dependencies(self._mode_definitions,
                                        self._current_level_solid_defs)

        # Validate unsatisfied inputs can be materialized from config
        _validate_inputs(self._dependency_structure, self._solid_dict)

        self._all_solid_defs = {}
        for current_level_solid_def in self._current_level_solid_defs:
            for solid_def in current_level_solid_def.iterate_solid_defs():
                self._all_solid_defs[solid_def.name] = solid_def

        self._selector = check.opt_inst_param(selector, 'selector',
                                              ExecutionSelector,
                                              ExecutionSelector(self._name))
Example 20
 def __new__(cls, solid_name, output_name=None):
     return super(OutputPointer, cls).__new__(
         cls,
         check.str_param(solid_name, "solid_name"),
         check.opt_str_param(output_name, "output_name", DEFAULT_OUTPUT),
     )
Example 21
    def create_schedule_definition(
        self,
        schedule_name,
        cron_schedule,
        should_execute=None,
        partition_selector=last_partition,
        environment_vars=None,
        execution_timezone=None,
    ):
        """Create a ScheduleDefinition from a PartitionSetDefinition.

        Arguments:
            schedule_name (str): The name of the schedule.
            cron_schedule (str): A valid cron string for the schedule.
            should_execute (Optional[function]): Function that runs at schedule execution time that
                determines whether a schedule should execute. Defaults to a function that always
                returns ``True``.
            partition_selector (Callable[[ScheduleExecutionContext, PartitionSetDefinition], Partition]):
                A partition selector for the schedule.
            environment_vars (Optional[dict]): The environment variables to set for the schedule.
            execution_timezone (Optional[str]): Timezone in which the schedule should run. Only works
                with DagsterCommandLineScheduler, and must be set when using that scheduler.

        Returns:
            ScheduleDefinition: The generated ScheduleDefinition for the partition selector
        """

        check.str_param(schedule_name, "schedule_name")
        check.str_param(cron_schedule, "cron_schedule")
        check.opt_callable_param(should_execute, "should_execute")
        check.opt_dict_param(environment_vars,
                             "environment_vars",
                             key_type=str,
                             value_type=str)
        check.callable_param(partition_selector, "partition_selector")
        check.opt_str_param(execution_timezone, "execution_timezone")

        def _should_execute_wrapper(context):
            check.inst_param(context, "context", ScheduleExecutionContext)
            selected_partition = partition_selector(context, self)

            if (not selected_partition
                    or selected_partition.name not in self.get_partition_names()):
                return False
            elif not should_execute:
                return True
            else:
                return should_execute(context)

        def _run_config_fn_wrapper(context):
            check.inst_param(context, "context", ScheduleExecutionContext)
            selected_partition = partition_selector(context, self)
            if (not selected_partition
                    or selected_partition.name not in self.get_partition_names()):
                raise DagsterInvariantViolationError(
                    "The partition selection function `{selector}` did not return "
                    "a partition from PartitionSet {partition_set}".format(
                        selector=getattr(partition_selector, "__name__",
                                         repr(partition_selector)),
                        partition_set=self.name,
                    ))

            return self.run_config_for_partition(selected_partition)

        def _tags_fn_wrapper(context):
            check.inst_param(context, "context", ScheduleExecutionContext)
            selected_partition = partition_selector(context, self)
            if not selected_partition:
                raise DagsterInvariantViolationError(
                    "The partition selection function `{selector}` did not return "
                    "a partition from PartitionSet {partition_set}".format(
                        selector=getattr(partition_selector, "__name__",
                                         repr(partition_selector)),
                        partition_set=self.name,
                    ))

            return self.tags_for_partition(selected_partition)

        return PartitionScheduleDefinition(
            name=schedule_name,
            cron_schedule=cron_schedule,
            pipeline_name=self.pipeline_name,
            run_config_fn=_run_config_fn_wrapper,
            tags_fn=_tags_fn_wrapper,
            solid_selection=self.solid_selection,
            mode=self.mode,
            should_execute=_should_execute_wrapper,
            environment_vars=environment_vars,
            partition_set=self,
            execution_timezone=execution_timezone,
        )
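A usage sketch for the method above, assuming ``my_partition_set`` is an existing ``PartitionSetDefinition``; the schedule name and cron string are illustrative.

# Usage sketch; my_partition_set, the schedule name, and the cron string are illustrative.
my_schedule = my_partition_set.create_schedule_definition(
    schedule_name="my_partitioned_schedule",
    cron_schedule="0 1 * * *",  # every day at 01:00
    environment_vars={"DAGSTER_ENV": "prod"},
)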
Example 22
def create_dagster_pandas_dataframe_type(
    name,
    description=None,
    columns=None,
    event_metadata_fn=None,
    dataframe_constraints=None,
    loader=None,
    materializer=None,
    input_hydration_config=None,
    output_materialization_config=None,
):
    """
    Constructs a custom pandas dataframe dagster type.

    Args:
        name (str): Name of the dagster pandas type.
        description (Optional[str]): A markdown-formatted string, displayed in tooling.
        columns (Optional[List[PandasColumn]]): A list of :py:class:`~dagster.PandasColumn` objects
            which express dataframe column schemas and constraints.
        event_metadata_fn (Optional[func]): A callable which takes your dataframe and returns a list of EventMetadata
            which allow you to express things like summary statistics during runtime.
        dataframe_constraints (Optional[List[DataFrameConstraint]]): A list of objects that inherit from
            :py:class:`~dagster.DataFrameConstraint`. This allows you to express dataframe-level constraints.
        loader (Optional[DagsterTypeLoader]): An instance of a class that
            inherits from :py:class:`~dagster.DagsterTypeLoader`. If None, we will default
            to using `dataframe_loader`.
        materializer (Optional[DagsterTypeMaterializer]): An instance of a class
            that inherits from :py:class:`~dagster.DagsterTypeMaterializer`. If None, we will
            default to using `dataframe_materializer`.
    """
    # We allow for the plugging in of dagster_type_loaders/materializers so that
    # users can load and materialize their custom dataframes via configuration in their own way if the default
    # configs don't suffice. This is purely optional.
    check.str_param(name, "name")
    event_metadata_fn = check.opt_callable_param(event_metadata_fn,
                                                 "event_metadata_fn")
    description = create_dagster_pandas_dataframe_description(
        check.opt_str_param(description, "description", default=""),
        check.opt_list_param(columns, "columns", of_type=PandasColumn),
    )

    def _dagster_type_check(_, value):
        if not isinstance(value, pd.DataFrame):
            return TypeCheck(
                success=False,
                description="Must be a pandas.DataFrame. Got value of type {type_name}.".format(
                    type_name=type(value).__name__),
            )

        try:
            validate_constraints(value,
                                 pandas_columns=columns,
                                 dataframe_constraints=dataframe_constraints)
        except ConstraintViolationException as e:
            return TypeCheck(success=False, description=str(e))

        return TypeCheck(
            success=True,
            metadata_entries=_execute_summary_stats(
                name, value, event_metadata_fn) if event_metadata_fn else None,
        )

    loader_ = canonicalize_backcompat_args(
        loader,
        "loader",
        input_hydration_config,
        "input_hydration_config",
        "0.10.0",
    )
    materializer_ = canonicalize_backcompat_args(
        materializer,
        "materializer",
        output_materialization_config,
        "output_materialization_config",
        "0.10.0",
    )

    return DagsterType(
        name=name,
        type_check_fn=_dagster_type_check,
        loader=loader_ if loader_ else dataframe_loader,
        materializer=materializer_
        if materializer_ else dataframe_materializer,
        description=description,
    )
Esempio n. 23
0
 def __new__(cls, name, alias=None):
     name = check.str_param(name, 'name')
     alias = check.opt_str_param(alias, 'alias')
     return super(SolidInstance, cls).__new__(cls, name, alias)
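A hedged sketch of how SolidInstance is typically used: aliasing one solid definition under two names in a pipeline's dependency dictionary. The solid name 'add_one' and the input name 'num' are made up for illustration:

from dagster import DependencyDefinition, SolidInstance

# Reuse the 'add_one' solid under two aliases; the second instance depends on the first.
dependencies = {
    SolidInstance('add_one', alias='first_add'): {},
    SolidInstance('add_one', alias='second_add'): {
        'num': DependencyDefinition('first_add'),
    },
}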
Esempio n. 24
0
def create_structured_dataframe_type(
    name,
    description=None,
    columns_validator=None,
    columns_aggregate_validator=None,
    dataframe_validator=None,
    loader=None,
    materializer=None,
    input_hydration_config=None,
    output_materialization_config=None,
):
    """

    Args:
        name (str): the name of the new type
        description (Optional[str]): the description of the new type
        columns_validator (Optional[Union[ColumnConstraintWithMetadata, MultiColumnConstraintWithMetadata]]):
                    The column-level, row-by-row validation you want to have applied.
                    Leave empty for no column-level row-by-row validation.
        columns_aggregate_validator (Optional[Union[ColumnAggregateConstraintWithMetadata,
                                    MultiAggregateConstraintWithMetadata]]):
                    The column-level aggregate validation you want to have applied.
                    Leave empty for no column-level aggregate validation.
        dataframe_validator (Optional[Union[ConstraintWithMetadata, MultiConstraintWithMetadata]]):
                    The dataframe-wide validation you want to have applied.
                    Leave empty for no dataframe-wide validation.
        loader (Optional[DagsterTypeLoader]): An instance of a class that
            inherits from :py:class:`~dagster.DagsterTypeLoader`. If None, we will default
            to using `dataframe_loader`.
        materializer (Optional[DagsterTypeMaterializer]): An instance of a class
            that inherits from :py:class:`~dagster.DagsterTypeMaterializer`. If None, we will
            default to using `dataframe_materializer`.

    Returns:
        a DagsterType with the corresponding name and packaged validation.

    """
    def _dagster_type_check(_, value):
        if not isinstance(value, pd.DataFrame):
            return TypeCheck(
                success=False,
                description="Must be a pandas.DataFrame. Got value of type {type_name}.".format(
                    type_name=type(value).__name__
                ),
            )
        individual_result_dict = {}

        if dataframe_validator is not None:
            individual_result_dict["dataframe"] = dataframe_validator.validate(
                value)
        if columns_validator is not None:
            individual_result_dict["columns"] = columns_validator.validate(
                value)

        if columns_aggregate_validator is not None:
            individual_result_dict[
                "column-aggregates"] = columns_aggregate_validator.validate(
                    value)

        typechecks_succeeded = True
        metadata = []
        overall_description = "Failed Constraints: {}"
        constraint_clauses = []
        for key, result in individual_result_dict.items():
            result_val = result.success
            if result_val:
                continue
            typechecks_succeeded = typechecks_succeeded and result_val
            result_dict = result.metadata_entries[0].entry_data.data
            metadata.append(
                EventMetadataEntry.json(
                    result_dict,
                    "{}-constraint-metadata".format(key),
                ))
            constraint_clauses.append("{} failing constraints, {}".format(
                key, result.description))
        # returns aggregates, then column, then dataframe
        return TypeCheck(
            success=typechecks_succeeded,
            description=overall_description.format(constraint_clauses),
            metadata_entries=sorted(metadata, key=lambda x: x.label),
        )

    description = check.opt_str_param(description, "description", default="")
    loader_ = canonicalize_backcompat_args(
        loader,
        "loader",
        input_hydration_config,
        "input_hydration_config",
        "0.10.0",
    )
    materializer_ = canonicalize_backcompat_args(
        materializer,
        "materializer",
        output_materialization_config,
        "output_materialization_config",
        "0.10.0",
    )
    return DagsterType(
        name=name,
        type_check_fn=_dagster_type_check,
        loader=loader_ if loader_ else dataframe_loader,
        materializer=materializer_
        if materializer_ else dataframe_materializer,
        description=description,
    )
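A minimal hedged sketch of calling this factory. The type name and description are illustrative, and the optional validators are omitted rather than guessing at the constraint-class constructors:

# Hedged usage sketch: a structured dataframe type with only the default
# loader/materializer. Pass a ConstraintWithMetadata (or the column/aggregate
# variants) via the *_validator arguments to attach validation.
OrdersDataFrame = create_structured_dataframe_type(
    name="OrdersDataFrame",
    description="Orders exported from the billing system.",
)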
Esempio n. 25
0
 def __new__(cls, value=None, name=None):
     return super(Partition, cls).__new__(cls,
                                          name=check.opt_str_param(
                                              name, 'name', str(value)),
                                          value=value)
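Per the __new__ above, a Partition's name defaults to str(value) when no explicit name is given; a small hedged example:

import datetime

p = Partition(value=datetime.date(2020, 1, 1))
assert p.name == '2020-01-01'  # name defaults to str(value)
assert p.value == datetime.date(2020, 1, 1)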
Esempio n. 26
0
    def __init__(
        self,
        type_check_fn,
        key=None,
        name=None,
        is_builtin=False,
        description=None,
        loader=None,
        materializer=None,
        serialization_strategy=None,
        auto_plugins=None,
        required_resource_keys=None,
        kind=DagsterTypeKind.REGULAR,
    ):
        check.opt_str_param(key, "key")
        check.opt_str_param(name, "name")

        check.invariant(not (name is None and key is None),
                        "Must set key or name")

        if name is None:
            check.param_invariant(
                bool(key),
                "key",
                "If name is not provided, must provide key.",
            )
            self.key, self._name = key, None
        elif key is None:
            check.param_invariant(
                bool(name),
                "name",
                "If key is not provided, must provide name.",
            )
            self.key, self._name = name, name
        else:
            check.invariant(key and name)
            self.key, self._name = key, name

        self.description = check.opt_str_param(description, "description")
        self.loader = check.opt_inst_param(loader, "loader", DagsterTypeLoader)
        self.materializer = check.opt_inst_param(materializer, "materializer",
                                                 DagsterTypeMaterializer)

        self.serialization_strategy = check.opt_inst_param(
            serialization_strategy,
            "serialization_strategy",
            SerializationStrategy,
            PickleSerializationStrategy(),
        )
        self.required_resource_keys = check.opt_set_param(
            required_resource_keys,
            "required_resource_keys",
        )

        self._type_check_fn = check.callable_param(type_check_fn,
                                                   "type_check_fn")
        _validate_type_check_fn(self._type_check_fn, self._name)

        auto_plugins = check.opt_list_param(auto_plugins,
                                            "auto_plugins",
                                            of_type=type)

        check.param_invariant(
            all(
                issubclass(auto_plugin_type, TypeStoragePlugin)
                for auto_plugin_type in auto_plugins),
            "auto_plugins",
        )

        self.auto_plugins = auto_plugins

        self.is_builtin = check.bool_param(is_builtin, "is_builtin")
        check.invariant(
            self.display_name is not None,
            "All types must have a valid display name, got None for key {}".
            format(key),
        )

        self.kind = check.inst_param(kind, "kind", DagsterTypeKind)
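A hedged sketch of constructing a DagsterType through this constructor with a custom type_check_fn; the "EvenNumber" name is illustrative:

from dagster import DagsterType, TypeCheck

def _check_even(_context, value):
    # The type_check_fn receives the type-check context and the runtime value.
    if not isinstance(value, int):
        return TypeCheck(
            success=False,
            description="Expected an int, got {}.".format(type(value).__name__),
        )
    return TypeCheck(success=value % 2 == 0)

EvenNumber = DagsterType(
    name="EvenNumber",
    type_check_fn=_check_even,
    description="An integer divisible by two.",
)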
Esempio n. 27
0
def hourly_schedule(
    pipeline_name,
    start_date,
    name=None,
    execution_time=datetime.time(0, 0),
    tags_fn_for_date=None,
    solid_selection=None,
    mode="default",
    should_execute=None,
    environment_vars=None,
    end_date=None,
    execution_timezone=None,
):
    """Create a schedule that runs hourly.

    The decorated function will be called as the ``run_config_fn`` of the underlying
    :py:class:`~dagster.ScheduleDefinition` and should take a
    :py:class:`~dagster.ScheduleExecutionContext` as its only argument, returning the run config
    for the scheduled execution.

    Args:
        pipeline_name (str): The name of the pipeline to execute when the schedule runs.
        start_date (datetime.datetime): The date from which to run the schedule.
        name (Optional[str]): The name of the schedule to create. By default, this will be the name
            of the decorated function.
        execution_time (datetime.time): The time at which to execute the schedule. Only the minutes
            component will be respected -- the hour should be 0, and will be ignored if it is not 0.
        tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A
            function that generates tags to attach to the schedule's runs. Takes the date of the
            schedule run and returns a dictionary of tags (string key-value pairs).
        solid_selection (Optional[List[str]]): A list of solid subselection (including single
            solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``
        mode (Optional[str]): The pipeline mode in which to execute this schedule.
            (Default: 'default')
        should_execute (Optional[Callable[ScheduleExecutionContext, bool]]): A function that runs at
            schedule execution time to determine whether a schedule should execute or skip. Takes a
            :py:class:`~dagster.ScheduleExecutionContext` and returns a boolean (``True`` if the
            schedule should execute). Defaults to a function that always returns ``True``.
        environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing
            the schedule.
        end_date (Optional[datetime.datetime]): The last time to run the schedule. Defaults to the
            current time.
        execution_timezone (Optional[str]): Timezone in which the schedule should run. Only works
            with DagsterDaemonScheduler, and must be set when using that scheduler.
    """
    check.opt_str_param(name, "name")
    check.inst_param(start_date, "start_date", datetime.datetime)
    check.opt_inst_param(end_date, "end_date", datetime.datetime)
    check.opt_callable_param(tags_fn_for_date, "tags_fn_for_date")
    check.opt_nullable_list_param(solid_selection,
                                  "solid_selection",
                                  of_type=str)
    mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)
    check.opt_callable_param(should_execute, "should_execute")
    check.opt_dict_param(environment_vars,
                         "environment_vars",
                         key_type=str,
                         value_type=str)
    check.str_param(pipeline_name, "pipeline_name")
    check.inst_param(execution_time, "execution_time", datetime.time)
    check.opt_str_param(execution_timezone, "execution_timezone")

    if start_date.minute != 0 or start_date.second != 0:
        warnings.warn(
            "`start_date` must be at the beginning of the hour for an hourly schedule. "
            "Use `execution_time` to execute the schedule at a specific time within the hour. For "
            "example, to run the schedule each hour at 15 minutes past the hour starting at 3AM "
            "on 10/20/2020, your schedule definition would look like:"
            """
@hourly_schedule(
    start_date=datetime.datetime(2020, 10, 20, 3),
    execution_time=datetime.time(0, 15)
)
def my_schedule_definition(_):
    ...
""")

    if execution_time.hour != 0:
        warnings.warn(
            "Hourly schedule {schedule_name} created with:\n"
            "\tschedule_time=datetime.time(hour={hour}, minute={minute}, ...). "
            "Since this is an hourly schedule, the hour parameter will be ignored and the schedule "
            "will run on the {minute} mark for the previous hour interval. Replace "
            "datetime.time(hour={hour}, minute={minute}, ...) with "
            "datetime.time(minute={minute}, ...) to fix this warning.".format(
                schedule_name=name, hour=execution_time.hour, minute=execution_time.minute))

    cron_schedule = "{minute} * * * *".format(minute=execution_time.minute)

    fmt = (DEFAULT_HOURLY_FORMAT_WITH_TIMEZONE
           if execution_timezone else DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE)

    execution_time_to_partition_fn = lambda d: pendulum.instance(d).subtract(
        hours=1, minutes=(execution_time.minute - start_date.minute) % 60)

    partition_fn = schedule_partition_range(
        start_date,
        end=end_date,
        cron_schedule=cron_schedule,
        fmt=fmt,
        timezone=execution_timezone,
        execution_time_to_partition_fn=execution_time_to_partition_fn,
    )

    def inner(fn):
        check.callable_param(fn, "fn")

        schedule_name = name or fn.__name__

        tags_fn_for_partition_value = lambda partition: {}
        if tags_fn_for_date:
            tags_fn_for_partition_value = lambda partition: tags_fn_for_date(
                partition.value)

        partition_set = PartitionSetDefinition(
            name="{}_partitions".format(schedule_name),
            pipeline_name=pipeline_name,
            partition_fn=partition_fn,
            run_config_fn_for_partition=lambda partition: fn(partition.value),
            solid_selection=solid_selection,
            tags_fn_for_partition=tags_fn_for_partition_value,
            mode=mode,
        )

        return partition_set.create_schedule_definition(
            schedule_name,
            cron_schedule,
            should_execute=should_execute,
            environment_vars=environment_vars,
            partition_selector=create_offset_partition_selector(
                execution_time_to_partition_fn=execution_time_to_partition_fn,
            ),
            execution_timezone=execution_timezone,
        )

    return inner
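A hedged usage example of the decorator above, running at 15 minutes past each hour; the pipeline name, solid name, and run-config keys are illustrative:

import datetime

from dagster import hourly_schedule

@hourly_schedule(
    pipeline_name="my_pipeline",
    start_date=datetime.datetime(2020, 10, 20, 3),
    execution_time=datetime.time(0, 15),
    execution_timezone="US/Central",
)
def my_hourly_schedule(date):
    # `date` is the partition's datetime; build the run config for that hour.
    return {"solids": {"process_data": {"config": {"date": date.strftime("%Y-%m-%d %H:%M:%S")}}}}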
Esempio n. 28
0
 def __init__(self, resource_fn, config_field=None, description=None):
     self._resource_fn = check.callable_param(resource_fn, 'resource_fn')
     self._config_field = check_user_facing_opt_field_param(
         config_field, 'config_field',
         'of a ResourceDefinition or @resource')
     self._description = check.opt_str_param(description, 'description')
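A hedged sketch of constructing a ResourceDefinition directly with this constructor; the DatabaseConnection class and the connection string are made up:

from dagster import ResourceDefinition

class DatabaseConnection(object):
    def __init__(self, url):
        self.url = url

database_resource = ResourceDefinition(
    resource_fn=lambda init_context: DatabaseConnection('postgresql://localhost:5432/analytics'),
    description='Provides a database connection to solids that request it.',
)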
Esempio n. 29
0
    def __init__(
        self,
        name,
        cron_schedule,
        pipeline_name,
        environment_dict=None,
        environment_dict_fn=None,
        tags=None,
        tags_fn=None,
        solid_selection=None,
        mode="default",
        should_execute=None,
        environment_vars=None,
    ):
        check.str_param(name, 'name')
        check.str_param(cron_schedule, 'cron_schedule')
        check.str_param(pipeline_name, 'pipeline_name')
        check.opt_dict_param(environment_dict, 'environment_dict')
        check.opt_callable_param(environment_dict_fn, 'environment_dict_fn')
        check.opt_dict_param(tags, 'tags', key_type=str, value_type=str)
        check.opt_callable_param(tags_fn, 'tags_fn')
        check.opt_nullable_list_param(solid_selection,
                                      'solid_selection',
                                      of_type=str)
        mode = check.opt_str_param(mode, 'mode', DEFAULT_MODE_NAME)
        check.opt_callable_param(should_execute, 'should_execute')
        check.opt_dict_param(environment_vars,
                             'environment_vars',
                             key_type=str,
                             value_type=str)

        if environment_dict_fn and environment_dict:
            raise DagsterInvalidDefinitionError(
                'Attempted to provide both environment_dict_fn and environment_dict as arguments'
                ' to ScheduleDefinition. Must provide only one of the two.')

        if tags_fn and tags:
            raise DagsterInvalidDefinitionError(
                'Attempted to provide both tags_fn and tags as arguments'
                ' to ScheduleDefinition. Must provide only one of the two.')

        if not environment_dict and not environment_dict_fn:
            environment_dict_fn = lambda _context: {}

        if not tags and not tags_fn:
            tags_fn = lambda _context: {}

        if not should_execute:
            should_execute = lambda _context: True

        self._schedule_definition_data = ScheduleDefinitionData(
            name=check.str_param(name, 'name'),
            cron_schedule=check.str_param(cron_schedule, 'cron_schedule'),
            environment_vars=check.opt_dict_param(environment_vars,
                                                  'environment_vars'),
        )

        self._environment_dict = environment_dict
        self._environment_dict_fn = environment_dict_fn
        self._tags = tags
        self._tags_fn = tags_fn
        self._should_execute = should_execute
        self._mode = mode
        self._pipeline_name = pipeline_name
        self._solid_selection = solid_selection
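A hedged usage sketch for this (older) constructor, which still accepts environment_dict and tags; the pipeline, solid, and tag names are illustrative:

nightly_schedule = ScheduleDefinition(
    name='nightly_ingest',
    cron_schedule='0 2 * * *',
    pipeline_name='ingest_pipeline',
    environment_dict={'solids': {'ingest': {'config': {'full_refresh': True}}}},
    tags={'team': 'data-platform'},
)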
Esempio n. 30
0
 def __init__(self, project=None):
     check.opt_str_param(project, 'project')
     super(BigQueryClient, self).__init__(project=project)
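Assuming the base class is google.cloud.bigquery.Client (which the super().__init__(project=project) call suggests), the usual client interface is available after construction; a hedged sketch with an illustrative project id and query:

client = BigQueryClient(project='my-gcp-project')
rows = client.query('SELECT 1 AS one').result()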