Example #1
def evaluate_composite_config(context):
    check.inst_param(context, 'context', TraversalContext)
    check.param_invariant(context.config_type.is_composite, 'composite_type')

    fields = context.config_type.fields

    if context.config_value and not isinstance(context.config_value, dict):
        return EvaluateValueResult.for_error(
            create_composite_type_mismatch_error(context))

    evaluate_value_result = _evaluate_composite_solid_config(context)
    if evaluate_value_result.errors or evaluate_value_result.value:
        return evaluate_value_result

    # ASK: this can crash on user error
    config_value = check.opt_dict_param(context.config_value,
                                        'incoming_value',
                                        key_type=str)

    defined_fields = set(fields.keys())
    incoming_fields = set(config_value.keys())
    extra_fields = list(incoming_fields - defined_fields)

    # We'll build up a dict of processed config values below
    errors = []
    output_config_value = {}

    # Here, we support permissive composites. In cases where we know the set of permissible keys a
    # priori, we validate against the config:
    if not context.config_type.is_permissive_composite:
        if extra_fields:
            if len(extra_fields) == 1:
                errors.append(
                    create_field_not_defined_error(context, extra_fields[0]))
            else:
                errors.append(
                    create_fields_not_defined_error(context, extra_fields))

    # And for permissive fields, we just pass along to the output without further validation
    else:
        for field_name in extra_fields:
            output_config_value[field_name] = config_value[field_name]

    # ...However, for any fields the user *has* told us about, we validate against their config
    # specifications
    missing_fields = []

    for key, field_def in fields.items():
        if key in incoming_fields:
            evaluate_value_result = _evaluate_config(
                context.for_field(field_def, key,
                                  context.config_value.get(key, {})))
            if evaluate_value_result.errors:
                errors += evaluate_value_result.errors
            else:
                output_config_value[key] = evaluate_value_result.value
        elif is_solid_dict(
                field_def.config_type) and context.config_value is not None:
            evaluate_value_result = _evaluate_config(
                context.for_field(field_def, key,
                                  context.config_value.get(key, {})))
            if evaluate_value_result.errors:
                missing_fields.append(key)
            else:
                output_config_value[key] = evaluate_value_result.value
        elif field_def.is_optional:
            # Try to see if this is a composite solid
            speculative_composite_solid_result = _evaluate_composite_solid_config(
                context.for_field(
                    field_def, key, field_def.default_value
                    if field_def.default_provided else {}))
            if speculative_composite_solid_result.value is not None:
                output_config_value[
                    key] = speculative_composite_solid_result.value
            else:
                if field_def.default_provided:
                    output_config_value[key] = field_def.default_value

        else:
            check.invariant(not field_def.default_provided)
            missing_fields.append(key)

    if missing_fields:
        if len(missing_fields) == 1:
            errors.append(
                create_missing_required_field_error(context,
                                                    missing_fields[0]))
        else:
            errors.append(
                create_missing_required_fields_error(context, missing_fields))

    if errors:
        return EvaluateValueResult.for_errors(errors)
    else:
        return EvaluateValueResult.for_value(output_config_value)
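
A note on the ``# ASK`` comment above: the guard at the top only returns a structured error when ``config_value`` is truthy and not a dict, so a falsy non-dict value (``0``, ``''``, ``[]``) slips through to ``check.opt_dict_param``, which raises rather than producing an ``EvaluateValueResult`` error. A minimal sketch of that behavior, assuming ``check`` is ``dagster.check`` and ``ParameterCheckError`` is its error type (as in the tests further below):

from dagster import check
from dagster.check import ParameterCheckError

# None and {} both normalize to an empty dict
assert check.opt_dict_param(None, 'config_value', key_type=str) == {}

# A falsy non-dict passes the isinstance guard above but raises here,
# which is exactly what the '# ASK: this can crash on user error' comment flags.
try:
    check.opt_dict_param([], 'config_value', key_type=str)
except ParameterCheckError:
    pass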
Example #2
def monthly_schedule(
    pipeline_name,
    start_date,
    name=None,
    execution_day_of_month=1,
    execution_time=datetime.time(0, 0),
    tags_fn_for_date=None,
    solid_selection=None,
    mode="default",
    should_execute=None,
    environment_vars=None,
    end_date=None,
):
    '''Create a schedule that runs monthly.

    The decorated function should accept a datetime object as its only argument. The datetime
    represents the date partition that it's meant to run on, and the function should return the
    environment dict for the scheduled execution (it is wired in as the
    ``run_config_fn_for_partition`` of the underlying partition set).

    Args:
        pipeline_name (str): The name of the pipeline to execute when the schedule runs.
        start_date (datetime.datetime): The date from which to run the schedule.
        name (Optional[str]): The name of the schedule to create.
        execution_day_of_month (int): The day of the month on which to run the schedule (must be
            between 1 and 31).
        execution_time (datetime.time): The time at which to execute the schedule.
        tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A
            function that generates tags to attach to the schedule's runs. Takes the date of the
            schedule run and returns a dictionary of tags (string key-value pairs).
        solid_selection (Optional[List[str]]): A list of solid selection queries (including single
            solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``
        mode (Optional[str]): The pipeline mode in which to execute this schedule.
            (Default: 'default')
        should_execute (Optional[Callable[ScheduleExecutionContext, bool]]): A function that runs at
            schedule execution time to determine whether a schedule should execute or skip. Takes a
            :py:class:`~dagster.ScheduleExecutionContext` and returns a boolean (``True`` if the
            schedule should execute). Defaults to a function that always returns ``True``.
        environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing
            the schedule.
        end_date (Optional[datetime.datetime]): The last date up to which to run the schedule.
            Defaults to the current time.
    '''
    check.opt_str_param(name, 'name')
    check.inst_param(start_date, 'start_date', datetime.datetime)
    check.opt_inst_param(end_date, 'end_date', datetime.datetime)
    check.opt_callable_param(tags_fn_for_date, 'tags_fn_for_date')
    check.opt_nullable_list_param(solid_selection,
                                  'solid_selection',
                                  of_type=str)
    mode = check.opt_str_param(mode, 'mode', DEFAULT_MODE_NAME)
    check.opt_callable_param(should_execute, 'should_execute')
    check.opt_dict_param(environment_vars,
                         'environment_vars',
                         key_type=str,
                         value_type=str)
    check.str_param(pipeline_name, 'pipeline_name')
    check.int_param(execution_day_of_month, 'execution_day_of_month')
    check.inst_param(execution_time, 'execution_time', datetime.time)

    if execution_day_of_month <= 0 or execution_day_of_month > 31:
        raise DagsterInvalidDefinitionError(
            '`execution_day_of_month={}` is not valid for monthly schedule. Execution day must be '
            'between 1 and 31'.format(execution_day_of_month))

    cron_schedule = '{minute} {hour} {day} * *'.format(
        minute=execution_time.minute,
        hour=execution_time.hour,
        day=execution_day_of_month)

    partition_fn = date_partition_range(start_date,
                                        end=end_date,
                                        delta=relativedelta(months=1),
                                        fmt="%Y-%m")

    def inner(fn):
        check.callable_param(fn, 'fn')

        schedule_name = name or fn.__name__

        tags_fn_for_partition_value = lambda partition: {}
        if tags_fn_for_date:
            tags_fn_for_partition_value = lambda partition: tags_fn_for_date(
                partition.value)

        partition_set = PartitionSetDefinition(
            name='{}_partitions'.format(schedule_name),
            pipeline_name=pipeline_name,
            partition_fn=partition_fn,
            run_config_fn_for_partition=lambda partition: fn(partition.value),
            solid_selection=solid_selection,
            tags_fn_for_partition=tags_fn_for_partition_value,
            mode=mode,
        )

        return partition_set.create_schedule_definition(
            schedule_name,
            cron_schedule,
            should_execute=should_execute,
            environment_vars=environment_vars,
        )

    return inner
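
A hedged usage sketch for the decorator above; the pipeline name and run-config shape are hypothetical. Note that the decorated function receives the partition date (via ``run_config_fn_for_partition=lambda partition: fn(partition.value)``), not a ScheduleExecutionContext:

import datetime

from dagster import monthly_schedule  # assuming the top-level export

@monthly_schedule(
    pipeline_name='my_pipeline',  # hypothetical pipeline
    start_date=datetime.datetime(2020, 1, 1),
    execution_day_of_month=5,
    execution_time=datetime.time(3, 0),
    environment_vars={'ENV': 'prod'},  # validated by check.opt_dict_param above
)
def my_monthly_schedule(date):
    # Build the run config for the month partition being filled in.
    return {'solids': {'process': {'config': {'month': date.strftime('%Y-%m')}}}}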
Example #3
def weekly_schedule(
    pipeline_name: str,
    start_date: datetime.datetime,
    name: Optional[str] = None,
    execution_day_of_week: int = 0,
    execution_time: datetime.time = datetime.time(0, 0),
    tags_fn_for_date: Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]] = None,
    solid_selection: Optional[List[str]] = None,
    mode: Optional[str] = "default",
    should_execute: Optional[Callable[["ScheduleExecutionContext"], bool]] = None,
    environment_vars: Optional[Dict[str, str]] = None,
    end_date: Optional[datetime.datetime] = None,
    execution_timezone: Optional[str] = None,
    partition_weeks_offset: Optional[int] = 1,
    description: Optional[str] = None,
) -> Callable[[Callable[[datetime.datetime], Dict[str, Any]]], PartitionScheduleDefinition]:
    """Create a partitioned schedule that runs daily.

    The decorated function should accept a datetime object as its only argument. The datetime
    represents the date partition that it's meant to run on.

    The decorated function should return a run configuration dictionary, which will be used as
    configuration for the scheduled run.

    The decorator produces a :py:class:`~dagster.PartitionScheduleDefinition`.

    Args:
        pipeline_name (str): The name of the pipeline to execute when the schedule runs.
        start_date (datetime.datetime): The date from which to run the schedule.
        name (Optional[str]): The name of the schedule to create.
        execution_day_of_week (int): The day of the week on which to run the schedule. Must be
            between 0 (Sunday) and 6 (Saturday).
        execution_time (datetime.time): The time at which to execute the schedule.
        tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A
            function that generates tags to attach to the schedule's runs. Takes the date of the
            schedule run and returns a dictionary of tags (string key-value pairs).
        solid_selection (Optional[List[str]]): A list of solid selection queries (including single
            solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``
        mode (Optional[str]): The pipeline mode in which to execute this schedule.
            (Default: 'default')
        should_execute (Optional[Callable[ScheduleExecutionContext, bool]]): A function that runs at
            schedule execution time to determine whether a schedule should execute or skip. Takes a
            :py:class:`~dagster.ScheduleExecutionContext` and returns a boolean (``True`` if the
            schedule should execute). Defaults to a function that always returns ``True``.
        environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing
            the schedule.
        end_date (Optional[datetime.datetime]): The last date up to which to run the schedule.
            Defaults to the current time.
        execution_timezone (Optional[str]): Timezone in which the schedule should run. Only works
            with DagsterDaemonScheduler, and must be set when using that scheduler.
        partition_weeks_offset (Optional[int]): How many weeks back to go when choosing the partition
            for a given schedule execution. For example, when partition_weeks_offset=1, the schedule
            that executes during week N will fill in the partition for week N-1.
            (Default: 1)
        description (Optional[str]): A human-readable description of the schedule.
    """
    check.opt_str_param(name, "name")
    check.inst_param(start_date, "start_date", datetime.datetime)
    check.opt_inst_param(end_date, "end_date", datetime.datetime)
    check.opt_callable_param(tags_fn_for_date, "tags_fn_for_date")
    check.opt_nullable_list_param(solid_selection, "solid_selection", of_type=str)
    mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)
    check.opt_callable_param(should_execute, "should_execute")
    check.opt_dict_param(environment_vars, "environment_vars", key_type=str, value_type=str)
    check.str_param(pipeline_name, "pipeline_name")
    check.int_param(execution_day_of_week, "execution_day_of_week")
    check.inst_param(execution_time, "execution_time", datetime.time)
    check.opt_str_param(execution_timezone, "execution_timezone")
    check.opt_int_param(partition_weeks_offset, "partition_weeks_offset")
    check.opt_str_param(description, "description")

    if start_date.hour != 0 or start_date.minute != 0 or start_date.second != 0:
        warnings.warn(
            "`start_date` must be at the beginning of a day for a weekly schedule. "
            "Use `execution_time` to execute the schedule at a specific time of day. For example, "
            "to run the schedule at 3AM each Tuesday starting on 10/20/2020, your schedule "
            "definition would look like:"
            """
@weekly_schedule(
    start_date=datetime.datetime(2020, 10, 20),
    execution_day_of_week=2,
    execution_time=datetime.time(3, 0)
)
def my_schedule_definition(_):
    ...
"""
        )

    if execution_day_of_week < 0 or execution_day_of_week >= 7:
        raise DagsterInvalidDefinitionError(
            "`execution_day_of_week={}` is not valid for weekly schedule. Execution day must be "
            "between 0 [Sunday] and 6 [Saturday]".format(execution_day_of_week)
        )

    cron_schedule = "{minute} {hour} * * {day}".format(
        minute=execution_time.minute, hour=execution_time.hour, day=execution_day_of_week
    )

    fmt = DEFAULT_DATE_FORMAT

    day_difference = (execution_day_of_week - (start_date.weekday() + 1)) % 7

    execution_time_to_partition_fn = (
        lambda d: pendulum.instance(d)
        .replace(hour=0, minute=0)
        .subtract(weeks=partition_weeks_offset, days=day_difference)
    )

    partition_fn = schedule_partition_range(
        start_date,
        end=end_date,
        cron_schedule=cron_schedule,
        fmt=fmt,
        timezone=execution_timezone,
        execution_time_to_partition_fn=execution_time_to_partition_fn,
        inclusive=(partition_weeks_offset == 0),
    )

    def inner(fn: Callable[[datetime.datetime], Dict[str, Any]]) -> PartitionScheduleDefinition:
        check.callable_param(fn, "fn")

        schedule_name = name or fn.__name__

        tags_fn_for_partition_value: Callable[
            ["Partition"], Optional[Dict[str, str]]
        ] = lambda partition: {}
        if tags_fn_for_date:
            tags_fn = cast(
                Callable[[datetime.datetime], Optional[Dict[str, str]]], tags_fn_for_date
            )
            tags_fn_for_partition_value = lambda partition: tags_fn(partition.value)

        partition_set = PartitionSetDefinition(
            name="{}_partitions".format(schedule_name),
            pipeline_name=pipeline_name,
            partition_fn=partition_fn,
            run_config_fn_for_partition=lambda partition: fn(partition.value),
            solid_selection=solid_selection,
            tags_fn_for_partition=tags_fn_for_partition_value,
            mode=mode,
        )

        return partition_set.create_schedule_definition(
            schedule_name,
            cron_schedule,
            should_execute=should_execute,
            environment_vars=environment_vars,
            partition_selector=create_offset_partition_selector(
                execution_time_to_partition_fn=execution_time_to_partition_fn,
            ),
            execution_timezone=execution_timezone,
            description=description,
        )

    return inner
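
Similarly, a hedged sketch for ``weekly_schedule``; the names and config shape are hypothetical. Per the docstring, ``execution_day_of_week`` runs from 0 (Sunday) to 6 (Saturday), and ``start_date`` should fall at the beginning of a day to avoid the warning above:

import datetime

from dagster import weekly_schedule  # assuming the top-level export

@weekly_schedule(
    pipeline_name="my_pipeline",  # hypothetical pipeline
    start_date=datetime.datetime(2020, 10, 18),
    execution_day_of_week=2,  # 0 (Sunday) through 6 (Saturday)
    execution_time=datetime.time(3, 0),
)
def my_weekly_schedule(date):
    # Build the run config for the week partition being filled in.
    return {"solids": {"process": {"config": {"week_of": date.strftime("%Y-%m-%d")}}}}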
Example #4
    def __new__(
        cls,
        pipeline_name=None,
        run_id=None,
        environment_dict=None,
        mode=None,
        solid_selection=None,
        solids_to_execute=None,
        step_keys_to_execute=None,
        status=None,
        tags=None,
        root_run_id=None,
        parent_run_id=None,
        pipeline_snapshot_id=None,
        execution_plan_snapshot_id=None,
        ## GRAVEYARD BELOW
        # see https://github.com/dagster-io/dagster/issues/2372 for explanation
        previous_run_id=None,
        selector=None,
        solid_subset=None,
    ):
        # a frozenset which contains the names of the solids to execute
        check.opt_set_param(solids_to_execute, 'solids_to_execute', of_type=str)
        # a list of solid queries provided by the user
        # possible to be None when only solids_to_execute is set by the user directly
        check.opt_list_param(solid_selection, 'solid_selection', of_type=str)

        check.opt_list_param(step_keys_to_execute, 'step_keys_to_execute', of_type=str)

        check.opt_str_param(root_run_id, 'root_run_id')
        check.opt_str_param(parent_run_id, 'parent_run_id')

        check.invariant(
            (root_run_id is not None and parent_run_id is not None)
            or (root_run_id is None and parent_run_id is None),
            (
                'Must set both root_run_id and parent_run_id when creating a PipelineRun that '
                'belongs to a run group'
            ),
        )

        # Compatibility
        # ----------------------------------------------------------------------------------------
        # Historical runs may have previous_run_id set, in which case
        # that previous ID becomes both the root and the parent
        if previous_run_id:
            if not (parent_run_id and root_run_id):
                parent_run_id = previous_run_id
                root_run_id = previous_run_id

        check.opt_inst_param(selector, 'selector', ExecutionSelector)
        if selector:
            check.invariant(
                pipeline_name is None or selector.name == pipeline_name,
                (
                    'Conflicting pipeline name {pipeline_name} in arguments to PipelineRun: '
                    'selector was passed with pipeline {selector_pipeline}'.format(
                        pipeline_name=pipeline_name, selector_pipeline=selector.name
                    )
                ),
            )
            if pipeline_name is None:
                pipeline_name = selector.name

            check.invariant(
                solids_to_execute is None or set(selector.solid_subset) == solids_to_execute,
                (
                    'Conflicting solids_to_execute {solids_to_execute} in arguments to PipelineRun: '
                    'selector was passed with subset {selector_subset}'.format(
                        solids_to_execute=solids_to_execute, selector_subset=selector.solid_subset
                    )
                ),
            )
            # for old runs that only have selector but no solids_to_execute
            if solids_to_execute is None:
                solids_to_execute = (
                    frozenset(selector.solid_subset) if selector.solid_subset else None
                )

        # for old runs that specified list-type solid_subset
        check.opt_list_param(solid_subset, 'solid_subset', of_type=str)
        if solid_subset:
            solids_to_execute = frozenset(solid_subset)
        # ----------------------------------------------------------------------------------------

        return super(PipelineRun, cls).__new__(
            cls,
            pipeline_name=check.opt_str_param(pipeline_name, 'pipeline_name'),
            run_id=check.opt_str_param(run_id, 'run_id', default=make_new_run_id()),
            environment_dict=check.opt_dict_param(
                environment_dict, 'environment_dict', key_type=str
            ),
            mode=check.opt_str_param(mode, 'mode'),
            solid_selection=solid_selection,
            solids_to_execute=solids_to_execute,
            step_keys_to_execute=step_keys_to_execute,
            status=check.opt_inst_param(
                status, 'status', PipelineRunStatus, PipelineRunStatus.NOT_STARTED
            ),
            tags=check.opt_dict_param(tags, 'tags', key_type=str),
            root_run_id=root_run_id,
            parent_run_id=parent_run_id,
            pipeline_snapshot_id=check.opt_str_param(pipeline_snapshot_id, 'pipeline_snapshot_id'),
            execution_plan_snapshot_id=check.opt_str_param(
                execution_plan_snapshot_id, 'execution_plan_snapshot_id'
            ),
        )
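
A hedged illustration of the compatibility shim above: constructing a run with only the legacy ``previous_run_id`` should populate both ``root_run_id`` and ``parent_run_id``. The pipeline name and run ids here are illustrative:

run = PipelineRun(pipeline_name='my_pipeline', previous_run_id='abc-123')
assert run.root_run_id == 'abc-123'
assert run.parent_run_id == 'abc-123'

# The modern spelling sets both fields explicitly:
retry_run = PipelineRun(
    pipeline_name='my_pipeline', root_run_id='abc-123', parent_run_id='def-456'
)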
Example #5
    def __new__(cls, data: Optional[Dict[str, Any]]):
        return super(JsonMetadataEntryData, cls).__new__(
            cls, check.opt_dict_param(data, "data", key_type=str)
        )
Example #6
def test_opt_dict_param():
    assert check.opt_dict_param(None, 'opt_dict_param') == {}
    assert check.opt_dict_param({}, 'opt_dict_param') == {}
    ddict = {'a': 2}
    assert check.opt_dict_param(ddict, 'opt_dict_param') == ddict

    with pytest.raises(ParameterCheckError):
        check.opt_dict_param(0, 'opt_dict_param')

    with pytest.raises(ParameterCheckError):
        check.opt_dict_param(1, 'opt_dict_param')

    with pytest.raises(ParameterCheckError):
        check.opt_dict_param('foo', 'opt_dict_param')

    with pytest.raises(ParameterCheckError):
        check.opt_dict_param(['foo'], 'opt_dict_param')

    with pytest.raises(ParameterCheckError):
        check.opt_dict_param([], 'opt_dict_param')
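
The test above pins down the core contract: ``None`` and ``{}`` both normalize to ``{}``, and any non-dict raises. A minimal sketch satisfying just this test; the real ``dagster.check`` implementation also enforces ``key_type``/``value_type``, which is elided here:

def opt_dict_param(obj, param_name, key_type=None, value_type=None):
    # None is treated as "not provided" and normalized to an empty dict.
    if obj is None:
        return {}
    if not isinstance(obj, dict):
        raise ParameterCheckError(
            'Param "{name}" is not a dict. Got {value!r} of type {type_}.'.format(
                name=param_name, value=obj, type_=type(obj)))
    # key_type/value_type validation elided; see test_opt_dict_param_with_type below.
    return obj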
Example #7
def test_opt_dict_param_with_type():
    str_to_int = {"str": 1}
    assert check.opt_dict_param(str_to_int, "str_to_int", key_type=str, value_type=int)
    assert check.opt_dict_param(str_to_int, "str_to_int", value_type=int)
    assert check.opt_dict_param(str_to_int, "str_to_int", key_type=str)
    assert check.opt_dict_param(str_to_int, "str_to_int")

    assert check.opt_dict_param({}, "str_to_int", key_type=str, value_type=int) == {}
    assert check.opt_dict_param({}, "str_to_int", value_type=int) == {}
    assert check.opt_dict_param({}, "str_to_int", key_type=str) == {}
    assert check.opt_dict_param({}, "str_to_int") == {}

    assert check.opt_dict_param(None, "str_to_int", key_type=str, value_type=int) == {}
    assert check.opt_dict_param(None, "str_to_int", value_type=int) == {}
    assert check.opt_dict_param(None, "str_to_int", key_type=str) == {}
    assert check.opt_dict_param(None, "str_to_int") == {}

    assert check.opt_dict_param(
        {"str": 1, "str2": "str", 1: "str", 2: "str"},
        "multi_type_dict",
        key_type=(str, int),
        value_type=(str, int),
    )

    class Wrong:
        pass

    with pytest.raises(CheckError):
        assert check.opt_dict_param(str_to_int, "str_to_int", key_type=Wrong, value_type=Wrong)

    with pytest.raises(CheckError):
        assert check.opt_dict_param(str_to_int, "str_to_int", key_type=Wrong, value_type=int)

    with pytest.raises(CheckError):
        assert check.opt_dict_param(str_to_int, "str_to_int", key_type=str, value_type=Wrong)

    with pytest.raises(CheckError):
        assert check.opt_dict_param(str_to_int, "str_to_int", key_type=Wrong)

    with pytest.raises(CheckError):
        assert check.opt_dict_param(str_to_int, "str_to_int", value_type=Wrong)

    class AlsoWrong:
        pass

    with pytest.raises(CheckError):
        assert check.dict_param(str_to_int, "str_to_int", key_type=(Wrong, AlsoWrong))

    with pytest.raises(CheckError):
        assert check.dict_param(str_to_int, "str_to_int", value_type=(Wrong, AlsoWrong))
Example #8
    def create_schedule_definition(
        self,
        schedule_name,
        cron_schedule,
        should_execute=None,
        partition_selector=last_partition,
        environment_vars=None,
    ):
        '''Create a ScheduleDefinition from a PartitionSetDefinition.

        Arguments:
            schedule_name (str): The name of the schedule.
            cron_schedule (str): A valid cron string for the schedule.
            should_execute (Optional[function]): Function that runs at schedule execution time that
                determines whether a schedule should execute. Defaults to a function that always
                returns ``True``.
            partition_selector (Callable[[ScheduleExecutionContext, PartitionSetDefinition],
                Partition]): A partition selector for the schedule.
            environment_vars (Optional[dict]): The environment variables to set for the schedule.

        Returns:
            ScheduleDefinition: The generated ScheduleDefinition for the partition selector
        '''

        check.str_param(schedule_name, 'schedule_name')
        check.str_param(cron_schedule, 'cron_schedule')
        check.opt_callable_param(should_execute, 'should_execute')
        check.opt_dict_param(environment_vars,
                             'environment_vars',
                             key_type=str,
                             value_type=str)
        check.callable_param(partition_selector, 'partition_selector')

        def _should_execute_wrapper(context):
            check.inst_param(context, 'context', ScheduleExecutionContext)
            selected_partition = partition_selector(context, self)
            if not selected_partition:
                return False
            elif not should_execute:
                return True
            else:
                return should_execute(context)

        def _run_config_fn_wrapper(context):
            check.inst_param(context, 'context', ScheduleExecutionContext)
            selected_partition = partition_selector(context, self)
            if not selected_partition:
                raise DagsterInvariantViolationError(
                    "The partition selection function `{selector}` did not return "
                    "a partition from PartitionSet {partition_set}".format(
                        selector=getattr(partition_selector, '__name__',
                                         repr(partition_selector)),
                        partition_set=self.name,
                    ))

            return self.run_config_for_partition(selected_partition)

        def _tags_fn_wrapper(context):
            check.inst_param(context, 'context', ScheduleExecutionContext)
            selected_partition = partition_selector(context, self)
            if not selected_partition:
                raise DagsterInvariantViolationError(
                    "The partition selection function `{selector}` did not return "
                    "a partition from PartitionSet {partition_set}".format(
                        selector=getattr(partition_selector, '__name__',
                                         repr(partition_selector)),
                        partition_set=self.name,
                    ))

            return self.tags_for_partition(selected_partition)

        return PartitionScheduleDefinition(
            name=schedule_name,
            cron_schedule=cron_schedule,
            pipeline_name=self.pipeline_name,
            run_config_fn=_run_config_fn_wrapper,
            tags_fn=_tags_fn_wrapper,
            solid_selection=self.solid_selection,
            mode=self.mode,
            should_execute=_should_execute_wrapper,
            environment_vars=environment_vars,
            partition_set=self,
        )
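
A hedged wiring sketch for the factory above; the partition set is hypothetical and uses only parameters that appear elsewhere in these examples:

from dagster import Partition, PartitionSetDefinition  # assuming top-level exports

partition_set = PartitionSetDefinition(
    name='date_partitions',
    pipeline_name='my_pipeline',  # hypothetical pipeline
    partition_fn=lambda: [Partition('2020-01'), Partition('2020-02')],
    run_config_fn_for_partition=lambda partition: {'date': partition.value},
)

schedule_def = partition_set.create_schedule_definition(
    schedule_name='my_partitioned_schedule',
    cron_schedule='0 0 1 * *',
    environment_vars={'ENV': 'prod'},  # validated by check.opt_dict_param above
)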
Example #9
def monthly_schedule(
    pipeline_name,
    start_date,
    name=None,
    execution_day_of_month=1,
    execution_time=datetime.time(0, 0),
    tags_fn_for_date=None,
    solid_selection=None,
    mode="default",
    should_execute=None,
    environment_vars=None,
    end_date=None,
    execution_timezone=None,
):
    """Create a schedule that runs monthly.

    The decorated function should accept a datetime object as its only argument. The datetime
    represents the date partition that it's meant to run on, and the function should return the
    environment dict for the scheduled execution (it is wired in as the
    ``run_config_fn_for_partition`` of the underlying partition set).

    Args:
        pipeline_name (str): The name of the pipeline to execute when the schedule runs.
        start_date (datetime.datetime): The date from which to run the schedule.
        name (Optional[str]): The name of the schedule to create.
        execution_day_of_month (int): The day of the month on which to run the schedule (must be
            between 1 and 31).
        execution_time (datetime.time): The time at which to execute the schedule.
        tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A
            function that generates tags to attach to the schedule's runs. Takes the date of the
            schedule run and returns a dictionary of tags (string key-value pairs).
        solid_selection (Optional[List[str]]): A list of solid selection queries (including single
            solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``
        mode (Optional[str]): The pipeline mode in which to execute this schedule.
            (Default: 'default')
        should_execute (Optional[Callable[ScheduleExecutionContext, bool]]): A function that runs at
            schedule execution time to determine whether a schedule should execute or skip. Takes a
            :py:class:`~dagster.ScheduleExecutionContext` and returns a boolean (``True`` if the
            schedule should execute). Defaults to a function that always returns ``True``.
        environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing
            the schedule.
        end_date (Optional[datetime.datetime]): The last date up to which to run the schedule.
            Defaults to the current time.
        execution_timezone (Optional[str]): Timezone in which the schedule should run. Only works
            with DagsterDaemonScheduler, and must be set when using that scheduler.
    """
    check.opt_str_param(name, "name")
    check.inst_param(start_date, "start_date", datetime.datetime)
    check.opt_inst_param(end_date, "end_date", datetime.datetime)
    check.opt_callable_param(tags_fn_for_date, "tags_fn_for_date")
    check.opt_nullable_list_param(solid_selection,
                                  "solid_selection",
                                  of_type=str)
    mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)
    check.opt_callable_param(should_execute, "should_execute")
    check.opt_dict_param(environment_vars,
                         "environment_vars",
                         key_type=str,
                         value_type=str)
    check.str_param(pipeline_name, "pipeline_name")
    check.int_param(execution_day_of_month, "execution_day_of_month")
    check.inst_param(execution_time, "execution_time", datetime.time)
    check.opt_str_param(execution_timezone, "execution_timezone")

    if (start_date.day != 1 or start_date.hour != 0 or start_date.minute != 0
            or start_date.second != 0):
        warnings.warn(
            "`start_date` must be at the beginning of the first day of the month for a monthly "
            "schedule. Use `execution_day_of_month` and `execution_time` to execute the schedule "
            "at a specific time within the month. For example, to run the schedule at 3AM on the "
            "23rd of each month starting in October, your schedule definition would look like:"
            """
@monthly_schedule(
    start_date=datetime.datetime(2020, 10, 1),
    execution_day_of_month=23,
    execution_time=datetime.time(3, 0)
)
def my_schedule_definition(_):
    ...
""")

    if execution_day_of_month <= 0 or execution_day_of_month > 31:
        raise DagsterInvalidDefinitionError(
            "`execution_day_of_month={}` is not valid for monthly schedule. Execution day must be "
            "between 1 and 31".format(execution_day_of_month))

    cron_schedule = "{minute} {hour} {day} * *".format(
        minute=execution_time.minute,
        hour=execution_time.hour,
        day=execution_day_of_month)

    fmt = DEFAULT_MONTHLY_FORMAT

    execution_time_to_partition_fn = (lambda d: pendulum.instance(d).replace(
        hour=0, minute=0).subtract(months=1, days=execution_day_of_month - 1))

    partition_fn = schedule_partition_range(
        start_date,
        end=end_date,
        cron_schedule=cron_schedule,
        fmt=fmt,
        timezone=execution_timezone,
        execution_time_to_partition_fn=execution_time_to_partition_fn,
    )

    def inner(fn):
        check.callable_param(fn, "fn")

        schedule_name = name or fn.__name__

        tags_fn_for_partition_value = lambda partition: {}
        if tags_fn_for_date:
            tags_fn_for_partition_value = lambda partition: tags_fn_for_date(
                partition.value)

        partition_set = PartitionSetDefinition(
            name="{}_partitions".format(schedule_name),
            pipeline_name=pipeline_name,
            partition_fn=partition_fn,
            run_config_fn_for_partition=lambda partition: fn(partition.value),
            solid_selection=solid_selection,
            tags_fn_for_partition=tags_fn_for_partition_value,
            mode=mode,
        )

        return partition_set.create_schedule_definition(
            schedule_name,
            cron_schedule,
            should_execute=should_execute,
            environment_vars=environment_vars,
            partition_selector=create_offset_partition_selector(
                execution_time_to_partition_fn=execution_time_to_partition_fn),
            execution_timezone=execution_timezone,
        )

    return inner
Example #10
    def to_job(
        self,
        name: Optional[str] = None,
        description: Optional[str] = None,
        resource_defs: Optional[Dict[str, ResourceDefinition]] = None,
        config: Union[ConfigMapping, Dict[str, Any],
                      "PartitionedConfig"] = None,
        tags: Optional[Dict[str, Any]] = None,
        logger_defs: Optional[Dict[str, LoggerDefinition]] = None,
        executor_def: Optional["ExecutorDefinition"] = None,
        hooks: Optional[AbstractSet[HookDefinition]] = None,
        op_retry_policy: Optional[RetryPolicy] = None,
        version_strategy: Optional[VersionStrategy] = None,
        op_selection: Optional[List[str]] = None,
        partitions_def: Optional["PartitionsDefinition"] = None,
    ) -> "JobDefinition":
        """
        Make this graph into an executable Job by providing the remaining components required for execution.

        Args:
            name (Optional[str]):
                The name for the Job. Defaults to the name of this graph.
            resource_defs (Optional[Dict[str, ResourceDefinition]]):
                Resources that are required by this graph for execution.
                If not defined, `io_manager` will default to filesystem.
            config:
                Describes how the job is parameterized at runtime.

                If no value is provided, then the schema for the job's run config is a standard
                format based on its solids and resources.

                If a dictionary is provided, then it must conform to the standard config schema, and
                it will be used as the job's run config for the job whenever the job is executed.
                The values provided will be viewable and editable in the Dagit playground, so be
                careful with secrets.

                If a :py:class:`ConfigMapping` object is provided, then the schema for the job's run config is
                determined by the config mapping, and the mapped output, which should be
                configuration in the standard format, is used to configure the job.

                If a :py:class:`PartitionedConfig` object is provided, then it defines a discrete set of config
                values that can parameterize the job, as well as a function for mapping those
                values to the base config. The values provided will be viewable and editable in the
                Dagit playground, so be careful with secrets.
            tags (Optional[Dict[str, Any]]):
                Arbitrary metadata for any execution of the Job.
                Values that are not strings will be json encoded and must meet the criteria that
                `json.loads(json.dumps(value)) == value`.  These tag values may be overwritten by tag
                values provided at invocation time.
            logger_defs (Optional[Dict[str, LoggerDefinition]]):
                A dictionary of string logger identifiers to their implementations.
            executor_def (Optional[ExecutorDefinition]):
                How this Job will be executed. Defaults to :py:class:`multi_or_in_process_executor`,
                which can be switched between multi-process and in-process modes of execution. The
                default mode of execution is multi-process.
            op_retry_policy (Optional[RetryPolicy]): The default retry policy for all ops in this job.
                Only used if retry policy is not defined on the op definition or op invocation.
            version_strategy (Optional[VersionStrategy]):
                Defines how each solid (and optionally, resource) in the job can be versioned. If
                provided, memoization will be enabled for this job.
            partitions_def (Optional[PartitionsDefinition]): Defines a discrete set of partition
                keys that can parameterize the job. If this argument is supplied, the config
                argument can't also be supplied.

        Returns:
            JobDefinition
        """
        from .job_definition import JobDefinition
        from .partition import PartitionedConfig, PartitionsDefinition
        from .executor_definition import ExecutorDefinition, multi_or_in_process_executor

        job_name = check_valid_name(name or self.name)

        tags = check.opt_dict_param(tags, "tags", key_type=str)
        executor_def = check.opt_inst_param(
            executor_def,
            "executor_def",
            ExecutorDefinition,
            default=multi_or_in_process_executor)

        if resource_defs and "io_manager" in resource_defs:
            resource_defs_with_defaults = resource_defs
        else:
            resource_defs_with_defaults = merge_dicts(
                {"io_manager": default_job_io_manager}, resource_defs or {})

        hooks = check.opt_set_param(hooks, "hooks", of_type=HookDefinition)
        op_retry_policy = check.opt_inst_param(op_retry_policy,
                                               "op_retry_policy", RetryPolicy)
        op_selection = check.opt_list_param(op_selection,
                                            "op_selection",
                                            of_type=str)
        presets = []
        config_mapping = None
        partitioned_config = None

        if partitions_def:
            check.inst_param(partitions_def, "partitions_def",
                             PartitionsDefinition)
            check.invariant(
                config is None,
                "Can't supply both the 'config' and 'partitions_def' arguments"
            )
            partitioned_config = PartitionedConfig(partitions_def,
                                                   lambda _: {})

        if isinstance(config, ConfigMapping):
            config_mapping = config
        elif isinstance(config, PartitionedConfig):
            partitioned_config = config
        elif isinstance(config, dict):
            presets = [PresetDefinition(name="default", run_config=config)]
            # Using config mapping here is a trick to make it so that the preset will be used even
            # when no config is supplied for the job.
            config_mapping = _config_mapping_with_default_value(
                self._get_config_schema(resource_defs_with_defaults,
                                        executor_def, logger_defs),
                config,
                job_name,
                self.name,
            )
        elif config is not None:
            check.failed(
                f"config param must be a ConfigMapping, a PartitionedConfig, or a dictionary, but "
                f"is an object of type {type(config)}")

        return JobDefinition(
            name=job_name,
            description=description or self.description,
            graph_def=self,
            mode_def=ModeDefinition(
                resource_defs=resource_defs_with_defaults,
                logger_defs=logger_defs,
                executor_defs=[executor_def],
                _config_mapping=config_mapping,
                _partitioned_config=partitioned_config,
            ),
            preset_defs=presets,
            tags=tags,
            hook_defs=hooks,
            version_strategy=version_strategy,
            op_retry_policy=op_retry_policy,
        ).get_job_def_for_op_selection(op_selection)
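
A hedged usage sketch for ``to_job``, assuming a ``@graph``-decorated object named ``my_graph``; the op name and config shape are hypothetical. Per the docstring, a plain dict becomes the job's default run config (via the preset/config-mapping trick above), and ``tags`` is normalized by ``check.opt_dict_param``:

my_job = my_graph.to_job(
    name="my_job",
    config={"ops": {"my_op": {"config": {"param": 1}}}},
    tags={"team": "data", "priority": 2},  # non-string values are json-encoded
)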
Example #11
    def execute_in_process(
        self,
        run_config: Any = None,
        instance: Optional["DagsterInstance"] = None,
        resources: Optional[Dict[str, Any]] = None,
        raise_on_error: bool = True,
        op_selection: Optional[List[str]] = None,
    ) -> "ExecuteInProcessResult":
        """
        Execute this graph in-process, collecting results in-memory.

        Args:
            run_config (Optional[Dict[str, Any]]):
                Run config to provide to execution. The configuration for the underlying graph
                should exist under the "ops" key.
            instance (Optional[DagsterInstance]):
                The instance to execute against, an ephemeral one will be used if none provided.
            resources (Optional[Dict[str, Any]]):
                The resources needed if any are required. Can provide resource instances directly,
                or resource definitions.
            raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.
                Defaults to ``True``.
            op_selection (Optional[List[str]]): A list of op selection queries (including single op
                names) to execute. For example:
                * ``['some_op']``: selects ``some_op`` itself.
                * ``['*some_op']``: select ``some_op`` and all its ancestors (upstream dependencies).
                * ``['*some_op+++']``: select ``some_op``, all its ancestors, and its descendants
                (downstream dependencies) within 3 levels down.
                * ``['*some_op', 'other_op_a', 'other_op_b+']``: select ``some_op`` and all its
                ancestors, ``other_op_a`` itself, and ``other_op_b`` and its direct child ops.

        Returns:
            :py:class:`~dagster.ExecuteInProcessResult`
        """
        from dagster.core.execution.build_resources import wrap_resources_for_execution
        from dagster.core.execution.execute_in_process import core_execute_in_process
        from dagster.core.instance import DagsterInstance
        from .job_definition import JobDefinition
        from .executor_definition import execute_in_process_executor

        instance = check.opt_inst_param(instance, "instance", DagsterInstance)
        resources = check.opt_dict_param(resources, "resources", key_type=str)

        resource_defs = wrap_resources_for_execution(resources)
        in_proc_mode = ModeDefinition(
            executor_defs=[execute_in_process_executor],
            resource_defs=resource_defs)
        ephemeral_job = JobDefinition(
            name=self._name, graph_def=self,
            mode_def=in_proc_mode).get_job_def_for_op_selection(op_selection)

        run_config = run_config if run_config is not None else {}
        op_selection = check.opt_list_param(op_selection, "op_selection", str)

        return core_execute_in_process(
            node=self,
            ephemeral_pipeline=ephemeral_job,
            run_config=run_config,
            instance=instance,
            output_capturing_enabled=True,
            raise_on_error=raise_on_error,
        )
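
And a corresponding in-process execution sketch; ``my_graph``, the op name, and the resource value are hypothetical. ``resources`` may hold plain values or ``ResourceDefinition``s, since ``wrap_resources_for_execution`` converts them:

result = my_graph.execute_in_process(
    run_config={"ops": {"my_op": {"config": {"param": 1}}}},
    resources={"api_client": object()},  # hypothetical plain-value resource
    op_selection=["my_op"],
)
assert result.success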
Example #12
def _create_lakehouse_table_def(
    name,
    lakehouse_fn,
    input_tables=None,
    other_input_defs=None,
    required_resource_keys=None,
    metadata=None,
    description=None,
):
    metadata = check.opt_dict_param(metadata, 'metadata')
    input_tables = check.opt_list_param(input_tables,
                                        'input_tables',
                                        of_type=LakehouseTableInputDefinition)
    other_input_defs = check.opt_list_param(other_input_defs,
                                            'other_input_defs',
                                            of_type=InputDefinition)
    required_resource_keys = check.opt_set_param(required_resource_keys,
                                                 'required_resource_keys',
                                                 of_type=str)

    table_type = define_python_dagster_type(python_type=ITableHandle,
                                            name=name,
                                            description=description)

    table_type_inst = table_type.inst()

    table_input_dict = {
        input_table.name: input_table
        for input_table in input_tables
    }
    input_defs = input_tables + other_input_defs
    validate_solid_fn('@solid', name, lakehouse_fn, input_defs,
                      [('context', )])

    def _compute(context, inputs):
        '''
        Workhorse function of lakehouse. The inputs are objects that inherit from ITableHandle.
        This compute_fn:
        (1) Iterates over the input tables and asks the lakehouse resource to
         hydrate their contents, or a representation of their contents
         (e.g. a pyspark dataframe), into memory for computation
        (2) Passes those into the lakehouse table function, which does the actual work
        (3) Passes the output of the lakehouse function to the lakehouse materialize function
        (4) Yields a materialization if the lakehouse function returned one


        There's an argument that the hydrate and materialize functions should return
        a stream of events but that started to feel like I was implementing what should
        be a framework feature.
        '''
        check.inst_param(context.resources.lakehouse,
                         'context.resources.lakehouse', Lakehouse)

        # hydrate tables
        hydrated_tables = {}
        other_inputs = {}
        for input_name, value in inputs.items():
            context.log.info(
                'About to hydrate table {input_name} for use in {name}'.format(
                    input_name=input_name, name=name))
            if input_name in table_input_dict:
                table_handle = value
                input_type = table_input_dict[input_name].runtime_type
                hydrated_tables[
                    input_name] = context.resources.lakehouse.hydrate(
                        context,
                        input_type,
                        table_def_of_type(context.pipeline_def,
                                          input_type.name).metadata,
                        table_handle,
                    )
            else:
                other_inputs[input_name] = value

        # call user-provided business logic which operates on the hydrated values
        # (as opposed to the handles)
        computed_output = lakehouse_fn(context, **hydrated_tables,
                                       **other_inputs)

        materialization, output_table_handle = context.resources.lakehouse.materialize(
            context, table_type_inst, metadata, computed_output)

        if materialization:
            yield materialization

        # just pass in a dummy handle for now if the materialize function
        # does not return one
        yield Output(
            output_table_handle if output_table_handle else TableHandle())

    required_resource_keys.add('lakehouse')

    return LakehouseTableDefinition(
        lakehouse_fn=lakehouse_fn,
        name=name,
        input_tables=input_tables,
        input_defs=input_defs,
        output_defs=[OutputDefinition(table_type)],
        compute_fn=_compute,
        required_resource_keys=required_resource_keys,
        metadata=metadata,
        description=description,
    )
Example #13
def execute_solid(
    solid_def,
    mode_def=None,
    input_values=None,
    environment_dict=None,
    run_config=None,
    raise_on_error=True,
):
    '''Execute a single solid in an ephemeral pipeline.

    Intended to support unit tests. Input values may be passed directly, and no pipeline need be
    specified -- an ephemeral pipeline will be constructed.

    Args:
        solid_def (SolidDefinition): The solid to execute.
        mode_def (Optional[ModeDefinition]): The mode within which to execute the solid. Use this
            if, e.g., custom resources, loggers, or executors are desired.
        input_values (Optional[Dict[str, Any]]): A dict of input names to input values, used to
            pass inputs to the solid directly. You may also use the ``environment_dict`` to
            configure any inputs that are configurable.
        environment_dict (Optional[dict]): The environment configuration that parameterizes this
            execution, as a dict.
        run_config (Optional[RunConfig]): Optionally specifies additional config options for
            pipeline execution.
        raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.
            Defaults to ``True``, since this is the most useful behavior in test.

    Returns:
        Union[CompositeSolidExecutionResult, SolidExecutionResult]: The result of executing the
        solid.
    '''
    check.inst_param(solid_def, 'solid_def', ISolidDefinition)
    check.opt_inst_param(mode_def, 'mode_def', ModeDefinition)
    input_values = check.opt_dict_param(input_values,
                                        'input_values',
                                        key_type=str)

    solid_defs = [solid_def]

    def create_value_solid(input_name, input_value):
        @lambda_solid(name=input_name)
        def input_solid():
            return input_value

        return input_solid

    dependencies = defaultdict(dict)

    for input_name, input_value in input_values.items():
        dependencies[solid_def.name][input_name] = DependencyDefinition(
            input_name)
        solid_defs.append(create_value_solid(input_name, input_value))

    result = execute_pipeline(
        PipelineDefinition(
            name='ephemeral_{}_solid_pipeline'.format(solid_def.name),
            solid_defs=solid_defs,
            dependencies=dependencies,
            mode_defs=[mode_def] if mode_def else None,
        ),
        environment_dict=environment_dict,
        run_config=run_config,
        raise_on_error=raise_on_error,
    )
    return result.result_for_handle(solid_def.name)
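
A hedged test-style usage of ``execute_solid``; the solid is hypothetical, and ``input_values`` feeds the generated input solids wired up above:

from dagster import InputDefinition, execute_solid, lambda_solid

@lambda_solid(input_defs=[InputDefinition('num')])
def add_one(num):
    return num + 1

result = execute_solid(add_one, input_values={'num': 2})
assert result.success
assert result.output_value() == 3  # assuming SolidExecutionResult.output_value()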
Example #14
    def complete(self, output):
        return CompleteCompositionContext(
            self.name, self._invocations, check.opt_dict_param(output, 'output')
        )
Example #15
    def __new__(cls, name: str, tags: Optional[Dict[object, object]] = None):
        return super(ExternalPartitionTagsData, cls).__new__(
            cls,
            name=check.str_param(name, "name"),
            tags=check.opt_dict_param(tags, "tags"),
        )
Example #16
    def __init__(
        self,
        name,
        cron_schedule,
        pipeline_name,
        environment_dict=None,
        environment_dict_fn=None,
        tags=None,
        tags_fn=None,
        solid_selection=None,
        mode="default",
        should_execute=None,
        environment_vars=None,
    ):
        check.str_param(name, 'name')
        check.str_param(cron_schedule, 'cron_schedule')
        check.str_param(pipeline_name, 'pipeline_name')
        check.opt_dict_param(environment_dict, 'environment_dict')
        check.opt_callable_param(environment_dict_fn, 'environment_dict_fn')
        check.opt_dict_param(tags, 'tags', key_type=str, value_type=str)
        check.opt_callable_param(tags_fn, 'tags_fn')
        check.opt_nullable_list_param(solid_selection,
                                      'solid_selection',
                                      of_type=str)
        mode = check.opt_str_param(mode, 'mode', DEFAULT_MODE_NAME)
        check.opt_callable_param(should_execute, 'should_execute')
        check.opt_dict_param(environment_vars,
                             'environment_vars',
                             key_type=str,
                             value_type=str)

        if environment_dict_fn and environment_dict:
            raise DagsterInvalidDefinitionError(
                'Attempted to provide both environment_dict_fn and environment_dict as arguments'
                ' to ScheduleDefinition. Must provide only one of the two.')

        if tags_fn and tags:
            raise DagsterInvalidDefinitionError(
                'Attempted to provide both tags_fn and tags as arguments'
                ' to ScheduleDefinition. Must provide only one of the two.')

        if not environment_dict and not environment_dict_fn:
            environment_dict_fn = lambda _context: {}

        if not tags and not tags_fn:
            tags_fn = lambda _context: {}

        if not should_execute:
            should_execute = lambda _context: True

        self._schedule_definition_data = ScheduleDefinitionData(
            name=check.str_param(name, 'name'),
            cron_schedule=check.str_param(cron_schedule, 'cron_schedule'),
            environment_vars=check.opt_dict_param(environment_vars,
                                                  'environment_vars'),
        )

        self._environment_dict = environment_dict
        self._environment_dict_fn = environment_dict_fn
        self._tags = tags
        self._tags_fn = tags_fn
        self._should_execute = should_execute
        self._mode = mode
        self._pipeline_name = pipeline_name
        self._solid_selection = solid_selection
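
A hedged construction sketch for this ``ScheduleDefinition`` vintage; note the mutually exclusive pairs enforced above (``environment_dict`` vs. ``environment_dict_fn``, ``tags`` vs. ``tags_fn``):

schedule = ScheduleDefinition(
    name='nightly',
    cron_schedule='0 0 * * *',
    pipeline_name='my_pipeline',  # hypothetical pipeline
    environment_dict_fn=lambda _context: {'storage': {'filesystem': {}}},
    tags={'team': 'data'},
    environment_vars={'ENV': 'prod'},  # normalized by check.opt_dict_param
)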
Example #17
def test_opt_dict_param_with_type():
    str_to_int = {'str': 1}
    assert check.opt_dict_param(str_to_int,
                                'str_to_int',
                                key_type=str,
                                value_type=int)
    assert check.opt_dict_param(str_to_int, 'str_to_int', value_type=int)
    assert check.opt_dict_param(str_to_int, 'str_to_int', key_type=str)
    assert check.opt_dict_param(str_to_int, 'str_to_int')

    assert check.opt_dict_param({}, 'str_to_int', key_type=str,
                                value_type=int) == {}
    assert check.opt_dict_param({}, 'str_to_int', value_type=int) == {}
    assert check.opt_dict_param({}, 'str_to_int', key_type=str) == {}
    assert check.opt_dict_param({}, 'str_to_int') == {}

    assert check.opt_dict_param(None,
                                'str_to_int',
                                key_type=str,
                                value_type=int) == {}
    assert check.opt_dict_param(None, 'str_to_int', value_type=int) == {}
    assert check.opt_dict_param(None, 'str_to_int', key_type=str) == {}
    assert check.opt_dict_param(None, 'str_to_int') == {}

    class Wrong(object):
        pass

    with pytest.raises(CheckError):
        assert check.opt_dict_param(str_to_int,
                                    'str_to_int',
                                    key_type=Wrong,
                                    value_type=Wrong)

    with pytest.raises(CheckError):
        assert check.opt_dict_param(str_to_int,
                                    'str_to_int',
                                    key_type=Wrong,
                                    value_type=int)

    with pytest.raises(CheckError):
        assert check.opt_dict_param(str_to_int,
                                    'str_to_int',
                                    key_type=str,
                                    value_type=Wrong)

    with pytest.raises(CheckError):
        assert check.opt_dict_param(str_to_int, 'str_to_int', key_type=Wrong)

    with pytest.raises(CheckError):
        assert check.opt_dict_param(str_to_int, 'str_to_int', value_type=Wrong)
Example #18
def execute_script_file(shell_script_path, output_logging, log, cwd=None, env=None):
    '''Execute a shell script file specified by the argument ``shell_script_path``. The script will be
    invoked via ``subprocess.Popen(['bash', shell_script_path], ...)``.

    In the Popen invocation, ``stdout=PIPE, stderr=STDOUT`` is used, and the combined stdout/stderr
    output is retrieved.

    Args:
        shell_script_path (str): The path to the shell script to execute
        output_logging (str): The logging mode to use. Supports STREAM, BUFFER, and NONE.
        log (Union[logging.Logger, DagsterLogManager]): Any logger which responds to .info()
        cwd (str, optional): Working directory for the shell command to use. Defaults to the
            temporary path where we store the shell command in a script file.
        env (Dict[str, str], optional): Environment dictionary to pass to ``subprocess.Popen``.
            Unused by default.

    Raises:
        Exception: When an invalid output_logging is selected. Unreachable from solid-based
            invocation since the config system will check output_logging against the config
            enum.

    Returns:
        Tuple[str, int]: A tuple of the combined stdout/stderr output of running the shell script
            and the script's return code.
    '''
    check.str_param(shell_script_path, 'shell_script_path')
    check.str_param(output_logging, 'output_logging')
    cwd = check.opt_str_param(cwd, 'cwd', default=os.path.dirname(shell_script_path))
    env = check.opt_dict_param(env, 'env')

    def pre_exec():
        # Restore default signal disposition and invoke setsid
        for sig in ('SIGPIPE', 'SIGXFZ', 'SIGXFSZ'):
            if hasattr(signal, sig):
                signal.signal(getattr(signal, sig), signal.SIG_DFL)
        os.setsid()

    with open(shell_script_path, 'rb') as f:
        shell_command = six.ensure_str(f.read())

    log.info('Running command:\n{command}'.format(command=shell_command))

    # pylint: disable=subprocess-popen-preexec-fn
    sub_process = Popen(
        ['bash', shell_script_path],
        stdout=PIPE,
        stderr=STDOUT,
        cwd=cwd,
        env=env,
        preexec_fn=pre_exec,
    )

    # Will return the string result of reading stdout of the shell command
    output = ''

    if output_logging not in ['STREAM', 'BUFFER', 'NONE']:
        raise Exception('Unrecognized output_logging %s' % output_logging)

    # Stream back logs as they are emitted
    if output_logging == 'STREAM':
        for raw_line in iter(sub_process.stdout.readline, b''):
            line = six.ensure_str(raw_line)
            log.info(line.rstrip())
            output += line

    sub_process.wait()

    # Collect and buffer all logs, then emit
    if output_logging == 'BUFFER':
        output = ''.join(
            [six.ensure_str(raw_line) for raw_line in iter(sub_process.stdout.readline, b'')]
        )
        log.info(output)

    # no logging in this case
    elif output_logging == 'NONE':
        pass

    log.info('Command exited with return code {retcode}'.format(retcode=sub_process.returncode))

    return output, sub_process.returncode
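A usage sketch for ``execute_script_file`` (POSIX only, since ``pre_exec`` calls ``os.setsid``); the script contents and logger name are hypothetical:

import logging
import tempfile

logging.basicConfig(level=logging.INFO)
log = logging.getLogger('shell')

with tempfile.NamedTemporaryFile('w', suffix='.sh', delete=False) as f:
    f.write('echo "hello"\n')
    script_path = f.name

# BUFFER collects all output and logs it once; STREAM would log line by line.
output, retcode = execute_script_file(script_path, output_logging='BUFFER', log=log)
assert retcode == 0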
Example #19
File: api.py Project: zkan/dagster
def execute_pipeline(pipeline,
                     environment_dict=None,
                     run_config=None,
                     instance=None,
                     raise_on_error=True):
    '''Execute a pipeline synchronously.

    Users will typically call this API when testing pipeline execution, or running standalone
    scripts.

    Parameters:
        pipeline (PipelineDefinition): The pipeline to execute.
        environment_dict (Optional[dict]): The environment configuration that parameterizes this run,
            as a dict.
        run_config (Optional[RunConfig]): Optionally specifies additional config options for
            pipeline execution.
        instance (Optional[DagsterInstance]): The instance to execute against. If this is ``None``,
            an ephemeral instance will be used, and no artifacts will be persisted from the run.
        raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.
            Defaults to ``True``, since this is the most useful behavior in tests.

    Returns:
        :py:class:`PipelineExecutionResult`: The result of pipeline execution.

    For the asynchronous version, see :py:func:`execute_pipeline_iterator`.

    This is the entrypoint for dagster CLI execution. For the dagster-graphql entrypoint, see
    ``dagster.core.execution.api.execute_plan()``.
    '''

    check.inst_param(pipeline, 'pipeline', PipelineDefinition)
    environment_dict = check.opt_dict_param(environment_dict,
                                            'environment_dict')
    run_config = check_run_config_param(run_config, pipeline)

    check.opt_inst_param(instance, 'instance', DagsterInstance)
    instance = instance or DagsterInstance.ephemeral()

    execution_plan = create_execution_plan(pipeline, environment_dict,
                                           run_config)

    pipeline_run = _create_run(instance, pipeline, run_config,
                               environment_dict)

    initialization_manager = pipeline_initialization_manager(
        pipeline,
        environment_dict,
        pipeline_run,
        instance,
        execution_plan,
        raise_on_error=raise_on_error,
    )
    event_list = list(initialization_manager.generate_setup_events())
    pipeline_context = initialization_manager.get_object()
    if pipeline_context:
        event_list.extend(
            _pipeline_execution_iterator(pipeline_context, execution_plan,
                                         pipeline_run))
    event_list.extend(initialization_manager.generate_teardown_events())
    return PipelineExecutionResult(
        pipeline,
        run_config.run_id,
        event_list,
        lambda: scoped_pipeline_context(
            pipeline,
            environment_dict,
            pipeline_run,
            instance,
            execution_plan,
            system_storage_data=SystemStorageData(
                intermediates_manager=pipeline_context.intermediates_manager,
                file_manager=pipeline_context.file_manager,
            ),
        ),
    )
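A minimal calling sketch, assuming the ``@solid``/``@pipeline`` decorators from the same dagster era; the definitions are hypothetical:

from dagster import pipeline, solid

@solid
def say_hello(context):
    context.log.info('hello')

@pipeline
def hello_pipeline():
    say_hello()

# instance=None, so an ephemeral instance is used and nothing is persisted.
result = execute_pipeline(hello_pipeline, environment_dict={})
assert result.success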
Example #20
    def __init__(
        self,
        service_account_name,
        instance_config_map,
        postgres_password_secret=None,
        dagster_home=None,
        job_image=None,
        image_pull_policy=None,
        image_pull_secrets=None,
        load_incluster_config=True,
        kubeconfig_file=None,
        inst_data=None,
        job_namespace="default",
        env_config_maps=None,
        env_secrets=None,
        env_vars=None,
        k8s_client_batch_api=None,
        volume_mounts=None,
        volumes=None,
        labels=None,
        fail_pod_on_run_failure=None,
    ):
        self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)
        self.job_namespace = check.str_param(job_namespace, "job_namespace")

        self.load_incluster_config = load_incluster_config
        self.kubeconfig_file = kubeconfig_file
        if load_incluster_config:
            check.invariant(
                kubeconfig_file is None,
                "`kubeconfig_file` is set but `load_incluster_config` is True.",
            )
            kubernetes.config.load_incluster_config()
        else:
            check.opt_str_param(kubeconfig_file, "kubeconfig_file")
            kubernetes.config.load_kube_config(kubeconfig_file)

        self._fixed_batch_api = k8s_client_batch_api

        self._job_config = None
        self._job_image = check.opt_str_param(job_image, "job_image")
        self.dagster_home = check.str_param(dagster_home, "dagster_home")
        self._image_pull_policy = check.opt_str_param(
            image_pull_policy, "image_pull_policy", "IfNotPresent"
        )
        self._image_pull_secrets = check.opt_list_param(
            image_pull_secrets, "image_pull_secrets", of_type=dict
        )
        self._service_account_name = check.str_param(service_account_name, "service_account_name")
        self.instance_config_map = check.str_param(instance_config_map, "instance_config_map")
        self.postgres_password_secret = check.opt_str_param(
            postgres_password_secret, "postgres_password_secret"
        )
        self._env_config_maps = check.opt_list_param(
            env_config_maps, "env_config_maps", of_type=str
        )
        self._env_secrets = check.opt_list_param(env_secrets, "env_secrets", of_type=str)
        self._env_vars = check.opt_list_param(env_vars, "env_vars", of_type=str)
        self._volume_mounts = check.opt_list_param(volume_mounts, "volume_mounts")
        self._volumes = check.opt_list_param(volumes, "volumes")
        self._labels = check.opt_dict_param(labels, "labels", key_type=str, value_type=str)
        self._fail_pod_on_run_failure = check.opt_bool_param(
            fail_pod_on_run_failure, "fail_pod_on_run_failure"
        )

        super().__init__()
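A hedged construction sketch, assuming this ``__init__`` belongs to dagster-k8s's ``K8sRunLauncher`` (the class name is not shown in the excerpt); it loads Kubernetes config at construction time, so a kubeconfig or in-cluster environment must actually exist. All values are hypothetical:

launcher = K8sRunLauncher(  # class name assumed from the signature above
    service_account_name='dagster',
    instance_config_map='dagster-instance',
    dagster_home='/opt/dagster/dagster_home',
    job_image='my-registry/dagster:latest',
    load_incluster_config=False,
    kubeconfig_file='/home/user/.kube/config',
    env_config_maps=['dagster-pipeline-env'],
    labels={'team': 'data'},
)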
Example #21
def test_opt_dict_param():
    assert check.opt_dict_param(None, "opt_dict_param") == {}
    assert check.opt_dict_param({}, "opt_dict_param") == {}
    assert check.opt_dict_param(frozendict(), "opt_dict_param") == {}
    ddict = {"a": 2}
    assert check.opt_dict_param(ddict, "opt_dict_param") == ddict

    with pytest.raises(ParameterCheckError):
        check.opt_dict_param(0, "opt_dict_param")

    with pytest.raises(ParameterCheckError):
        check.opt_dict_param(1, "opt_dict_param")

    with pytest.raises(ParameterCheckError):
        check.opt_dict_param("foo", "opt_dict_param")

    with pytest.raises(ParameterCheckError):
        check.opt_dict_param(["foo"], "opt_dict_param")

    with pytest.raises(ParameterCheckError):
        check.opt_dict_param([], "opt_dict_param")
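Not dagster's implementation, just a sketch of the contract the tests above pin down: ``None`` coerces to ``{}``, non-dicts raise, and ``key_type``/``value_type`` are enforced when given (``CheckError`` and ``ParameterCheckError`` are the check exception types the tests import):

def opt_dict_param_sketch(obj, param_name, key_type=None, value_type=None):
    # None is treated as "not provided" and coerces to an empty dict.
    if obj is None:
        return {}
    if not isinstance(obj, dict):
        raise ParameterCheckError(
            'Param "{name}" is not a dict. Got {obj!r}.'.format(name=param_name, obj=obj))
    for key, value in obj.items():
        if key_type is not None and not isinstance(key, key_type):
            raise CheckError('Key {k!r} is not of type {t}.'.format(k=key, t=key_type))
        if value_type is not None and not isinstance(value, value_type):
            raise CheckError('Value {v!r} is not of type {t}.'.format(v=value, t=value_type))
    return obj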
Example #22
    def __init__(self, cluster_type, cluster_configuration):
        self.cluster_type = check.opt_str_param(cluster_type,
                                                'cluster_type',
                                                default='local')
        self.cluster_configuration = check.opt_dict_param(
            cluster_configuration, 'cluster_configuration')
Example #23
def minute_schedule(
    pipeline_name,
    start_date,
    cron_schedule="* * * * *",
    name=None,
    tags_fn_for_date=None,
    solid_selection=None,
    should_execute=None,
    environment_vars=None,
    end_date=None,
    execution_timezone=None,
):
    """Create a schedule that runs every minute.
    The decorated function will be called as the ``run_config_fn`` of the underlying
    `ScheduleDefinition` and should take a `ScheduleExecutionContext` as its only argument, 
    returning the environment dict for the scheduled execution.
    Args:
        pipeline_name (str): The name of the pipeline to execute when the schedule runs.
        start_date (datetime.datetime): The date from which to run the schedule.
        name (Optional[str]): The name of the schedule to create. By default, this will be the name
            of the decorated function.
        tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A
            function that generates tags to attach to the schedules runs. Takes the date of the
            schedule run and returns a dictionary of tags (string key-value pairs).
        solid_selection (Optional[List[str]]): A list of solid subselection (including single
            solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``
        should_execute (Optional[Callable[ScheduleExecutionContext, bool]]): A function that runs at
            schedule execution time to determine whether a schedule should execute or skip. Takes a
            :py:class:`~dagster.ScheduleExecutionContext` and returns a boolean (``True`` if the
            schedule should execute). Defaults to a function that always returns ``True``.
        environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing
            the schedule.
        end_date (Optional[datetime.datetime]): The last time to run the schedule. Defaults to
            the current time.
        execution_timezone (Optional[str]): Timezone in which the schedule should run. Only works
            with DagsterDaemonScheduler, and must be set when using that scheduler.
    """
    check.opt_str_param(name, "name")
    check.inst_param(start_date, "start_date", datetime.datetime)
    check.opt_inst_param(end_date, "end_date", datetime.datetime)
    check.opt_callable_param(tags_fn_for_date, "tags_fn_for_date")
    check.opt_nullable_list_param(solid_selection,
                                  "solid_selection",
                                  of_type=str)
    mode = "default"
    check.opt_callable_param(should_execute, "should_execute")
    check.opt_dict_param(environment_vars,
                         "environment_vars",
                         key_type=str,
                         value_type=str)
    check.str_param(pipeline_name, "pipeline_name")
    check.opt_str_param(execution_timezone, "execution_timezone")

    if start_date.second != 0:
        warnings.warn(
            "`start_date` must be at the beginning of the minute for a per minute schedule. "
        )

    fmt = (DEFAULT_HOURLY_FORMAT_WITH_TIMEZONE
           if execution_timezone else DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE)

    execution_time_to_partition_fn = lambda d: pendulum.instance(d).subtract(
        minutes=1)

    partition_fn = schedule_partition_range(
        start_date,
        end=end_date,
        cron_schedule=cron_schedule,
        fmt=fmt,
        timezone=execution_timezone,
        execution_time_to_partition_fn=execution_time_to_partition_fn,
    )

    def inner(fn):
        check.callable_param(fn, "fn")

        schedule_name = name or fn.__name__

        tags_fn_for_partition_value = lambda partition: {}
        if tags_fn_for_date:
            tags_fn_for_partition_value = lambda partition: tags_fn_for_date(
                partition.value)

        partition_set = PartitionSetDefinition(
            name="{}_partitions".format(schedule_name),
            pipeline_name=pipeline_name,
            partition_fn=partition_fn,
            run_config_fn_for_partition=lambda partition: fn(partition.value),
            solid_selection=solid_selection,
            tags_fn_for_partition=tags_fn_for_partition_value,
            mode=mode,
        )

        return partition_set.create_schedule_definition(
            schedule_name,
            cron_schedule,
            should_execute=should_execute,
            environment_vars=environment_vars,
            partition_selector=create_default_partition_selector_fn(
                delta_fn=execution_time_to_partition_fn,
                fmt=fmt,
            ),
            execution_timezone=execution_timezone,
        )

    return inner
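A decorator usage sketch (hypothetical pipeline and config); per the partition wiring above, the wrapped function receives the partition's datetime value:

import datetime

@minute_schedule(
    pipeline_name='my_pipeline',
    start_date=datetime.datetime(2020, 1, 1),
)
def my_minute_schedule(date):
    # `date` is the partition value for the minute being filled in.
    return {'solids': {'process': {'config': {'ts': date.strftime('%Y-%m-%d %H:%M')}}}}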
Example #24
    def create_schedule_definition(
        self,
        schedule_name,
        cron_schedule,
        should_execute=None,
        partition_selector=last_partition,
        environment_vars=None,
        execution_timezone=None,
    ):
        """Create a ScheduleDefinition from a PartitionSetDefinition.

        Arguments:
            schedule_name (str): The name of the schedule.
            cron_schedule (str): A valid cron string for the schedule.
            should_execute (Optional[function]): Function that runs at schedule execution time that
                determines whether a schedule should execute. Defaults to a function that always
                returns ``True``.
            partition_selector (Callable[[ScheduleExecutionContext, PartitionSetDefinition],
                Partition]): A partition selector for the schedule.
            environment_vars (Optional[dict]): The environment variables to set for the schedule.
            execution_timezone (Optional[str]): Timezone in which the schedule should run. Only works
                with DagsterDaemonScheduler, and must be set when using that scheduler.

        Returns:
            ScheduleDefinition: The generated ScheduleDefinition for the partition selector
        """

        check.str_param(schedule_name, "schedule_name")
        check.str_param(cron_schedule, "cron_schedule")
        check.opt_callable_param(should_execute, "should_execute")
        check.opt_dict_param(environment_vars,
                             "environment_vars",
                             key_type=str,
                             value_type=str)
        check.callable_param(partition_selector, "partition_selector")
        check.opt_str_param(execution_timezone, "execution_timezone")

        def _should_execute_wrapper(context):
            check.inst_param(context, "context", ScheduleExecutionContext)
            selected_partition = partition_selector(context, self)

            if (not selected_partition or
                    selected_partition.name not in self.get_partition_names()):
                return False
            elif not should_execute:
                return True
            else:
                return should_execute(context)

        def _run_config_fn_wrapper(context):
            check.inst_param(context, "context", ScheduleExecutionContext)
            selected_partition = partition_selector(context, self)
            if (not selected_partition or
                    selected_partition.name not in self.get_partition_names()):
                raise DagsterInvariantViolationError(
                    "The partition selection function `{selector}` did not return "
                    "a partition from PartitionSet {partition_set}".format(
                        selector=getattr(partition_selector, "__name__",
                                         repr(partition_selector)),
                        partition_set=self.name,
                    ))

            return self.run_config_for_partition(selected_partition)

        def _tags_fn_wrapper(context):
            check.inst_param(context, "context", ScheduleExecutionContext)
            selected_partition = partition_selector(context, self)
            if not selected_partition:
                raise DagsterInvariantViolationError(
                    "The partition selection function `{selector}` did not return "
                    "a partition from PartitionSet {partition_set}".format(
                        selector=getattr(partition_selector, "__name__",
                                         repr(partition_selector)),
                        partition_set=self.name,
                    ))

            return self.tags_for_partition(selected_partition)

        return PartitionScheduleDefinition(
            name=schedule_name,
            cron_schedule=cron_schedule,
            pipeline_name=self.pipeline_name,
            run_config_fn=_run_config_fn_wrapper,
            tags_fn=_tags_fn_wrapper,
            solid_selection=self.solid_selection,
            mode=self.mode,
            should_execute=_should_execute_wrapper,
            environment_vars=environment_vars,
            partition_set=self,
            execution_timezone=execution_timezone,
        )
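A sketch of driving this method from a hand-built partition set; the names are hypothetical, and ``Partition`` is assumed to be dagster's partition wrapper:

from dagster import Partition, PartitionSetDefinition  # assumed import path

partition_set = PartitionSetDefinition(
    name='date_partitions',
    pipeline_name='my_pipeline',
    partition_fn=lambda: [Partition('2020-01-01'), Partition('2020-01-02')],
    run_config_fn_for_partition=lambda partition: {'date': partition.value},
)

# partition_selector defaults to last_partition, per the signature above.
schedule = partition_set.create_schedule_definition(
    schedule_name='daily_date_schedule',
    cron_schedule='0 0 * * *',
)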
Example #25
def hourly_schedule(
    pipeline_name,
    start_date,
    name=None,
    execution_time=datetime.time(0, 0),
    tags_fn_for_date=None,
    solid_selection=None,
    mode="default",
    should_execute=None,
    environment_vars=None,
    end_date=None,
):
    '''Create a schedule that runs hourly.

    The decorated function should accept a datetime object as its only argument. The datetime
    represents the date partition that the run is for, and the function should return the
    environment dict for the scheduled execution.

    Args:
        pipeline_name (str): The name of the pipeline to execute when the schedule runs.
        start_date (datetime.datetime): The date from which to run the schedule.
        name (Optional[str]): The name of the schedule to create. By default, this will be the name
            of the decorated function.
        execution_time (datetime.time): The time at which to execute the schedule. Only the minutes
            component will be respected -- the hour should be 0, and will be ignored if it is not 0.
        tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A
            function that generates tags to attach to the schedules runs. Takes the date of the
            schedule run and returns a dictionary of tags (string key-value pairs).
        solid_selection (Optional[List[str]]): A list of solid subselection (including single
            solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``
        mode (Optional[str]): The pipeline mode in which to execute this schedule.
            (Default: 'default')
        should_execute (Optional[Callable[ScheduleExecutionContext, bool]]): A function that runs at
            schedule execution time to determine whether a schedule should execute or skip. Takes a
            :py:class:`~dagster.ScheduleExecutionContext` and returns a boolean (``True`` if the
            schedule should execute). Defaults to a function that always returns ``True``.
        environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing
            the schedule.
        end_date (Optional[datetime.datetime]): The last time to run the schedule. Defaults to
            the current time.
    '''
    check.opt_str_param(name, 'name')
    check.inst_param(start_date, 'start_date', datetime.datetime)
    check.opt_inst_param(end_date, 'end_date', datetime.datetime)
    check.opt_callable_param(tags_fn_for_date, 'tags_fn_for_date')
    check.opt_nullable_list_param(solid_selection,
                                  'solid_selection',
                                  of_type=str)
    mode = check.opt_str_param(mode, 'mode', DEFAULT_MODE_NAME)
    check.opt_callable_param(should_execute, 'should_execute')
    check.opt_dict_param(environment_vars,
                         'environment_vars',
                         key_type=str,
                         value_type=str)
    check.str_param(pipeline_name, 'pipeline_name')
    check.inst_param(execution_time, 'execution_time', datetime.time)

    if execution_time.hour != 0:
        warnings.warn(
            "Hourly schedule {schedule_name} created with:\n"
            "\tschedule_time=datetime.time(hour={hour}, minute={minute}, ...). "
            "Since this is an hourly schedule, the hour parameter will be ignored and the schedule "
            "will run on the {minute} mark for the previous hour interval. Replace "
            "datetime.time(hour={hour}, minute={minute}, ...) with "
            "datetime.time(minute={minute}, ...) to fix this warning.".format(
                schedule_name=name, hour=execution_time.hour, minute=execution_time.minute))

    cron_schedule = '{minute} * * * *'.format(minute=execution_time.minute)

    partition_fn = date_partition_range(start_date,
                                        end=end_date,
                                        delta=datetime.timedelta(hours=1),
                                        fmt="%Y-%m-%d-%H:%M")

    def inner(fn):
        check.callable_param(fn, 'fn')

        schedule_name = name or fn.__name__

        tags_fn_for_partition_value = lambda partition: {}
        if tags_fn_for_date:
            tags_fn_for_partition_value = lambda partition: tags_fn_for_date(
                partition.value)

        partition_set = PartitionSetDefinition(
            name='{}_partitions'.format(schedule_name),
            pipeline_name=pipeline_name,
            partition_fn=partition_fn,
            run_config_fn_for_partition=lambda partition: fn(partition.value),
            solid_selection=solid_selection,
            tags_fn_for_partition=tags_fn_for_partition_value,
            mode=mode,
        )

        return partition_set.create_schedule_definition(
            schedule_name,
            cron_schedule,
            should_execute=should_execute,
            environment_vars=environment_vars,
        )

    return inner
Example #26
def create_execution_plan(pipeline, environment_dict=None):
    check.inst_param(pipeline, 'pipeline', PipelineDefinition)
    environment_dict = check.opt_dict_param(environment_dict, 'environment_dict', key_type=str)
    environment_config = create_environment_config(pipeline, environment_dict)
    return create_execution_plan_core(pipeline, environment_config)
Example #27
def make_dagster_pipeline_from_airflow_dag(dag,
                                           tags=None,
                                           use_airflow_template_context=False,
                                           unique_id=None):
    """Construct a Dagster pipeline corresponding to a given Airflow DAG.

    Tasks in the resulting pipeline will execute the ``execute()`` method on the corresponding
    Airflow Operator. Dagster, any dependencies required by Airflow Operators, and the module
    containing your DAG definition must be available in the Python environment within which your
    Dagster solids execute.

    To set Airflow's ``execution_date`` for use with Airflow Operator's ``execute()`` methods,
    either:

    1. (Best for ad hoc runs) Run Pipeline with 'default' preset, which sets execution_date to the
        time (in UTC) of pipeline invocation:

        .. code-block:: python

            execute_pipeline(
                pipeline=make_dagster_pipeline_from_airflow_dag(dag=dag),
                preset='default')

    2. Add ``{'airflow_execution_date': utc_date_string}`` to the PipelineDefinition tags. This will
       override behavior from (1).

        .. code-block:: python

            execute_pipeline(
                make_dagster_pipeline_from_airflow_dag(
                    dag=dag,
                    tags={'airflow_execution_date': utc_execution_date_str}
                )
            )

    3. (Recommended) Add ``{'airflow_execution_date': utc_date_string}`` to the PipelineRun tags,
        such as in the Dagit UI. This will override behavior from (1) and (2).

    We apply normalized_name() to the dag id and task ids when generating pipeline name and solid
    names to ensure that names conform to Dagster's naming conventions.

    Args:
        dag (DAG): The Airflow DAG to compile into a Dagster pipeline
        tags (Dict[str, Field]): Pipeline tags. Optionally include
            `tags={'airflow_execution_date': utc_date_string}` to specify execution_date used within
            execution of Airflow Operators.
        use_airflow_template_context (bool): If True, will call get_template_context() on the
            Airflow TaskInstance model which requires and modifies the DagRun table.
            (default: False)
        unique_id (int): If not None, this id will be appended to generated solid names. Used by
            framework authors to enforce unique solid names within a repo.

    Returns:
        pipeline_def (PipelineDefinition): The generated Dagster pipeline

    """
    check.inst_param(dag, "dag", DAG)
    tags = check.opt_dict_param(tags, "tags")
    check.bool_param(use_airflow_template_context,
                     "use_airflow_template_context")
    unique_id = check.opt_int_param(unique_id, "unique_id")

    if IS_AIRFLOW_INGEST_PIPELINE_STR not in tags:
        tags[IS_AIRFLOW_INGEST_PIPELINE_STR] = "true"

    tags = validate_tags(tags)

    pipeline_dependencies, solid_defs = _get_pipeline_definition_args(
        dag, use_airflow_template_context, unique_id)
    pipeline_def = PipelineDefinition(
        name=normalized_name(dag.dag_id, None),
        solid_defs=solid_defs,
        dependencies=pipeline_dependencies,
        tags=tags,
    )
    return pipeline_def
Example #28
    def __init__(
        self,
        task_id,
        environment_dict=None,
        pipeline_name=None,
        mode=None,
        step_keys=None,
        dag=None,
        instance_ref=None,
        *args,
        **kwargs
    ):
        check.str_param(pipeline_name, 'pipeline_name')
        step_keys = check.opt_list_param(step_keys, 'step_keys', of_type=str)
        environment_dict = check.opt_dict_param(environment_dict, 'environment_dict', key_type=str)
        check.opt_inst_param(instance_ref, 'instance_ref', InstanceRef)

        kwargs['name'] = 'dagster.{pipeline_name}.{task_id}'.format(
            pipeline_name=pipeline_name, task_id=task_id
        ).replace(
            '_', '-'  # underscores are not permissible in DNS names
        )

        if 'storage' not in environment_dict:
            raise AirflowException(
                'No storage config found -- must configure either filesystem or s3 storage for '
                'the DagsterKubernetesPodOperator. Ex.: \n'
                'storage:\n'
                '  filesystem:\n'
                '    base_dir: \'/some/shared/volume/mount/special_place\''
                '\n\n --or--\n\n'
                'storage:\n'
                '  s3:\n'
                '    s3_bucket: \'my-s3-bucket\'\n'
            )

        check.invariant(
            'in_memory' not in environment_dict.get('storage', {}),
            'Cannot use in-memory storage with Airflow, must use S3',
        )

        self.environment_dict = environment_dict
        self.pipeline_name = pipeline_name
        self.mode = mode
        self.step_keys = step_keys
        self._run_id = None
        # self.instance might be None in, for instance, a unit test setting where the operator
        # was being directly instantiated without passing through make_airflow_dag
        self.instance = DagsterInstance.from_ref(instance_ref) if instance_ref else None

        # Store Airflow DAG run timestamp so that we can pass along via execution metadata
        self.airflow_ts = kwargs.get('ts')

        # Add AWS creds
        self.env_vars = kwargs.get('env_vars', {})
        for k, v in get_aws_environment().items():
            self.env_vars.setdefault(k, v)

        kwargs.setdefault('labels', {})
        kwargs['labels'].setdefault('dagster_pipeline', self.pipeline_name)
        kwargs['labels'].setdefault('app.kubernetes.io/name', 'dagster')
        kwargs['labels'].setdefault('app.kubernetes.io/instance', self.pipeline_name)
        kwargs['labels'].setdefault('app.kubernetes.io/version', dagster_version)
        kwargs['labels'].setdefault('app.kubernetes.io/component', 'pipeline-execution')
        kwargs['labels'].setdefault('app.kubernetes.io/part-of', 'dagster-airflow')
        kwargs['labels'].setdefault('app.kubernetes.io/managed-by', 'dagster-airflow')

        # The xcom mechanism for the pod operator is very unlike that of the Docker operator, so
        # we disable it
        if 'xcom_push' in kwargs:
            self.log.warning(
                'xcom_push cannot be enabled with the DagsterKubernetesPodOperator, disabling'
            )
        kwargs['xcom_push'] = False

        super(DagsterKubernetesPodOperator, self).__init__(
            task_id=task_id, dag=dag, *args, **kwargs
        )
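A hedged wiring sketch; this operator is normally produced by dagster-airflow's factory helpers rather than constructed by hand, and the names here are hypothetical. Extra kwargs such as ``image`` flow through to the underlying Kubernetes pod operator:

import datetime

from airflow import DAG

dag = DAG(dag_id='dagster_ingest',
          default_args={'start_date': datetime.datetime(2020, 1, 1)})

op = DagsterKubernetesPodOperator(
    task_id='ingest_step',
    dag=dag,
    pipeline_name='my_pipeline',
    step_keys=['ingest.compute'],
    environment_dict={'storage': {'s3': {'s3_bucket': 'my-s3-bucket'}}},
    image='my-registry/dagster:latest',  # consumed by the base pod operator
    namespace='default',
)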
Example #29
def hourly_schedule(
    pipeline_name: str,
    start_date: datetime.datetime,
    name: Optional[str] = None,
    execution_time: datetime.time = datetime.time(0, 0),
    tags_fn_for_date: Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]] = None,
    solid_selection: Optional[List[str]] = None,
    mode: Optional[str] = "default",
    should_execute: Optional[Callable[["ScheduleExecutionContext"], bool]] = None,
    environment_vars: Optional[Dict[str, str]] = None,
    end_date: Optional[datetime.datetime] = None,
    execution_timezone: Optional[str] = None,
    partition_hours_offset: Optional[int] = 1,
    description: Optional[str] = None,
) -> Callable[[Callable[[datetime.datetime], Dict[str, Any]]], PartitionScheduleDefinition]:
    """Create a partitioned schedule that runs hourly.

    The decorated function should accept a datetime object as its only argument. The datetime
    represents the date partition that it's meant to run on.

    The decorated function should return a run configuration dictionary, which will be used as
    configuration for the scheduled run.

    The decorator produces a :py:class:`~dagster.PartitionScheduleDefinition`.

    Args:
        pipeline_name (str): The name of the pipeline to execute when the schedule runs.
        start_date (datetime.datetime): The date from which to run the schedule.
        name (Optional[str]): The name of the schedule to create. By default, this will be the name
            of the decorated function.
        execution_time (datetime.time): The time at which to execute the schedule. Only the minutes
            component will be respected -- the hour should be 0, and will be ignored if it is not 0.
        tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A
            function that generates tags to attach to the schedules runs. Takes the date of the
            schedule run and returns a dictionary of tags (string key-value pairs).
        solid_selection (Optional[List[str]]): A list of solid subselection (including single
            solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``
        mode (Optional[str]): The pipeline mode in which to execute this schedule.
            (Default: 'default')
        should_execute (Optional[Callable[ScheduleExecutionContext, bool]]): A function that runs at
            schedule execution time to determine whether a schedule should execute or skip. Takes a
            :py:class:`~dagster.ScheduleExecutionContext` and returns a boolean (``True`` if the
            schedule should execute). Defaults to a function that always returns ``True``.
        environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing
            the schedule.
        end_date (Optional[datetime.datetime]): The last time to run the schedule. Defaults to
            the current time.
        execution_timezone (Optional[str]): Timezone in which the schedule should run. Only works
            with DagsterDaemonScheduler, and must be set when using that scheduler.
        partition_hours_offset (Optional[int]): How many hours back to go when choosing the partition
            for a given schedule execution. For example, when partition_hours_offset=1, the schedule
            that executes during hour N will fill in the partition for hour N-1.
            (Default: 1)
        description (Optional[str]): A human-readable description of the schedule.
    """
    check.opt_str_param(name, "name")
    check.inst_param(start_date, "start_date", datetime.datetime)
    check.opt_inst_param(end_date, "end_date", datetime.datetime)
    check.opt_callable_param(tags_fn_for_date, "tags_fn_for_date")
    check.opt_nullable_list_param(solid_selection, "solid_selection", of_type=str)
    mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)
    check.opt_callable_param(should_execute, "should_execute")
    check.opt_dict_param(environment_vars, "environment_vars", key_type=str, value_type=str)
    check.str_param(pipeline_name, "pipeline_name")
    check.inst_param(execution_time, "execution_time", datetime.time)
    check.opt_str_param(execution_timezone, "execution_timezone")
    check.opt_int_param(partition_hours_offset, "partition_hours_offset")
    check.opt_str_param(description, "description")

    if start_date.minute != 0 or start_date.second != 0:
        warnings.warn(
            "`start_date` must be at the beginning of the hour for an hourly schedule. "
            "Use `execution_time` to execute the schedule at a specific time within the hour. For "
            "example, to run the schedule each hour at 15 minutes past the hour starting at 3AM "
            "on 10/20/2020, your schedule definition would look like:"
            """
@hourly_schedule(
    start_date=datetime.datetime(2020, 10, 20, 3),
    execution_time=datetime.time(0, 15)
)
def my_schedule_definition(_):
    ...
"""
        )

    if execution_time.hour != 0:
        warnings.warn(
            "Hourly schedule {schedule_name} created with:\n"
            "\tschedule_time=datetime.time(hour={hour}, minute={minute}, ...). "
            "Since this is an hourly schedule, the hour parameter will be ignored and the schedule "
            "will run on the {minute} mark for the previous hour interval. Replace "
            "datetime.time(hour={hour}, minute={minute}, ...) with "
            "datetime.time(minute={minute}, ...) to fix this warning.".format(
                schedule_name=name, hour=execution_time.hour, minute=execution_time.minute
            )
        )

    cron_schedule = "{minute} * * * *".format(minute=execution_time.minute)

    fmt = (
        DEFAULT_HOURLY_FORMAT_WITH_TIMEZONE
        if execution_timezone
        else DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE
    )

    execution_time_to_partition_fn = lambda d: pendulum.instance(d).subtract(
        hours=partition_hours_offset, minutes=(execution_time.minute - start_date.minute) % 60
    )

    partition_fn = schedule_partition_range(
        start_date,
        end=end_date,
        cron_schedule=cron_schedule,
        fmt=fmt,
        timezone=execution_timezone,
        execution_time_to_partition_fn=execution_time_to_partition_fn,
        inclusive=(partition_hours_offset == 0),
    )

    def inner(fn: Callable[[datetime.datetime], Dict[str, Any]]) -> PartitionScheduleDefinition:
        check.callable_param(fn, "fn")

        schedule_name = name or fn.__name__

        tags_fn_for_partition_value: Callable[
            ["Partition"], Optional[Dict[str, str]]
        ] = lambda partition: {}
        if tags_fn_for_date:
            tags_fn = cast(
                Callable[[datetime.datetime], Optional[Dict[str, str]]], tags_fn_for_date
            )
            tags_fn_for_partition_value = lambda partition: tags_fn(partition.value)

        partition_set = PartitionSetDefinition(
            name="{}_partitions".format(schedule_name),
            pipeline_name=pipeline_name,
            partition_fn=partition_fn,
            run_config_fn_for_partition=lambda partition: fn(partition.value),
            solid_selection=solid_selection,
            tags_fn_for_partition=tags_fn_for_partition_value,
            mode=mode,
        )

        return partition_set.create_schedule_definition(
            schedule_name,
            cron_schedule,
            should_execute=should_execute,
            environment_vars=environment_vars,
            partition_selector=create_offset_partition_selector(
                execution_time_to_partition_fn=execution_time_to_partition_fn,
            ),
            execution_timezone=execution_timezone,
            description=description,
        )

    return inner
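A usage sketch for this typed variant (hypothetical names); with the default ``partition_hours_offset=1``, the run executing during hour N fills in the partition for hour N-1:

import datetime
from typing import Any, Dict

@hourly_schedule(
    pipeline_name='my_pipeline',
    start_date=datetime.datetime(2020, 10, 20, 3),
    execution_time=datetime.time(0, 15),  # 15 minutes past each hour
)
def my_hourly_schedule(date: datetime.datetime) -> Dict[str, Any]:
    return {'solids': {'process': {'config': {'hour': date.strftime('%Y-%m-%d-%H:%M')}}}}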
Example #30
def daily_schedule(
    pipeline_name,
    start_date,
    name=None,
    execution_time=datetime.time(0, 0),
    tags_fn_for_date=None,
    solid_selection=None,
    mode="default",
    should_execute=None,
    environment_vars=None,
    end_date=None,
):
    """Create a schedule that runs daily.

    The decorated function should accept a datetime object as its only argument. The datetime
    represents the date partition that the run is for, and the function should return the
    environment dict for the scheduled execution.

    Args:
        pipeline_name (str): The name of the pipeline to execute when the schedule runs.
        start_date (datetime.datetime): The date from which to run the schedule.
        name (Optional[str]): The name of the schedule to create.
        execution_time (datetime.time): The time at which to execute the schedule.
        tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A
            function that generates tags to attach to the schedules runs. Takes the date of the
            schedule run and returns a dictionary of tags (string key-value pairs).
        solid_selection (Optional[List[str]]): A list of solid subselection (including single
            solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``
        mode (Optional[str]): The pipeline mode in which to execute this schedule.
            (Default: 'default')
        should_execute (Optional[Callable[ScheduleExecutionContext, bool]]): A function that runs at
            schedule execution time to determine whether a schedule should execute or skip. Takes a
            :py:class:`~dagster.ScheduleExecutionContext` and returns a boolean (``True`` if the
            schedule should execute). Defaults to a function that always returns ``True``.
        environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing
            the schedule.
        end_date (Optional[datetime.datetime]): The last time to run the schedule. Defaults to
            the current time.
    """
    check.opt_str_param(name, "name")
    check.inst_param(start_date, "start_date", datetime.datetime)
    check.opt_inst_param(end_date, "end_date", datetime.datetime)
    check.opt_callable_param(tags_fn_for_date, "tags_fn_for_date")
    check.opt_nullable_list_param(solid_selection,
                                  "solid_selection",
                                  of_type=str)
    mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)
    check.opt_callable_param(should_execute, "should_execute")
    check.opt_dict_param(environment_vars,
                         "environment_vars",
                         key_type=str,
                         value_type=str)
    check.str_param(pipeline_name, "pipeline_name")
    check.inst_param(execution_time, "execution_time", datetime.time)

    cron_schedule = "{minute} {hour} * * *".format(
        minute=execution_time.minute, hour=execution_time.hour)

    partition_fn = date_partition_range(start_date, end=end_date)

    def inner(fn):
        check.callable_param(fn, "fn")

        schedule_name = name or fn.__name__

        tags_fn_for_partition_value = lambda partition: {}
        if tags_fn_for_date:
            tags_fn_for_partition_value = lambda partition: tags_fn_for_date(
                partition.value)

        partition_set = PartitionSetDefinition(
            name="{}_partitions".format(schedule_name),
            pipeline_name=pipeline_name,
            partition_fn=partition_fn,
            run_config_fn_for_partition=lambda partition: fn(partition.value),
            solid_selection=solid_selection,
            tags_fn_for_partition=tags_fn_for_partition_value,
            mode=mode,
        )

        return partition_set.create_schedule_definition(
            schedule_name,
            cron_schedule,
            should_execute=should_execute,
            environment_vars=environment_vars,
        )

    return inner
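A sketch pairing the decorator with ``tags_fn_for_date`` (hypothetical names):

import datetime

def date_tags(date):
    # Tags are attached to each scheduled run, keyed by the partition date.
    return {'date': date.strftime('%Y-%m-%d')}

@daily_schedule(
    pipeline_name='my_pipeline',
    start_date=datetime.datetime(2020, 1, 1),
    execution_time=datetime.time(hour=6, minute=30),
    tags_fn_for_date=date_tags,
)
def my_daily_schedule(date):
    return {'solids': {'process': {'config': {'date': date.strftime('%Y-%m-%d')}}}}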