Example 1
def test_basic_even_type():
    # start_test_basic_even_type
    EvenDagsterType = DagsterType(
        name="EvenDagsterType",
        type_check_fn=lambda _, value: isinstance(value, int) and value % 2 == 0,
    )
    # end_test_basic_even_type

    # start_test_basic_even_type_with_annotations
    @solid
    def double_even(_, num: EvenDagsterType) -> EvenDagsterType:
        # These type annotations are a shorthand for constructing InputDefinitions
        # and OutputDefinitions, and are not mypy compliant
        return num  # at runtime this is a python int

    # end_test_basic_even_type_with_annotations

    assert execute_solid(double_even, input_values={"num": 2}).success

    with pytest.raises(DagsterTypeCheckDidNotPass):
        execute_solid(double_even, input_values={"num": 3})

    assert not execute_solid(
        double_even, input_values={"num": 3}, raise_on_error=False
    ).success
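
A type check function can also return a TypeCheck instead of a bare bool, which lets it attach a human-readable message. A minimal sketch of that variant (the name and message are illustrative; assumes TypeCheck is imported from dagster):

EvenWithFeedback = DagsterType(
    name="EvenWithFeedback",
    type_check_fn=lambda _, value: TypeCheck(
        success=isinstance(value, int) and value % 2 == 0,
        description=f"Got {value!r}; expected an even int.",
    ),
)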
Example 2
def pandera_schema_to_dagster_type(schema, name, description):
    def type_check_fn(_context, value):
        if not isinstance(value, pd.DataFrame):
            return TypeCheck(
                success=False,
                description=f"Must be pandas.DataFrame, not {type(value).__name__}.",
            )
        try:
            # `lazy` instructs pandera to capture every (not just the first) validation error
            schema.validate(value, lazy=True)
        except pa.errors.SchemaErrors as e:
            return TypeCheck(
                success=False,
                description=str(e),
                metadata={
                    "num_violations": len(e.failure_cases),
                },
            )

        return TypeCheck(success=True)

    return DagsterType(
        type_check_fn=type_check_fn,
        name=name,
        description=description,
    )
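
A minimal hedged usage sketch for this factory (the schema and names are illustrative; assumes pandera is installed and imported as pa, matching the snippet):

schema = pa.DataFrameSchema({"value": pa.Column(int, pa.Check.ge(0))})

NonNegativeDataFrame = pandera_schema_to_dagster_type(
    schema,
    name="NonNegativeDataFrame",
    description="A dataframe whose value column is non-negative.",
)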
Example 3
    def __init__(self, storage_key, path, computation):
        self._storage_key = check.str_param(storage_key, "storage_key")
        self._path = canonicalize_path(path)
        self._computation = check.opt_inst_param(computation, "computation", Computation)
        self._dagster_type = DagsterType(
            type_check_fn=lambda a, b: True, name=".".join(self.path)
        )
Example 4
def test_basic_even_type_no_annotations():
    EvenDagsterType = DagsterType(
        name="EvenDagsterType",
        type_check_fn=lambda _, value: isinstance(value, int) and value % 2 == 0,
    )

    # start_test_basic_even_type_no_annotations
    @solid(
        input_defs=[InputDefinition("num", EvenDagsterType)],
        output_defs=[OutputDefinition(EvenDagsterType)],
    )
    def double_even(_, num):
        return num

    # end_test_basic_even_type_no_annotations

    assert execute_solid(double_even, input_values={"num": 2}).success

    with pytest.raises(DagsterTypeCheckDidNotPass):
        execute_solid(double_even, input_values={"num": 3})

    assert not execute_solid(
        double_even, input_values={"num": 3}, raise_on_error=False
    ).success
Example 5
def test_inner_inputs_connected_to_nested_outer_dependency():
    my_dagster_type = DagsterType(name="foo", type_check_fn=lambda _, _a: True)

    @solid(input_defs=[InputDefinition("data", my_dagster_type)])
    def inner_solid(data):
        return data

    @composite_solid(input_defs=[InputDefinition("data_1", my_dagster_type)])
    def inner_composite(data_1):
        # source output handle should be top_level solid
        return inner_solid(data_1)

    @composite_solid(input_defs=[InputDefinition("data_2", my_dagster_type)])
    def middle_composite(data_2):
        return inner_composite(data_2)

    @composite_solid(input_defs=[InputDefinition("data_3", my_dagster_type)])
    def outer_composite(data_3):
        return middle_composite(data_3)

    @solid
    def top_level_solid():
        return "from top_level_solid"

    @pipeline
    def my_pipeline():
        # inner_solid should be connected to top_level_solid
        outer_composite(top_level_solid())

    result = execute_pipeline(my_pipeline)
    assert result.success
    assert (
        result.output_for_solid("outer_composite.middle_composite.inner_composite.inner_solid")
        == "from top_level_solid"
    )
Example 6
    def __init__(self, storage_key, path, computation):
        self._storage_key = check.str_param(storage_key, 'storage_key')
        self._path = check.tuple_param(path, 'path', of_type=str)
        self._computation = check.opt_inst_param(computation, 'computation', Computation)
        self._dagster_type = DagsterType(
            type_check_fn=lambda a, b: True, name='.'.join(self.path)
        )
Example 7
def test_type_materializer_and_configurable_output_manager():
    @dagster_type_materializer(config_schema={"type_materializer_path": str})
    def my_materializer(_, _config, _value):
        assert False, "shouldn't get here"

    adict = {}

    @output_manager(output_config_schema={"output_manager_path": str})
    def my_output_manager(_context, _resource_config, obj):
        adict["result"] = obj

    my_type = DagsterType(lambda _, _val: True, name="my_type", materializer=my_materializer)

    @solid(
        output_defs=[
            OutputDefinition(name="output1", manager_key="my_output_manager", dagster_type=my_type),
            OutputDefinition(name="output2", dagster_type=my_type),
        ]
    )
    def my_solid(_):
        yield Output(5, "output1")
        yield Output(7, "output2")

    @pipeline(mode_defs=[ModeDefinition(resource_defs={"my_output_manager": my_output_manager})])
    def my_pipeline():
        my_solid()

    execute_pipeline(
        my_pipeline,
        run_config={"solids": {"my_solid": {"outputs": {"output1": {"output_manager_path": "a"}}}}},
    )

    assert adict["result"] == 5
Example 8
def pandera_schema_to_dagster_type(
    schema: Union[pa.DataFrameSchema, Type[pa.SchemaModel]],
) -> DagsterType:
    """
    Convert a Pandera dataframe schema to a `DagsterType`.

    The generated Dagster type will be given an automatically generated `name`. The schema's `title`
    property, `name` property, or class name (in that order) will be used. If neither `title` nor
    `name` is defined, a name of the form `DagsterPanderaDataframe<n>` is generated.

    Additional metadata is also extracted from the Pandera schema and attached to the returned
    `DagsterType` in a `MetadataEntry` object. The extracted metadata includes:

    - Descriptions on the schema and constituent columns and checks.
    - Data types for each column.
    - String representations of all column-wise checks.
    - String representations of all row-wise (i.e. "wide") checks.

    The returned `DagsterType` will call the Pandera schema's `validate()` method in its type
    check function. Validation is done in `lazy` mode, i.e. pandera will attempt to validate all
    values in the dataframe, rather than stopping on the first error.

    If validation fails, the returned `TypeCheck` object will contain two pieces of metadata:

    - `num_failures`: the total number of validation errors.
    - `failure_sample`: a table containing up to the first 10 validation errors.

    Args:
        schema (Union[pa.DataFrameSchema, Type[pa.SchemaModel]]): The Pandera schema to convert.

    Returns:
        DagsterType: Dagster Type constructed from the Pandera schema.

    """
    if not (
        isinstance(schema, pa.DataFrameSchema)
        or (isinstance(schema, type) and issubclass(schema, pa.SchemaModel))
    ):
        raise TypeError(
            "schema must be a pandera `DataFrameSchema` or a subclass of a pandera `SchemaModel`"
        )

    name = _extract_name_from_pandera_schema(schema)
    norm_schema = (
        schema.to_schema()  # type: ignore[attr-defined]
        if isinstance(schema, type) and issubclass(schema, pa.SchemaModel)
        else schema
    )
    tschema = _pandera_schema_to_table_schema(norm_schema)
    type_check_fn = _pandera_schema_to_type_check_fn(norm_schema, tschema)

    return DagsterType(
        type_check_fn=type_check_fn,
        name=name,
        description=norm_schema.description,
        metadata_entries=[
            MetadataEntry("schema", value=tschema),
        ],
    )
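
A hedged usage sketch wrapping a hypothetical SchemaModel subclass (the class and field names are illustrative; assumes pandera.typing is available):

from pandera.typing import Series

class TripSchema(pa.SchemaModel):
    distance_km: Series[float] = pa.Field(ge=0)

TripDataFrame = pandera_schema_to_dagster_type(TripSchema)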
Example 9
    def as_dagster_type(self, *args, **kwargs):
        if self.raise_or_typecheck:
            raise Exception(
                "Dagster types can only be constructed from constraints that return typechecks"
            )
        return DagsterType(
            name=self.name,
            description="A Pandas DataFrame with the following validation: {}".format(
                self.description
            ),
            type_check_fn=lambda x: self.validate(x, *args),
            **kwargs,
        )
Example 10
def test_register_after_solid_definition():
    class MyClass:
        pass

    @solid
    def _my_solid(_) -> MyClass:
        return MyClass()

    my_dagster_type = DagsterType(name="aaaa", type_check_fn=lambda _, _a: True)

    with pytest.raises(DagsterInvalidDefinitionError):
        make_python_type_usable_as_dagster_type(MyClass, my_dagster_type)
Example 11
def test_basic_even_type():
    EvenDagsterType = DagsterType(
        name="EvenDagsterType",
        type_check_fn=lambda _, value: isinstance(value, int) and value % 2 == 0,
    )

    @solid
    def double_even(_, num: EvenDagsterType) -> EvenDagsterType:
        return num  # at runtime this is a python int

    assert execute_solid(double_even, input_values={"num": 2}).success

    with pytest.raises(DagsterTypeCheckDidNotPass):
        execute_solid(double_even, input_values={"num": 3})

    assert not execute_solid(double_even, input_values={"num": 3}, raise_on_error=False).success
Example 12
def test_type_missing_resource_fails():
    def resource_based_type_check(context, value):
        return context.resources.a == value

    CustomType = DagsterType(
        name="NeedsA",
        type_check_fn=resource_based_type_check,
        required_resource_keys={"a"},
    )

    @solid(output_defs=[OutputDefinition(CustomType, "custom_type")])
    def custom_type_solid(_):
        return "A"

    with pytest.raises(DagsterInvalidDefinitionError, match='required by type "NeedsA"'):

        @pipeline
        def _type_check_pipeline():
            custom_type_solid()
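
For contrast, a hedged sketch of the passing case: registering a resource under the key "a" satisfies the type's requirement (resource_a is illustrative; assumes resource, ModeDefinition, and pipeline are imported from dagster):

@resource
def resource_a(_):
    return "A"

@pipeline(mode_defs=[ModeDefinition(resource_defs={"a": resource_a})])
def type_check_pipeline():
    custom_type_solid()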
Example 13
def create_dagster_pandas_dataframe_type(name=None,
                                         description=None,
                                         columns=None,
                                         event_metadata_fn=None,
                                         dataframe_constraints=None):
    event_metadata_fn = check.opt_callable_param(event_metadata_fn,
                                                 'event_metadata_fn')
    description = create_dagster_pandas_dataframe_description(
        check.opt_str_param(description, 'description', default=''),
        check.opt_list_param(columns, 'columns', of_type=PandasColumn),
    )

    def _dagster_type_check(_, value):
        if not isinstance(value, pd.DataFrame):
            return TypeCheck(
                success=False,
                description='Must be a pandas.DataFrame. Got value of type {type_name}.'.format(
                    type_name=type(value).__name__
                ),
            )

        try:
            validate_constraints(
                value, pandas_columns=columns, dataframe_constraints=dataframe_constraints
            )
        except ConstraintViolationException as e:
            return TypeCheck(success=False, description=str(e))

        return TypeCheck(
            success=True,
            metadata_entries=_execute_summary_stats(name, value, event_metadata_fn)
            if event_metadata_fn
            else None,
        )

    return DagsterType(
        name=name,
        type_check_fn=_dagster_type_check,
        input_hydration_config=dataframe_input_schema,
        output_materialization_config=dataframe_output_schema,
        description=description,
    )
Example 14
    def define_type_check_pipeline(should_require_resources):
        @resource
        def resource_a(_):
            yield 'A'

        def resource_based_type_check(context, value):
            return context.resources.a == value

        CustomType = DagsterType(
            name='NeedsA',
            type_check_fn=resource_based_type_check,
            required_resource_keys={'a'} if should_require_resources else None,
        )

        @solid(output_defs=[OutputDefinition(CustomType, 'custom_type')])
        def custom_type_solid(_):
            return 'A'

        @pipeline(mode_defs=[ModeDefinition(resource_defs={'a': resource_a})])
        def type_check_pipeline():
            custom_type_solid()

        return type_check_pipeline
Example 15
def less_simple_data_frame_type_check(_, value):
    if not isinstance(value, list):
        return False

    fields = [field for field in value[0].keys()]

    for i in range(len(value)):
        row = value[i]
        if not isinstance(row, dict):
            return False
        row_fields = [field for field in row.keys()]
        if fields != row_fields:
            return False

    return True


LessSimpleDataFrame = DagsterType(
    name='LessSimpleDataFrame',
    description='A more sophisticated data frame that type checks its structure.',
    type_check_fn=less_simple_data_frame_type_check,
)


@solid
def bad_read_csv(context, csv_path: str) -> LessSimpleDataFrame:
    csv_path = os.path.join(os.path.dirname(__file__), csv_path)
    with open(csv_path, 'r') as fd:
        lines = [row for row in csv.DictReader(fd)]

    context.log.info('Read {n_lines} lines'.format(n_lines=len(lines)))
    return ["not_a_dict"]


Example 16
import csv

from dagster import (
    DagsterType,
    InputDefinition,
    OutputDefinition,
    String,
    execute_pipeline,
    pipeline,
    solid,
)

SimpleDataFrame = DagsterType(
    name='SimpleDataFrame',
    type_check_fn=lambda _, value: isinstance(value, list),
    description='A naive representation of a data frame, e.g., as returned by csv.DictReader.',
)


@solid(
    input_defs=[InputDefinition('csv_path', String)],
    output_defs=[OutputDefinition(SimpleDataFrame)],
)
def read_csv(context, csv_path: str) -> list:
    with open(csv_path, 'r') as fd:
        lines = [row for row in csv.DictReader(fd)]

    context.log.info('Read {n_lines} lines'.format(n_lines=len(lines)))
    return lines
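
A hedged sketch of executing read_csv, supplying its String input through run config (the pipeline wiring and path are illustrative):

@pipeline
def csv_pipeline():
    read_csv()

result = execute_pipeline(
    csv_pipeline,
    run_config={'solids': {'read_csv': {'inputs': {'csv_path': {'value': 'cereal.csv'}}}}},
)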
Example 17
        )

    if timeseries_length != output_vector_length:
        return TypeCheck(
            success=False,
            description="Every timeseries must have as many snapshots as outputs",
        )

    return TypeCheck(
        success=True,
        metadata_entries=[
            EventMetadataEntry.text(str(num_timeseries), "num_ts",
                                    "Number of parallel timeseries."),
            EventMetadataEntry.text(str(timeseries_length),
                                    "timeseries_length",
                                    "Length of each timeseries."),
            EventMetadataEntry.text(
                str(snapshot_length),
                "snapshot_length",
                "Number of past observations for each input.",
            ),
        ],
    )


TrainingSet = DagsterType(
    name="TrainingSet",
    description="Final training set ready for the ml pipeline",
    type_check_fn=validate_snapshot_timeseries,
)
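
A brief hedged sketch of attaching the type at a solid boundary, so that validate_snapshot_timeseries runs on every value crossing it (the solid is an illustrative stub; assumes solid and OutputDefinition are imported from dagster):

@solid(output_defs=[OutputDefinition(TrainingSet)])
def produce_training_set(_):
    ...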
Example 18
def create_dagster_pandas_dataframe_type(
    name,
    description=None,
    columns=None,
    event_metadata_fn=None,
    dataframe_constraints=None,
    input_hydration_config=None,
    output_materialization_config=None,
):
    """
    Constructs a custom pandas dataframe dagster type.

    Args:
        name (str): Name of the dagster pandas type.
        description (Optional[str]): A markdown-formatted string, displayed in tooling.
        columns (Optional[List[PandasColumn]]): A list of :py:class:`~dagster.PandasColumn` objects
            which express dataframe column schemas and constraints.
        event_metadata_fn (Optional[func]): A callable which takes your dataframe and returns a list of EventMetadata
            which allow you to express things like summary statistics during runtime.
        dataframe_constraints (Optional[List[DataFrameConstraint]]): A list of objects that inherit from
            :py:class:`~dagster.DataFrameConstraint`. This allows you to express dataframe-level constraints.
        input_hydration_config (Optional[InputHydrationConfig]): An instance of a class that
            inherits from :py:class:`~dagster.InputHydrationConfig`. If None, we will default
            to using the `dataframe_input_schema` input_hydration_config.
        output_materialization_config (Optional[OutputMaterializationConfig]): An instance of a class
            that inherits from :py:class:`~dagster.OutputMaterializationConfig`. If None, we will
            default to using the `dataframe_output_schema` output_materialization_config.
    """
    # We allow input_hydration_configs/output_materialization_configs to be plugged in so that
    # users can hydrate and persist their custom dataframes their own way via configuration
    # if the default configs don't suffice. This is purely optional.
    check.str_param(name, 'name')
    event_metadata_fn = check.opt_callable_param(event_metadata_fn,
                                                 'event_metadata_fn')
    description = create_dagster_pandas_dataframe_description(
        check.opt_str_param(description, 'description', default=''),
        check.opt_list_param(columns, 'columns', of_type=PandasColumn),
    )

    def _dagster_type_check(_, value):
        if not isinstance(value, pd.DataFrame):
            return TypeCheck(
                success=False,
                description='Must be a pandas.DataFrame. Got value of type {type_name}.'.format(
                    type_name=type(value).__name__
                ),
            )

        try:
            validate_constraints(
                value, pandas_columns=columns, dataframe_constraints=dataframe_constraints
            )
        except ConstraintViolationException as e:
            return TypeCheck(success=False, description=str(e))

        return TypeCheck(
            success=True,
            metadata_entries=_execute_summary_stats(name, value, event_metadata_fn)
            if event_metadata_fn
            else None,
        )

    return DagsterType(
        name=name,
        type_check_fn=_dagster_type_check,
        input_hydration_config=input_hydration_config
        if input_hydration_config
        else dataframe_input_schema,
        output_materialization_config=output_materialization_config
        if output_materialization_config
        else dataframe_output_schema,
        description=description,
    )
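
A minimal hedged usage sketch (the column spec is illustrative; integer_column is one of dagster-pandas' PandasColumn helpers):

TripDataFrame = create_dagster_pandas_dataframe_type(
    name='TripDataFrame',
    columns=[PandasColumn.integer_column('amount', min_value=0)],
)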
Example 19
        )
    if not safe_isfile(value):
        return TypeCheck(
            success=False,
            description=(
                'FileExistsAtPath must be a path that points to a file that '
                'exists. "{value}" does not exist on disk'
            ).format(value=value),
        )

    return True


FileExistsAtPath = DagsterType(
    name='FileExistsAtPath',
    description='A path at which a file actually exists',
    type_check_fn=file_exists_at_path_type_check,
)


def _download_from_s3_to_file(session, context, bucket, key, target_folder, skip_if_present):
    # TODO: remove context argument once we support resource logging

    # file name is S3 key path suffix after last /
    target_file = os.path.join(target_folder, key.split('/')[-1])

    if skip_if_present and safe_isfile(target_file):
        context.log.info(
            'Skipping download, file already present at {target_file}'.format(
                target_file=target_file
            )
Example 20
def create_structured_dataframe_type(
    name,
    description=None,
    columns_validator=None,
    columns_aggregate_validator=None,
    dataframe_validator=None,
    input_hydration_config=None,
    output_materialization_config=None,
):
    """

    Args:
        name (str): the name of the new type
        description (Optional[str]): the description of the new type
        columns_validator (Optional[Union[ColumnConstraintWithMetadata, MultiColumnConstraintWithMetadata]]):
                    what column-level row by row validation you want to have applied.
                    Leave empty for no column-level row by row validation.
        columns_aggregate_validator (Optional[Union[ColumnAggregateConstraintWithMetadata,
                                    MultiAggregateConstraintWithMetadata]]):
                    what column-level aggregate validation you want to have applied.
                    Leave empty for no column-level aggregate validation.
        dataframe_validator (Optional[Union[ConstraintWithMetadata, MultiConstraintWithMetadata]]):
                    what dataframe-wide validation you want to have applied.
                    Leave empty for no dataframe-wide validation.
        input_hydration_config (Optional[InputHydrationConfig]): An instance of a class that
            inherits from :py:class:`~dagster.InputHydrationConfig`. If None, we will default
            to using the `dataframe_input_schema` input_hydration_config.
        output_materialization_config (Optional[OutputMaterializationConfig]): An instance of a class
            that inherits from :py:class:`~dagster.OutputMaterializationConfig`. If None, we will
            default to using the `dataframe_output_schema` output_materialization_config.

    Returns:
        a DagsterType with the corresponding name and packaged validation.

    """
    def _dagster_type_check(_, value):
        if not isinstance(value, pd.DataFrame):
            return TypeCheck(
                success=False,
                description='Must be a pandas.DataFrame. Got value of type {type_name}.'.format(
                    type_name=type(value).__name__
                ),
            )
        individual_result_dict = {}
        if columns_validator is not None:
            individual_result_dict["columns"] = columns_validator.validate(value)

        if columns_aggregate_validator is not None:
            individual_result_dict["column aggregates"] = columns_aggregate_validator.validate(
                value
            )

        if dataframe_validator is not None:
            individual_result_dict["dataframe"] = dataframe_validator.validate(value)

        typechecks_succeeded = True
        metadata = []
        overall_description = ""
        for key, result in individual_result_dict.items():
            result_val = result.success
            if result_val:
                continue
            typechecks_succeeded = typechecks_succeeded and result_val
            result_dict = result.metadata_entries[0].entry_data.data
            metadata.append(
                EventMetadataEntry.json(result_dict, '{}-constraint-metadata'.format(key))
            )
            overall_description += "{} failing constraints, requiring {}".format(
                key, result.description
            )
        return TypeCheck(
            success=typechecks_succeeded,
            description=overall_description,
            metadata_entries=metadata,
        )

    description = check.opt_str_param(description, 'description', default='')
    return DagsterType(
        name=name,
        type_check_fn=_dagster_type_check,
        input_hydration_config=input_hydration_config
        if input_hydration_config
        else dataframe_input_schema,
        output_materialization_config=output_materialization_config
        if output_materialization_config
        else dataframe_output_schema,
        description=description,
    )
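
A minimal hedged sketch: with no validators supplied, only the pandas.DataFrame isinstance check applies (the name is illustrative):

LooseDataFrame = create_structured_dataframe_type(
    name='LooseDataFrame',
    description='Only checks that the value is a pandas DataFrame.',
)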
Example 21
        success=True,
        metadata_entries=[
            EventMetadataEntry.text(str(len(value)), 'row_count',
                                    'Number of rows in DataFrame'),
            # string cast columns since they may be things like datetime
            EventMetadataEntry.json({'columns': list(map(str, value.columns))},
                                    'metadata'),
        ],
    )


DataFrame = DagsterType(
    name='PandasDataFrame',
    description='''Two-dimensional size-mutable, potentially heterogeneous
    tabular data structure with labeled axes (rows and columns).
    See http://pandas.pydata.org/''',
    input_hydration_config=dataframe_input_schema,
    output_materialization_config=dataframe_output_schema,
    type_check_fn=df_type_check,
)


def _construct_constraint_list(constraints):
    def add_bullet(constraint_list, constraint_description):
        return constraint_list + "+ {constraint_description}\n".format(
            constraint_description=constraint_description)

    constraint_list = ""
    for constraint in constraints:
        if constraint.__class__ not in CONSTRAINT_BLACKLIST:
            constraint_list = add_bullet(constraint_list,
Example 22
@input_hydration_config(Selector({"csv": Field(String)}))
def less_simple_data_frame_input_hydration_config(context, selector):
    lines = []
    with open(selector["csv"], "r") as fd:
        for row in csv.DictReader(fd):
            row["calories"] = int(row["calories"])
            lines.append(row)

    context.log.info("Read {n_lines} lines".format(n_lines=len(lines)))
    return lines


LessSimpleDataFrame = DagsterType(
    name="LessSimpleDataFrame",
    description="A more sophisticated data frame that type checks its structure.",
    type_check_fn=less_simple_data_frame_type_check,
    input_hydration_config=less_simple_data_frame_input_hydration_config,
)
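
A hedged sketch of how a downstream solid's input could then be supplied from csv via run config through this hydration config (the pipeline, solid, and path names are all illustrative; assumes execute_pipeline is imported from dagster):

execute_pipeline(
    my_pipeline,
    run_config={
        "solids": {"sort_by_calories": {"inputs": {"cereals": {"csv": "cereal.csv"}}}}
    },
)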


def expect_column_to_be_integers(data_frame: LessSimpleDataFrame,
                                 column_name: str) -> ExpectationResult:
    bad_values = []
    for idx in range(len(data_frame)):
        line = data_frame[idx]
        if not isinstance(line[column_name], int):
            bad_values.append((idx, str(line[column_name])))
    return ExpectationResult(
        success=(not bad_values),
        label="col_{column_name}_is_int".format(column_name=column_name),
Example 23
    OutputDefinition,
    execute_pipeline,
    pipeline,
    solid,
)


# start_custom_types_2_marker_0
def is_list_of_dicts(_, value):
    return isinstance(value, list) and all(
        isinstance(element, dict) for element in value)


SimpleDataFrame = DagsterType(
    name="SimpleDataFrame",
    type_check_fn=is_list_of_dicts,
    description="A naive representation of a data frame, e.g., as returned by csv.DictReader.",
)
# end_custom_types_2_marker_0


# start_custom_types_2_marker_1
@solid(output_defs=[OutputDefinition(SimpleDataFrame)])
def bad_read_csv(context):
    csv_path = os.path.join(os.path.dirname(__file__), "cereal.csv")
    with open(csv_path, "r") as fd:
        lines = [row for row in csv.DictReader(fd)]

    context.log.info(f"Read {len(lines)} lines")
    return ["not_a_dict"]
Example 24
        success=True,
        metadata_entries=[
            EventMetadataEntry.text(str(len(value)), "row_count",
                                    "Number of rows in DataFrame"),
            # string cast columns since they may be things like datetime
            EventMetadataEntry.json({"columns": list(map(str, value.columns))},
                                    "metadata"),
        ],
    )


DataFrame = DagsterType(
    name="PandasDataFrame",
    description="""Two-dimensional size-mutable, potentially heterogeneous
    tabular data structure with labeled axes (rows and columns).
    See http://pandas.pydata.org/""",
    loader=dataframe_loader,
    materializer=dataframe_materializer,
    type_check_fn=df_type_check,
)


def _construct_constraint_list(constraints):
    def add_bullet(constraint_list, constraint_description):
        return constraint_list + "+ {constraint_description}\n".format(
            constraint_description=constraint_description)

    constraint_list = ""
    for constraint in constraints:
        if constraint.__class__ not in CONSTRAINT_BLACKLIST:
            constraint_list = add_bullet(constraint_list,
Example 25
def less_simple_data_frame_loader(context, selector):
    csv_path = os.path.join(os.path.dirname(__file__), selector["csv"])
    with open(csv_path, "r") as fd:
        lines = [row for row in csv.DictReader(fd)]

    context.log.info("Read {n_lines} lines".format(n_lines=len(lines)))
    return lines


# end_custom_types_3_marker_0

# start_custom_types_3_marker_1
LessSimpleDataFrame = DagsterType(
    name="LessSimpleDataFrame",
    description="A more sophisticated data frame that type checks its structure.",
    type_check_fn=less_simple_data_frame_type_check,
    loader=less_simple_data_frame_loader,
)
# end_custom_types_3_marker_1


@solid
def sort_by_calories(context, cereals: LessSimpleDataFrame):
    sorted_cereals = sorted(cereals, key=lambda cereal: cereal["calories"])
    context.log.info("Least caloric cereal: {least_caloric}".format(
        least_caloric=sorted_cereals[0]["name"]))
    context.log.info("Most caloric cereal: {most_caloric}".format(
        most_caloric=sorted_cereals[-1]["name"]))

Example 26
            to_function(dask_df, *to_args, **to_kwargs)

        if to_path:
            yield AssetMaterialization.file(to_path)


def df_type_check(_, value):
    if not isinstance(value, dd.DataFrame):
        return TypeCheck(success=False)
    return TypeCheck(
        success=True,
        metadata_entries=[
            # string cast columns since they may be things like datetime
            EventMetadataEntry.json({"columns": list(map(str, value.columns))},
                                    "metadata"),
        ],
    )


DataFrame = DagsterType(
    name="DaskDataFrame",
    description="""A Dask DataFrame is a large parallel DataFrame composed of many smaller Pandas DataFrames, split along the index.
    These Pandas DataFrames may live on disk for larger-than-memory computing on a single machine, or on many different machines in a cluster.
    One Dask DataFrame operation triggers many operations on the constituent Pandas DataFrames.
    See https://docs.dask.org/en/latest/dataframe.html""",
    loader=dataframe_loader,
    materializer=dataframe_materializer,
    type_check_fn=df_type_check,
)
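
A hedged sketch of using the type at a solid boundary (the solid and frame are illustrative; assumes pandas is imported as pd alongside dask.dataframe as dd, and solid/OutputDefinition from dagster):

@solid(output_defs=[OutputDefinition(DataFrame)])
def load_dask_frame(_):
    return dd.from_pandas(pd.DataFrame({"a": [1, 2]}), npartitions=1)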
Example 27
    assert result.success

    result = basic.to_job(
        config={"ops": {"my_op": {"config": {"conf_str": "foo"}}}}
    ).execute_in_process()
    assert result.success


even_type = DagsterType(
    name="EvenDagsterType",
    type_check_fn=lambda _, value: isinstance(value, int) and value % 2 == 0,
)


# Test typing override between out and annotation. Should they just match?
def test_out_dagster_type():
    @op(out=Out(dagster_type=even_type))
    def basic() -> int:
        return 6

    assert basic.output_defs[0].dagster_type == even_type
    assert basic() == 6


def test_multiout_dagster_type():
    @op(out={
Example 28
# end_configured_op_marker

# start_input_op_marker


@op
def my_input_op(abc, xyz):
    pass


# end_input_op_marker

# start_typed_input_op_marker

MyDagsterType = DagsterType(type_check_fn=lambda _, value: value % 2 == 0,
                            name="MyDagsterType")


@op(ins={"abc": In(dagster_type=MyDagsterType)})
def my_typed_input_op(abc):
    pass


# end_typed_input_op_marker

# start_output_op_marker


@op
def my_output_op():
    return 5
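
A hedged sketch of wiring the typed input in a job; even_output_op is illustrative, and feeding my_output_op's 5 instead would fail MyDagsterType's even check at runtime (assumes job is imported from dagster):

@op
def even_output_op():
    return 6

@job
def typed_job():
    my_typed_input_op(even_output_op())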
Example 29
def create_dagster_pandas_dataframe_type(
    name,
    description=None,
    columns=None,
    event_metadata_fn=None,
    dataframe_constraints=None,
    loader=None,
    materializer=None,
):
    """
    Constructs a custom pandas dataframe dagster type.

    Args:
        name (str): Name of the dagster pandas type.
        description (Optional[str]): A markdown-formatted string, displayed in tooling.
        columns (Optional[List[PandasColumn]]): A list of :py:class:`~dagster.PandasColumn` objects
            which express dataframe column schemas and constraints.
        event_metadata_fn (Optional[func]): A callable which takes your dataframe and returns a list of EventMetadata
            which allow you to express things like summary statistics during runtime.
        dataframe_constraints (Optional[List[DataFrameConstraint]]): A list of objects that inherit from
            :py:class:`~dagster.DataFrameConstraint`. This allows you to express dataframe-level constraints.
        loader (Optional[DagsterTypeLoader]): An instance of a class that
            inherits from :py:class:`~dagster.DagsterTypeLoader`. If None, we will default
            to using `dataframe_loader`.
        materializer (Optional[DagsterTypeMaterializer]): An instance of a class
            that inherits from :py:class:`~dagster.DagsterTypeMaterializer`. If None, we will
            default to using `dataframe_materializer`.
    """
    # We allow dagster_type_loaders/materializers to be plugged in so that users can load and
    # materialize their custom dataframes their own way via configuration if the default
    # configs don't suffice. This is purely optional.
    check.str_param(name, "name")
    event_metadata_fn = check.opt_callable_param(event_metadata_fn,
                                                 "event_metadata_fn")
    description = create_dagster_pandas_dataframe_description(
        check.opt_str_param(description, "description", default=""),
        check.opt_list_param(columns, "columns", of_type=PandasColumn),
    )

    def _dagster_type_check(_, value):
        if not isinstance(value, pd.DataFrame):
            return TypeCheck(
                success=False,
                description="Must be a pandas.DataFrame. Got value of type {type_name}.".format(
                    type_name=type(value).__name__
                ),
            )

        try:
            validate_constraints(
                value, pandas_columns=columns, dataframe_constraints=dataframe_constraints
            )
        except ConstraintViolationException as e:
            return TypeCheck(success=False, description=str(e))

        return TypeCheck(
            success=True,
            metadata_entries=_execute_summary_stats(name, value, event_metadata_fn)
            if event_metadata_fn
            else None,
        )

    return DagsterType(
        name=name,
        type_check_fn=_dagster_type_check,
        loader=loader if loader else dataframe_loader,
        materializer=materializer if materializer else dataframe_materializer,
        description=description,
    )
Example 30
def create_structured_dataframe_type(
    name,
    description=None,
    columns_validator=None,
    columns_aggregate_validator=None,
    dataframe_validator=None,
    loader=None,
    materializer=None,
):
    """

    Args:
        name (str): the name of the new type
        description (Optional[str]): the description of the new type
        columns_validator (Optional[Union[ColumnConstraintWithMetadata, MultiColumnConstraintWithMetadata]]):
                    what column-level row by row validation you want to have applied.
                    Leave empty for no column-level row by row validation.
        columns_aggregate_validator (Optional[Union[ColumnAggregateConstraintWithMetadata,
                                    MultiAggregateConstraintWithMetadata]]):
                    what column-level aggregate validation you want to have applied.
                    Leave empty for no column-level aggregate validation.
        dataframe_validator (Optional[Union[ConstraintWithMetadata, MultiConstraintWithMetadata]]):
                    what dataframe-wide validation you want to have applied.
                    Leave empty for no dataframe-wide validation.
        loader (Optional[DagsterTypeLoader]): An instance of a class that
            inherits from :py:class:`~dagster.DagsterTypeLoader`. If None, we will default
            to using `dataframe_loader`.
        materializer (Optional[DagsterTypeMaterializer]): An instance of a class
            that inherits from :py:class:`~dagster.DagsterTypeMaterializer`. If None, we will
            default to using `dataframe_materializer`.

    Returns:
        a DagsterType with the corresponding name and packaged validation.

    """
    def _dagster_type_check(_, value):
        if not isinstance(value, pd.DataFrame):
            return TypeCheck(
                success=False,
                description="Must be a pandas.DataFrame. Got value of type {type_name}.".format(
                    type_name=type(value).__name__
                ),
            )
        individual_result_dict = {}

        if dataframe_validator is not None:
            individual_result_dict["dataframe"] = dataframe_validator.validate(value)
        if columns_validator is not None:
            individual_result_dict["columns"] = columns_validator.validate(value)

        if columns_aggregate_validator is not None:
            individual_result_dict["column-aggregates"] = columns_aggregate_validator.validate(
                value
            )

        typechecks_succeeded = True
        metadata = []
        overall_description = "Failed Constraints: {}"
        constraint_clauses = []
        for key, result in individual_result_dict.items():
            result_val = result.success
            if result_val:
                continue
            typechecks_succeeded = typechecks_succeeded and result_val
            result_dict = result.metadata_entries[0].entry_data.data
            metadata.append(
                EventMetadataEntry.json(result_dict, "{}-constraint-metadata".format(key))
            )
            constraint_clauses.append(
                "{} failing constraints, {}".format(key, result.description)
            )
        # returns aggregates, then column, then dataframe
        return TypeCheck(
            success=typechecks_succeeded,
            description=overall_description.format(constraint_clauses),
            metadata_entries=sorted(metadata, key=lambda x: x.label),
        )

    description = check.opt_str_param(description, "description", default="")
    return DagsterType(
        name=name,
        type_check_fn=_dagster_type_check,
        loader=loader if loader else dataframe_loader,
        materializer=materializer if materializer else dataframe_materializer,
        description=description,
    )