Beispiel #1
0
def test_column_type_constraint():
    """A dtype-in-set constraint returns None on a match and raises on a mismatch."""
    frame = DataFrame({"foo": ["baz"]})

    # The "foo" column is expected to satisfy a constraint that allows "object".
    object_only = ColumnDTypeInSetConstraint({"object"})
    outcome = object_only.validate(frame, "foo")
    assert outcome is None

    # The same column must be rejected when only "int64" is allowed.
    with pytest.raises(ConstraintViolationException):
        int_only = ColumnDTypeInSetConstraint({"int64"})
        int_only.validate(frame, "foo")
Beispiel #2
0
    def datetime_column(
        name,
        min_datetime=Timestamp.min,
        max_datetime=Timestamp.max,
        non_nullable=False,
        unique=False,
        ignore_missing_vals=False,
        is_required=None,
        tz=None,
    ):
        """
        Simple constructor for PandasColumns that expresses datetime constraints on 'datetime64[ns]' dtypes.

        Args:
            name (str): Name of the column. This must match up with the column name in the dataframe you
                expect to receive.
            min_datetime (Optional[Timestamp]): The lower bound for values you expect in this column.
                Defaults to pandas.Timestamp.min. When ``tz`` is given, any value accepted by the
                pandas.Timestamp constructor may be passed; it is converted to a Timestamp.
            max_datetime (Optional[Timestamp]): The upper bound for values you expect in this column.
                Defaults to pandas.Timestamp.max. When ``tz`` is given, any value accepted by the
                pandas.Timestamp constructor may be passed; it is converted to a Timestamp.
            non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column
                ought to be non null values.
            unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.
            ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will
                only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.
            is_required (Optional[bool]): Flag indicating the optional/required presence of the column.
                If the column exists the validate function will validate the column. Default to True.
            tz (Optional[str]): Required timezone for values eg: tz='UTC', tz='Europe/Dublin', tz='US/Eastern'.
                Defaults to None, meaning naive datetime values.

        Returns:
            PandasColumn: A column carrying a dtype constraint, an in-range constraint built from the
            (possibly tz-localized) bounds, and the keyword constraints requested via
            ``non_nullable`` / ``unique``.
        """
        if tz is None:
            # Naive columns: the bounds are handed to the range constraint exactly as given.
            datetime_constraint = ColumnDTypeInSetConstraint({"datetime64[ns]"})
        else:
            datetime_constraint = ColumnDTypeInSetConstraint({f"datetime64[ns, {tz}]"})
            # Coerce the bounds first. The checks below rely on Timestamp-only behaviour
            # (`replace(tzinfo=...)`, `.tz`, `tz_localize`), so a bound given as e.g. an ISO
            # string would otherwise fail on the `.replace` call before it was ever converted.
            min_datetime = Timestamp(min_datetime)
            max_datetime = Timestamp(max_datetime)
            # One day more/less than absolute min/max to prevent OutOfBoundsDatetime errors when converting min/max to be tz aware
            if min_datetime.replace(tzinfo=None) == Timestamp.min:
                min_datetime = Timestamp("1677-09-22 00:12:43.145225Z")
            if max_datetime.replace(tzinfo=None) == Timestamp.max:
                max_datetime = Timestamp("2262-04-10 23:47:16.854775807Z")
            # Naive bounds are localized to the requested tz; bounds that are already
            # tz-aware are left untouched.
            if min_datetime.tz is None:
                min_datetime = min_datetime.tz_localize(tz)
            if max_datetime.tz is None:
                max_datetime = max_datetime.tz_localize(tz)

        return PandasColumn(
            name=check.str_param(name, "name"),
            constraints=[
                datetime_constraint,
                InRangeColumnConstraint(
                    min_datetime, max_datetime, ignore_missing_vals=ignore_missing_vals
                ),
            ]
            + _construct_keyword_constraints(
                non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals
            ),
            is_required=is_required,
        )
Beispiel #3
0
def test_validate_constraints_ok():
    """validate_constraints returns None when the declared column constraint is met."""
    foo_column = PandasColumn(
        name='foo',
        constraints=[ColumnDTypeInSetConstraint({'object'})],
    )
    frame = DataFrame({'foo': ['bar', 'baz']})

    outcome = validate_constraints(frame, pandas_columns=[foo_column])
    assert outcome is None
Beispiel #4
0
def test_missing_column_validation_with_optional_column():
    """A column declared with is_required=False may be absent from the dataframe."""
    # "qux" is declared optional and is deliberately not present in the frame below.
    optional_qux = PandasColumn(
        name="qux",
        constraints=[ColumnDTypeInSetConstraint({"object"})],
        is_required=False,
    )
    frame = DataFrame({"foo": ["bar", "baz"]})

    outcome = validate_constraints(frame, pandas_columns=[optional_qux])
    assert outcome is None
Beispiel #5
0
def test_missing_column_validation():
    """Validation raises with a 'Required column' message when a declared column is absent."""
    expected_message = "Required column qux not in dataframe with columns"

    # "qux" is declared without is_required, and the frame only has "foo".
    qux_column = PandasColumn(
        name="qux",
        constraints=[ColumnDTypeInSetConstraint({"object"})],
    )
    frame = DataFrame({"foo": ["bar", "baz"]})

    with pytest.raises(ConstraintViolationException, match=expected_message):
        validate_constraints(frame, pandas_columns=[qux_column])
Beispiel #6
0
def test_missing_column_validation_with_optional_column():
    """A column declared with is_required=False may be absent from the dataframe."""
    # 'qux' is declared optional and is deliberately not present in the frame below.
    optional_qux = PandasColumn(
        name='qux',
        constraints=[ColumnDTypeInSetConstraint({'object'})],
        is_required=False,
    )
    frame = DataFrame({'foo': ['bar', 'baz']})

    outcome = validate_constraints(frame, pandas_columns=[optional_qux])
    assert outcome is None
Beispiel #7
0
def test_dataframe_description_generation_just_type_constraint():
    """The generated type description lists a column with only its dtype constraint."""
    foo_column = PandasColumn(
        name="foo",
        constraints=[ColumnDTypeInSetConstraint({"int64"})],
    )
    frame_type = create_dagster_pandas_dataframe_type(
        name="TestDataFrame",
        columns=[foo_column],
    )

    expected_description = "\n### Columns\n**foo**: `int64`\n\n"
    assert frame_type.description == expected_description
Beispiel #8
0
def test_create_pandas_dataframe_dagster_type():
    """create_dagster_pandas_dataframe_type produces a DagsterType instance."""
    foo_column = PandasColumn(
        name="foo",
        constraints=[ColumnDTypeInSetConstraint({"int64"})],
    )
    frame_type = create_dagster_pandas_dataframe_type(
        name="TestDataFrame",
        columns=[foo_column],
    )

    assert isinstance(frame_type, DagsterType)
Beispiel #9
0
def test_missing_column_validation():
    """Validation raises with a 'Required column' message when a declared column is absent."""
    expected_message = "Required column qux not in dataframe with columns"

    # 'qux' is declared without is_required, and the frame only has 'foo'.
    qux_column = PandasColumn(
        name='qux',
        constraints=[ColumnDTypeInSetConstraint({'object'})],
    )
    frame = DataFrame({'foo': ['bar', 'baz']})

    with pytest.raises(ConstraintViolationException, match=expected_message):
        validate_constraints(frame, pandas_columns=[qux_column])
Beispiel #10
0
def test_dataframe_description_generation_multi_constraints():
    """The generated description lists the dtype followed by one bullet per extra constraint."""
    # Constraints are built in the same order they are expected to appear in the description.
    foo_constraints = [
        ColumnDTypeInSetConstraint({"int64"}),
        InRangeColumnConstraint(0, 100, ignore_missing_vals=False),
        NonNullableColumnConstraint(),
    ]
    foo_column = PandasColumn(name="foo", constraints=foo_constraints)
    frame_type = create_dagster_pandas_dataframe_type(
        name="TestDataFrame",
        columns=[foo_column],
    )

    expected_description = (
        "\n### Columns\n**foo**: `int64`\n+ 0 < values < 100\n+ No Null values allowed.\n\n"
    )
    assert frame_type.description == expected_description
Beispiel #11
0
    def datetime_column(
        name,
        min_datetime=Timestamp.min,
        max_datetime=Timestamp.max,
        non_nullable=False,
        unique=False,
        ignore_missing_vals=False,
        is_required=None,
    ):
        """
        Build a PandasColumn that constrains a 'datetime64[ns]' column to a datetime range.

        Args:
            name (str): Name of the column. It has to match the column name in the dataframe that
                will be validated.
            min_datetime: Lower bound handed to the range constraint. Defaults to pandas.Timestamp.min.
            max_datetime: Upper bound handed to the range constraint. Defaults to pandas.Timestamp.max.
            non_nullable (Optional[bool]): If true, this column will enforce a constraint that all
                values in the column ought to be non null values.
            unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the
                column values.
            ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true,
                the constraint will only evaluate non-null data. Ignore_missing_vals and non_nullable
                cannot both be True.
            is_required (Optional[bool]): Flag indicating the optional/required presence of the column.
                If the column exists the validate function will validate the column. Default to True.
        """
        column_name = check.str_param(name, "name")

        # The two constraints every datetime column gets: exact dtype and value range.
        dtype_constraint = ColumnDTypeInSetConstraint({"datetime64[ns]"})
        range_constraint = InRangeColumnConstraint(
            min_datetime, max_datetime, ignore_missing_vals=ignore_missing_vals
        )
        # Additional constraints derived from the non_nullable / unique flags.
        keyword_constraints = _construct_keyword_constraints(
            non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals
        )

        return PandasColumn(
            name=column_name,
            constraints=[dtype_constraint, range_constraint] + keyword_constraints,
            is_required=is_required,
        )
Beispiel #12
0
    def categorical_column(
        name,
        categories,
        of_types=frozenset({"category", "object"}),
        non_nullable=False,
        unique=False,
        ignore_missing_vals=False,
        is_required=None,
    ):
        """
        Build a PandasColumn that restricts a column to a set of categories and dtypes.

        Args:
            name (str): Name of the column. It has to match the column name in the dataframe that
                will be validated.
            categories (List[Any]): The valid set of buckets that all values in the column must match.
            of_types (Optional[Union[str, Set[str]]]): The expected dtype[s] that your categories and
                values must abide by. A single string is wrapped into a one-element set.
            non_nullable (Optional[bool]): If true, this column will enforce a constraint that all
                values in the column ought to be non null values.
            unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the
                column values.
            ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true,
                the constraint will only evaluate non-null data. Ignore_missing_vals and non_nullable
                cannot both be True.
            is_required (Optional[bool]): Flag indicating the optional/required presence of the column.
                If the column exists the validate function will validate the column. Default to True.
        """
        # Accept a bare dtype name as shorthand for a single-element set.
        if isinstance(of_types, str):
            of_types = {of_types}

        column_name = check.str_param(name, "name")

        dtype_constraint = ColumnDTypeInSetConstraint(of_types)
        category_constraint = CategoricalColumnConstraint(
            categories, ignore_missing_vals=ignore_missing_vals
        )
        # Additional constraints derived from the non_nullable / unique flags.
        keyword_constraints = _construct_keyword_constraints(
            non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals
        )

        return PandasColumn(
            name=column_name,
            constraints=[dtype_constraint, category_constraint] + keyword_constraints,
            is_required=is_required,
        )
Beispiel #13
0
def test_column_type_constraint():
    """A dtype-in-set constraint returns None on a match and raises on a mismatch."""
    frame = DataFrame({'foo': ['baz']})

    # The 'foo' column is expected to satisfy a constraint that allows 'object'.
    object_only = ColumnDTypeInSetConstraint({'object'})
    outcome = object_only.validate(frame, 'foo')
    assert outcome is None

    # The same column must be rejected when only 'int64' is allowed.
    with pytest.raises(ConstraintViolationException):
        int_only = ColumnDTypeInSetConstraint({'int64'})
        int_only.validate(frame, 'foo')
Beispiel #14
0
def test_missing_column_validation_with_optional_column():
    """A column declared with is_required=False may be absent from the dataframe."""
    # "qux" is declared optional and is deliberately not present in the frame below.
    optional_qux = PandasColumn(
        name="qux",
        constraints=[ColumnDTypeInSetConstraint({"object"})],
        is_required=False,
    )
    frame = DataFrame({"foo": ["bar", "baz"]})

    outcome = validate_constraints(frame, pandas_columns=[optional_qux])
    assert outcome is None


@pytest.mark.parametrize(
    "column_constraints, dataframe",
    [
        # Case 1: column "foo" exists, but the constraint only allows "int64".
        (
            [
                PandasColumn(
                    name="foo",
                    constraints=[ColumnDTypeInSetConstraint({"int64"})],
                ),
            ],
            DataFrame({"foo": ["bar", "baz"]}),
        ),
        # Case 2: the constrained column "foo" is not in the frame at all.
        (
            [
                PandasColumn(
                    name="foo",
                    constraints=[ColumnDTypeInSetConstraint({"object"})],
                ),
            ],
            DataFrame({"bar": ["bar", "baz"]}),
        ),
    ],
)
def test_validate_constraints_throw_error(column_constraints, dataframe):
    """Each parametrized column/dataframe pair must fail validation."""
    violation = ConstraintViolationException
    with pytest.raises(violation):
        validate_constraints(dataframe, pandas_columns=column_constraints)


def test_shape_validation_ok():
    assert (
Beispiel #15
0
                     constraints=[ColumnDTypeInSetConstraint({'object'})],
                     is_required=False),
    ]
    dataframe = DataFrame({'foo': ['bar', 'baz']})
    assert validate_constraints(dataframe,
                                pandas_columns=column_constraints) is None


@pytest.mark.parametrize(
    'column_constraints, dataframe',
    [
        (
            [
                PandasColumn(
                    name='foo',
                    constraints=[ColumnDTypeInSetConstraint({'int64'})])
            ],
            DataFrame({'foo': ['bar', 'baz']}),
        ),
        (
            [
                PandasColumn(
                    name='foo',
                    constraints=[ColumnDTypeInSetConstraint({'object'})])
            ],
            DataFrame({'bar': ['bar', 'baz']}),
        ),
    ],
)
def test_validate_constraints_throw_error(column_constraints, dataframe):
    with pytest.raises(ConstraintViolationException):
Beispiel #16
0
def test_validate_constraints_ok():
    """validate_constraints returns None when the declared column constraint is met."""
    foo_column = PandasColumn(
        name="foo",
        constraints=[ColumnDTypeInSetConstraint({"object"})],
    )
    frame = DataFrame({"foo": ["bar", "baz"]})

    outcome = validate_constraints(frame, pandas_columns=[foo_column])
    assert outcome is None
        if not rows_with_unexpected_buckets.empty:
            raise ColumnConstraintViolationException(
                constraint_name=self.name,
                constraint_description=self.error_description,
                column_name=column_name,
                offending_rows=rows_with_unexpected_buckets,
            )


# Dataframe type with a single "amount_paid" column that carries an "int64" dtype
# constraint plus the custom DivisibleByFiveConstraint.
CustomTripDataFrame = create_dagster_pandas_dataframe_type(
    columns=[
        PandasColumn(
            "amount_paid",
            constraints=[ColumnDTypeInSetConstraint({"int64"}), DivisibleByFiveConstraint()],
        ),
    ],
    name="CustomTripDataFrame",
)
# end_custom_col


@solid(
    output_defs=[
        OutputDefinition(name="custom_trip_dataframe",
                         dagster_type=CustomTripDataFrame)
    ], )
def load_custom_trip_dataframe(_) -> DataFrame:
    return read_csv(
        rows_with_unexpected_buckets = dataframe[dataframe[column_name].apply(lambda x: x % 5 != 0)]
        if not rows_with_unexpected_buckets.empty:
            raise ColumnConstraintViolationException(
                constraint_name=self.name,
                constraint_description=self.error_description,
                column_name=column_name,
                offending_rows=rows_with_unexpected_buckets,
            )


# Dataframe type with a single 'amount_paid' column that carries an 'int64' dtype
# constraint plus the custom DivisibleByFiveConstraint.
CustomTripDataFrame = create_dagster_pandas_dataframe_type(
    columns=[
        PandasColumn(
            'amount_paid',
            constraints=[
                ColumnDTypeInSetConstraint({'int64'}),
                DivisibleByFiveConstraint(),
            ],
        ),
    ],
    name='CustomTripDataFrame',
)


# Solid with one output, 'custom_trip_dataframe', typed as CustomTripDataFrame.
@solid(
    output_defs=[
        OutputDefinition(
            name='custom_trip_dataframe',
            dagster_type=CustomTripDataFrame,
        ),
    ],
)
def load_custom_trip_dataframe(_) -> DataFrame:
    # Reads ./ebike_trips.csv (resolved through script_relative_path) and parses the
    # 'start_time' / 'end_time' columns with an explicit timestamp format.
    def parse_timestamp(raw):
        return datetime.strptime(raw, '%Y-%m-%d %H:%M:%S.%f')

    csv_path = script_relative_path('./ebike_trips.csv')
    timestamp_columns = ['start_time', 'end_time']
    return read_csv(
        csv_path,
        parse_dates=timestamp_columns,
        date_parser=parse_timestamp,
    )