Esempio n. 1
0
def test_column_type_constraint():
    """Exercise ColumnTypeConstraint.validate on a one-column frame.

    Validating column 'foo' with the 'object' constraint must return None,
    while doing the same with the 'int64' constraint must raise
    ConstraintViolationException.
    """
    frame = DataFrame({'foo': ['baz']})

    matching_constraint = ColumnTypeConstraint('object')
    outcome = matching_constraint.validate(frame, 'foo')
    assert outcome is None

    # The mismatching constraint is built inside the context manager so an
    # exception from construction or from validation both satisfy the check.
    with pytest.raises(ConstraintViolationException):
        mismatching_constraint = ColumnTypeConstraint('int64')
        mismatching_constraint.validate(frame, 'foo')
Esempio n. 2
0
def test_validate_constraints_ok():
    """validate_constraints returns None for a frame whose 'foo' column passes."""
    foo_column = PandasColumn(
        name='foo',
        constraints=[ColumnTypeConstraint('object')],
    )
    frame = DataFrame({'foo': ['bar', 'baz']})

    outcome = validate_constraints(frame, pandas_columns=[foo_column])
    assert outcome is None
Esempio n. 3
0
 def boolean_column(name, non_nullable=False, unique=False):
     """Build a PandasColumn carrying a ``'bool'`` ColumnTypeConstraint.

     Args:
         name: Column name; passed through ``check.str_param``.
         non_nullable: Forwarded to ``_construct_keyword_constraints``.
         unique: Forwarded to ``_construct_keyword_constraints``.

     Returns:
         A PandasColumn whose constraints are the type constraint followed
         by whatever ``_construct_keyword_constraints`` returns.
     """
     column_name = check.str_param(name, 'name')
     type_constraints = [ColumnTypeConstraint('bool')]
     keyword_constraints = _construct_keyword_constraints(
         non_nullable=non_nullable, unique=unique)
     return PandasColumn(
         name=column_name,
         constraints=type_constraints + keyword_constraints,
     )
Esempio n. 4
0
 def string_column(cls, name, exists=False, unique=False):
     """Build a ``cls`` instance carrying an ``'object'`` ColumnTypeConstraint.

     Args:
         name: Column name; passed through ``check.str_param``.
         exists: Forwarded to ``cls.add_configurable_constraints``.
         unique: Forwarded to ``cls.add_configurable_constraints``.

     Returns:
         ``cls(name=..., constraints=...)`` where the constraints are the
         result of ``cls.add_configurable_constraints`` applied to the
         single type constraint.
     """
     column_name = check.str_param(name, 'name')
     base_constraints = [ColumnTypeConstraint('object')]
     all_constraints = cls.add_configurable_constraints(
         base_constraints, exists=exists, unique=unique)
     return cls(name=column_name, constraints=all_constraints)
Esempio n. 5
0
 def boolean_column(cls, name, exists=False, unique=False):
     """Build a ``cls`` instance carrying a ``'bool'`` ColumnTypeConstraint.

     Args:
         name: Column name; passed through ``check.str_param``.
         exists: Forwarded to ``cls.add_configurable_constraints``.
         unique: Forwarded to ``cls.add_configurable_constraints``.

     Returns:
         ``cls(name=..., constraints=...)`` where the constraints are the
         result of ``cls.add_configurable_constraints`` applied to the
         single type constraint.
     """
     column_name = check.str_param(name, 'name')
     base_constraints = [ColumnTypeConstraint('bool')]
     all_constraints = cls.add_configurable_constraints(
         base_constraints, exists=exists, unique=unique)
     return cls(name=column_name, constraints=all_constraints)
def test_create_pandas_dataframe_dagster_type():
    """The object returned by the dataframe-type factory is a RuntimeType."""
    foo_column = PandasColumn(
        name='foo',
        constraints=[ColumnTypeConstraint('int64')],
    )
    created_type = create_dagster_pandas_dataframe_type(
        name='TestDataFrame',
        columns=[foo_column],
    )
    assert isinstance(created_type, RuntimeType)
def test_dataframe_description_generation_just_type_constraint():
    """Check the generated description for a column with only a type constraint."""
    foo_column = PandasColumn(
        name='foo',
        constraints=[ColumnTypeConstraint('int64')],
    )
    created_type = create_dagster_pandas_dataframe_type(
        name='TestDataFrame',
        columns=[foo_column],
    )

    expected_description = "\n### Columns\n**foo**: `int64`\n\n"
    assert created_type.description == expected_description
Esempio n. 8
0
 def categorical_column(
         name,
         categories,
         of_types='object',
         non_nullable=False,
         unique=False,
 ):
     """Build a PandasColumn with type and categorical constraints.

     Args:
         name: Column name; passed through ``check.str_param``.
         categories: Passed to ``CategoricalColumnConstraint``.
         of_types: Passed to ``ColumnTypeConstraint``; defaults to
             ``'object'``.
         non_nullable: Forwarded to ``_construct_keyword_constraints``.
         unique: Forwarded to ``_construct_keyword_constraints``.

     Returns:
         A PandasColumn whose constraints are the type constraint, the
         categorical constraint, then whatever
         ``_construct_keyword_constraints`` returns.
     """
     column_name = check.str_param(name, 'name')
     type_constraint = ColumnTypeConstraint(of_types)
     category_constraint = CategoricalColumnConstraint(categories)
     keyword_constraints = _construct_keyword_constraints(
         non_nullable=non_nullable, unique=unique)
     return PandasColumn(
         name=column_name,
         constraints=[type_constraint, category_constraint] + keyword_constraints,
     )
Esempio n. 9
0
 def datetime_column(
         name,
         min_datetime=Timestamp.min,
         max_datetime=Timestamp.max,
         non_nullable=False,
         unique=False,
 ):
     """Build a PandasColumn with a datetime type and range constraint.

     Args:
         name: Column name; passed through ``check.str_param``.
         min_datetime: First argument to ``InRangeColumnConstraint``;
             defaults to ``Timestamp.min``.
         max_datetime: Second argument to ``InRangeColumnConstraint``;
             defaults to ``Timestamp.max``.
         non_nullable: Forwarded to ``_construct_keyword_constraints``.
         unique: Forwarded to ``_construct_keyword_constraints``.

     Returns:
         A PandasColumn whose constraints are
         ``ColumnTypeConstraint({'datetime64[ns]'})``, the range constraint,
         then whatever ``_construct_keyword_constraints`` returns.
     """
     column_name = check.str_param(name, 'name')
     type_constraint = ColumnTypeConstraint({'datetime64[ns]'})
     range_constraint = InRangeColumnConstraint(min_datetime, max_datetime)
     keyword_constraints = _construct_keyword_constraints(
         non_nullable=non_nullable, unique=unique)
     return PandasColumn(
         name=column_name,
         constraints=[type_constraint, range_constraint] + keyword_constraints,
     )
Esempio n. 10
0
 def categorical_column(
         cls,
         name,
         categories,
         of_types='object',
         exists=False,
         unique=False,
 ):
     """Build a ``cls`` instance with type and categorical constraints.

     Args:
         name: Column name; passed through ``check.str_param``.
         categories: Passed to ``CategoricalColumnConstraint``.
         of_types: Passed to ``ColumnTypeConstraint``; defaults to
             ``'object'``.
         exists: Forwarded to ``cls.add_configurable_constraints``.
         unique: Forwarded to ``cls.add_configurable_constraints``.

     Returns:
         ``cls(name=..., constraints=...)`` where the constraints are the
         result of ``cls.add_configurable_constraints`` applied to the type
         and categorical constraints.
     """
     column_name = check.str_param(name, 'name')
     base_constraints = [
         ColumnTypeConstraint(of_types),
         CategoricalColumnConstraint(categories),
     ]
     all_constraints = cls.add_configurable_constraints(
         base_constraints, exists=exists, unique=unique)
     return cls(name=column_name, constraints=all_constraints)
def test_dataframe_description_generation_multi_constraints():
    """Check the generated description for a column with three constraints."""
    foo_constraints = [
        ColumnTypeConstraint('int64'),
        InRangeColumnConstraint(0, 100),
        NonNullableColumnConstraint(),
    ]
    foo_column = PandasColumn(name='foo', constraints=foo_constraints)
    created_type = create_dagster_pandas_dataframe_type(
        name='TestDataFrame',
        columns=[foo_column],
    )

    expected_description = (
        "\n### Columns\n**foo**: `int64`\n+ 0 < values < 100\n+ No Null values allowed.\n\n"
    )
    assert created_type.description == expected_description
Esempio n. 12
0
 def datetime_column(
         cls,
         name,
         min_datetime=Timestamp.min,
         max_datetime=Timestamp.max,
         exists=False,
         unique=False,
 ):
     """Build a ``cls`` instance with a datetime type and range constraint.

     Args:
         name: Column name; passed through ``check.str_param``.
         min_datetime: First argument to ``InRangeColumnConstraint``;
             defaults to ``Timestamp.min``.
         max_datetime: Second argument to ``InRangeColumnConstraint``;
             defaults to ``Timestamp.max``.
         exists: Forwarded to ``cls.add_configurable_constraints``.
         unique: Forwarded to ``cls.add_configurable_constraints``.

     Returns:
         ``cls(name=..., constraints=...)`` where the constraints are the
         result of ``cls.add_configurable_constraints`` applied to
         ``ColumnTypeConstraint({'datetime64[ns]'})`` and the range
         constraint.
     """
     column_name = check.str_param(name, 'name')
     base_constraints = [
         ColumnTypeConstraint({'datetime64[ns]'}),
         InRangeColumnConstraint(min_datetime, max_datetime),
     ]
     all_constraints = cls.add_configurable_constraints(
         base_constraints, exists=exists, unique=unique)
     return cls(name=column_name, constraints=all_constraints)
Esempio n. 13
0
 def numeric_column(
         name,
         expected_dtypes,
         min_value=-float('inf'),
         max_value=float('inf'),
         non_nullable=False,
         unique=False,
 ):
     """Build a PandasColumn with a dtype constraint and a numeric range.

     Args:
         name: Column name; passed through ``check.str_param``.
         expected_dtypes: Passed to ``ColumnTypeConstraint``.
         min_value: Lower bound, passed through ``check.numeric_param``;
             defaults to negative infinity.
         max_value: Upper bound, passed through ``check.numeric_param``;
             defaults to positive infinity.
         non_nullable: Forwarded to ``_construct_keyword_constraints``.
         unique: Forwarded to ``_construct_keyword_constraints``.

     Returns:
         A PandasColumn whose constraints are the dtype constraint, the
         range constraint, then whatever ``_construct_keyword_constraints``
         returns.
     """
     column_name = check.str_param(name, 'name')
     dtype_constraint = ColumnTypeConstraint(expected_dtypes)
     lower_bound = check.numeric_param(min_value, 'min_value')
     upper_bound = check.numeric_param(max_value, 'max_value')
     range_constraint = InRangeColumnConstraint(lower_bound, upper_bound)
     keyword_constraints = _construct_keyword_constraints(
         non_nullable=non_nullable, unique=unique)
     return PandasColumn(
         name=column_name,
         constraints=[dtype_constraint, range_constraint] + keyword_constraints,
     )
Esempio n. 14
0
 def numeric_column(
         cls,
         name,
         expected_dtypes,
         min_value=-float('inf'),
         max_value=float('inf'),
         exists=False,
         unique=False,
 ):
     """Build a ``cls`` instance with a dtype constraint and a numeric range.

     Args:
         name: Column name; passed through ``check.str_param``.
         expected_dtypes: Passed to ``ColumnTypeConstraint``.
         min_value: Lower bound, passed through ``check.numeric_param``;
             defaults to negative infinity.
         max_value: Upper bound, passed through ``check.numeric_param``;
             defaults to positive infinity.
         exists: Forwarded to ``cls.add_configurable_constraints``.
         unique: Forwarded to ``cls.add_configurable_constraints``.

     Returns:
         ``cls(name=..., constraints=...)`` where the constraints are the
         result of ``cls.add_configurable_constraints`` applied to the dtype
         and range constraints.
     """
     column_name = check.str_param(name, 'name')
     dtype_constraint = ColumnTypeConstraint(expected_dtypes)
     lower_bound = check.numeric_param(min_value, 'min_value')
     upper_bound = check.numeric_param(max_value, 'max_value')
     range_constraint = InRangeColumnConstraint(lower_bound, upper_bound)
     all_constraints = cls.add_configurable_constraints(
         [dtype_constraint, range_constraint], exists=exists, unique=unique)
     return cls(name=column_name, constraints=all_constraints)
Esempio n. 15
0
def test_validate_collection_schema_ok():
    """validate_collection_schema returns None when the 'foo' column passes."""
    foo_column = PandasColumn(
        name='foo',
        constraints=[ColumnTypeConstraint('object')],
    )
    frame = DataFrame({'foo': ['bar', 'baz']})

    outcome = validate_collection_schema([foo_column], frame)
    assert outcome is None
Esempio n. 16
0
def test_validate_collection_schema_ok():
    """A frame whose 'foo' column passes the schema validates to None."""
    schema = [
        PandasColumn(
            name='foo',
            constraints=[ColumnTypeConstraint('object')],
        ),
    ]
    frame = DataFrame({'foo': ['bar', 'baz']})

    validation_result = validate_collection_schema(schema, frame)
    assert validation_result is None


@pytest.mark.parametrize(
    'collection_schema, dataframe',
    [
        # Schema attaches an 'int64' type constraint to 'foo', while the
        # frame's 'foo' column is built from strings.
        (
            [PandasColumn(name='foo', constraints=[ColumnTypeConstraint('int64')])],
            DataFrame({'foo': ['bar', 'baz']}),
        ),
        # Schema names column 'foo', while the frame only has column 'bar'.
        (
            [PandasColumn(name='foo', constraints=[ColumnTypeConstraint('object')])],
            DataFrame({'bar': ['bar', 'baz']}),
        ),
    ],
)
def test_validate_collection_schema_throw_error(collection_schema, dataframe):
    """Each parametrized schema/frame pair must raise ConstraintViolationException."""
    expect_violation = pytest.raises(ConstraintViolationException)
    with expect_violation:
        validate_collection_schema(collection_schema, dataframe)
Esempio n. 17
0
            lambda x: x % 5 != 0)]
        if not rows_with_unexpected_buckets.empty:
            raise ColumnConstraintViolationException(
                constraint_name=self.name,
                constraint_description=self.error_description,
                column_name=column_name,
                offending_rows=rows_with_unexpected_buckets,
            )


# Dataframe type named 'CustomTripDataFrame' declaring a single column,
# 'amount_paid', that carries an 'int64' ColumnTypeConstraint and a
# DivisibleByFiveConstraint.
CustomTripDataFrame = create_dagster_pandas_dataframe_type(
    name='CustomTripDataFrame',
    columns=[
        PandasColumn(
            name='amount_paid',
            constraints=[
                ColumnTypeConstraint('int64'),
                DivisibleByFiveConstraint(),
            ],
        ),
    ],
)


@solid(
    output_defs=[
        OutputDefinition(name='custom_trip_dataframe',
                         dagster_type=CustomTripDataFrame)
    ], )
def load_custom_trip_dataframe(_) -> DataFrame:
    return read_csv(
        script_relative_path('./ebike_trips.csv'),
        parse_dates=['start_time', 'end_time'],