def test_column_type_constraint():
    """Check ColumnTypeConstraint.validate on a one-row string column.

    Validating against 'object' must return None; validating against
    'int64' must raise ConstraintViolationException.
    """
    frame = DataFrame({'foo': ['baz']})

    object_constraint = ColumnTypeConstraint('object')
    assert object_constraint.validate(frame, 'foo') is None

    # Construction stays inside the context manager, as in the original, so
    # an exception from either the constructor or validate() is captured.
    with pytest.raises(ConstraintViolationException):
        int_constraint = ColumnTypeConstraint('int64')
        int_constraint.validate(frame, 'foo')
def test_validate_constraints_ok():
    """validate_constraints returns None when column 'foo' passes its 'object' type constraint."""
    foo_column = PandasColumn(name='foo', constraints=[ColumnTypeConstraint('object')])
    frame = DataFrame({'foo': ['bar', 'baz']})

    outcome = validate_constraints(frame, pandas_columns=[foo_column])

    assert outcome is None
def boolean_column(name, non_nullable=False, unique=False):
    """Build a PandasColumn whose first constraint is ``ColumnTypeConstraint('bool')``.

    Args:
        name: Column name; passed through ``check.str_param``.
        non_nullable: Forwarded to ``_construct_keyword_constraints``.
        unique: Forwarded to ``_construct_keyword_constraints``.

    Returns:
        A PandasColumn with the type constraint followed by whatever
        ``_construct_keyword_constraints`` returns.
    """
    checked_name = check.str_param(name, 'name')
    type_constraints = [ColumnTypeConstraint('bool')]
    keyword_constraints = _construct_keyword_constraints(
        non_nullable=non_nullable,
        unique=unique,
    )
    return PandasColumn(
        name=checked_name,
        constraints=type_constraints + keyword_constraints,
    )
def string_column(cls, name, exists=False, unique=False):
    """Build a ``cls`` instance seeded with ``ColumnTypeConstraint('object')``.

    Args:
        cls: Class to instantiate; must provide ``add_configurable_constraints``.
        name: Column name; passed through ``check.str_param``.
        exists: Forwarded to ``cls.add_configurable_constraints``.
        unique: Forwarded to ``cls.add_configurable_constraints``.

    Returns:
        ``cls(name=..., constraints=...)`` with the constraints produced by
        ``cls.add_configurable_constraints``.
    """
    checked_name = check.str_param(name, 'name')
    column_constraints = cls.add_configurable_constraints(
        [ColumnTypeConstraint('object')],
        exists=exists,
        unique=unique,
    )
    return cls(name=checked_name, constraints=column_constraints)
def boolean_column(cls, name, exists=False, unique=False):
    """Build a ``cls`` instance seeded with ``ColumnTypeConstraint('bool')``.

    Args:
        cls: Class to instantiate; must provide ``add_configurable_constraints``.
        name: Column name; passed through ``check.str_param``.
        exists: Forwarded to ``cls.add_configurable_constraints``.
        unique: Forwarded to ``cls.add_configurable_constraints``.

    Returns:
        ``cls(name=..., constraints=...)`` with the constraints produced by
        ``cls.add_configurable_constraints``.
    """
    checked_name = check.str_param(name, 'name')
    column_constraints = cls.add_configurable_constraints(
        [ColumnTypeConstraint('bool')],
        exists=exists,
        unique=unique,
    )
    return cls(name=checked_name, constraints=column_constraints)
def test_create_pandas_dataframe_dagster_type():
    """create_dagster_pandas_dataframe_type must return a RuntimeType instance."""
    foo_column = PandasColumn(name='foo', constraints=[ColumnTypeConstraint('int64')])

    TestDataFrame = create_dagster_pandas_dataframe_type(
        name='TestDataFrame',
        columns=[foo_column],
    )

    assert isinstance(TestDataFrame, RuntimeType)
def test_dataframe_description_generation_just_type_constraint():
    """The generated description for a single type-constrained column matches the expected markdown."""
    expected_description = "\n### Columns\n**foo**: `int64`\n\n"
    foo_column = PandasColumn(name='foo', constraints=[ColumnTypeConstraint('int64')])

    TestDataFrame = create_dagster_pandas_dataframe_type(
        name='TestDataFrame',
        columns=[foo_column],
    )

    assert TestDataFrame.description == expected_description
def categorical_column(name, categories, of_types='object', non_nullable=False, unique=False):
    """Build a PandasColumn with a type constraint and a categorical constraint.

    Args:
        name: Column name; passed through ``check.str_param``.
        categories: Passed to ``CategoricalColumnConstraint``.
        of_types: Passed to ``ColumnTypeConstraint``; defaults to ``'object'``.
        non_nullable: Forwarded to ``_construct_keyword_constraints``.
        unique: Forwarded to ``_construct_keyword_constraints``.

    Returns:
        A PandasColumn whose constraints are the type constraint, the
        categorical constraint, then whatever
        ``_construct_keyword_constraints`` returns.
    """
    checked_name = check.str_param(name, 'name')
    type_constraint = ColumnTypeConstraint(of_types)
    category_constraint = CategoricalColumnConstraint(categories)
    keyword_constraints = _construct_keyword_constraints(
        non_nullable=non_nullable,
        unique=unique,
    )
    return PandasColumn(
        name=checked_name,
        constraints=[type_constraint, category_constraint] + keyword_constraints,
    )
def datetime_column(
    name,
    min_datetime=Timestamp.min,
    max_datetime=Timestamp.max,
    non_nullable=False,
    unique=False,
):
    """Build a PandasColumn with a ``datetime64[ns]`` type constraint and a range constraint.

    Args:
        name: Column name; passed through ``check.str_param``.
        min_datetime: First argument to ``InRangeColumnConstraint``;
            defaults to ``Timestamp.min``.
        max_datetime: Second argument to ``InRangeColumnConstraint``;
            defaults to ``Timestamp.max``.
        non_nullable: Forwarded to ``_construct_keyword_constraints``.
        unique: Forwarded to ``_construct_keyword_constraints``.

    Returns:
        A PandasColumn whose constraints are the type constraint, the range
        constraint, then whatever ``_construct_keyword_constraints`` returns.
    """
    checked_name = check.str_param(name, 'name')
    type_constraint = ColumnTypeConstraint({'datetime64[ns]'})
    range_constraint = InRangeColumnConstraint(min_datetime, max_datetime)
    keyword_constraints = _construct_keyword_constraints(
        non_nullable=non_nullable,
        unique=unique,
    )
    return PandasColumn(
        name=checked_name,
        constraints=[type_constraint, range_constraint] + keyword_constraints,
    )
def categorical_column(cls, name, categories, of_types='object', exists=False, unique=False):
    """Build a ``cls`` instance with a type constraint and a categorical constraint.

    Args:
        cls: Class to instantiate; must provide ``add_configurable_constraints``.
        name: Column name; passed through ``check.str_param``.
        categories: Passed to ``CategoricalColumnConstraint``.
        of_types: Passed to ``ColumnTypeConstraint``; defaults to ``'object'``.
        exists: Forwarded to ``cls.add_configurable_constraints``.
        unique: Forwarded to ``cls.add_configurable_constraints``.

    Returns:
        ``cls(name=..., constraints=...)`` with the constraints produced by
        ``cls.add_configurable_constraints``.
    """
    checked_name = check.str_param(name, 'name')
    # Resolve the helper before building its arguments to keep the original
    # evaluation order.
    add_constraints = cls.add_configurable_constraints
    type_constraint = ColumnTypeConstraint(of_types)
    category_constraint = CategoricalColumnConstraint(categories)
    column_constraints = add_constraints(
        [type_constraint, category_constraint],
        exists=exists,
        unique=unique,
    )
    return cls(name=checked_name, constraints=column_constraints)
def test_dataframe_description_generation_multi_constraints():
    """The generated description lists the type plus one bullet per additional constraint."""
    expected_description = (
        "\n### Columns\n**foo**: `int64`\n+ 0 < values < 100\n+ No Null values allowed.\n\n"
    )
    foo_constraints = [
        ColumnTypeConstraint('int64'),
        InRangeColumnConstraint(0, 100),
        NonNullableColumnConstraint(),
    ]
    foo_column = PandasColumn(name='foo', constraints=foo_constraints)

    TestDataFrame = create_dagster_pandas_dataframe_type(
        name='TestDataFrame',
        columns=[foo_column],
    )

    assert TestDataFrame.description == expected_description
def datetime_column(
    cls,
    name,
    min_datetime=Timestamp.min,
    max_datetime=Timestamp.max,
    exists=False,
    unique=False,
):
    """Build a ``cls`` instance with a ``datetime64[ns]`` type constraint and a range constraint.

    Args:
        cls: Class to instantiate; must provide ``add_configurable_constraints``.
        name: Column name; passed through ``check.str_param``.
        min_datetime: First argument to ``InRangeColumnConstraint``;
            defaults to ``Timestamp.min``.
        max_datetime: Second argument to ``InRangeColumnConstraint``;
            defaults to ``Timestamp.max``.
        exists: Forwarded to ``cls.add_configurable_constraints``.
        unique: Forwarded to ``cls.add_configurable_constraints``.

    Returns:
        ``cls(name=..., constraints=...)`` with the constraints produced by
        ``cls.add_configurable_constraints``.
    """
    checked_name = check.str_param(name, 'name')
    # Resolve the helper before building its arguments to keep the original
    # evaluation order.
    add_constraints = cls.add_configurable_constraints
    type_constraint = ColumnTypeConstraint({'datetime64[ns]'})
    range_constraint = InRangeColumnConstraint(min_datetime, max_datetime)
    column_constraints = add_constraints(
        [type_constraint, range_constraint],
        exists=exists,
        unique=unique,
    )
    return cls(name=checked_name, constraints=column_constraints)
def numeric_column(
    name,
    expected_dtypes,
    min_value=-float('inf'),
    max_value=float('inf'),
    non_nullable=False,
    unique=False,
):
    """Build a PandasColumn with a dtype constraint and a numeric range constraint.

    Args:
        name: Column name; passed through ``check.str_param``.
        expected_dtypes: Passed to ``ColumnTypeConstraint``.
        min_value: Passed through ``check.numeric_param`` and used as the
            first argument to ``InRangeColumnConstraint``; defaults to
            negative infinity.
        max_value: Passed through ``check.numeric_param`` and used as the
            second argument to ``InRangeColumnConstraint``; defaults to
            positive infinity.
        non_nullable: Forwarded to ``_construct_keyword_constraints``.
        unique: Forwarded to ``_construct_keyword_constraints``.

    Returns:
        A PandasColumn whose constraints are the type constraint, the range
        constraint, then whatever ``_construct_keyword_constraints`` returns.
    """
    checked_name = check.str_param(name, 'name')
    type_constraint = ColumnTypeConstraint(expected_dtypes)
    lower_bound = check.numeric_param(min_value, 'min_value')
    upper_bound = check.numeric_param(max_value, 'max_value')
    range_constraint = InRangeColumnConstraint(lower_bound, upper_bound)
    keyword_constraints = _construct_keyword_constraints(
        non_nullable=non_nullable,
        unique=unique,
    )
    return PandasColumn(
        name=checked_name,
        constraints=[type_constraint, range_constraint] + keyword_constraints,
    )
def numeric_column(
    cls,
    name,
    expected_dtypes,
    min_value=-float('inf'),
    max_value=float('inf'),
    exists=False,
    unique=False,
):
    """Build a ``cls`` instance with a dtype constraint and a numeric range constraint.

    Args:
        cls: Class to instantiate; must provide ``add_configurable_constraints``.
        name: Column name; passed through ``check.str_param``.
        expected_dtypes: Passed to ``ColumnTypeConstraint``.
        min_value: Passed through ``check.numeric_param`` and used as the
            first argument to ``InRangeColumnConstraint``; defaults to
            negative infinity.
        max_value: Passed through ``check.numeric_param`` and used as the
            second argument to ``InRangeColumnConstraint``; defaults to
            positive infinity.
        exists: Forwarded to ``cls.add_configurable_constraints``.
        unique: Forwarded to ``cls.add_configurable_constraints``.

    Returns:
        ``cls(name=..., constraints=...)`` with the constraints produced by
        ``cls.add_configurable_constraints``.
    """
    checked_name = check.str_param(name, 'name')
    # Resolve the helper before building its arguments to keep the original
    # evaluation order.
    add_constraints = cls.add_configurable_constraints
    type_constraint = ColumnTypeConstraint(expected_dtypes)
    lower_bound = check.numeric_param(min_value, 'min_value')
    upper_bound = check.numeric_param(max_value, 'max_value')
    range_constraint = InRangeColumnConstraint(lower_bound, upper_bound)
    column_constraints = add_constraints(
        [type_constraint, range_constraint],
        exists=exists,
        unique=unique,
    )
    return cls(name=checked_name, constraints=column_constraints)
def test_validate_collection_schema_ok():
    """validate_collection_schema returns None when column 'foo' passes its 'object' type constraint."""
    foo_column = PandasColumn(name='foo', constraints=[ColumnTypeConstraint('object')])
    frame = DataFrame({'foo': ['bar', 'baz']})

    outcome = validate_collection_schema([foo_column], frame)

    assert outcome is None
def test_validate_collection_schema_ok():
    """validate_collection_schema returns None when column 'foo' passes its 'object' type constraint."""
    foo_column = PandasColumn(name='foo', constraints=[ColumnTypeConstraint('object')])
    frame = DataFrame({'foo': ['bar', 'baz']})

    outcome = validate_collection_schema([foo_column], frame)

    assert outcome is None


@pytest.mark.parametrize(
    'collection_schema, dataframe',
    [
        # Case 1: column 'foo' holds strings but the schema asks for 'int64'.
        (
            [PandasColumn(name='foo', constraints=[ColumnTypeConstraint('int64')])],
            DataFrame({'foo': ['bar', 'baz']}),
        ),
        # Case 2: the schema names column 'foo' but the frame only has 'bar'.
        (
            [PandasColumn(name='foo', constraints=[ColumnTypeConstraint('object')])],
            DataFrame({'bar': ['bar', 'baz']}),
        ),
    ],
)
def test_validate_collection_schema_throw_error(collection_schema, dataframe):
    """Each parametrized schema/frame pair must raise ConstraintViolationException."""
    with pytest.raises(ConstraintViolationException):
        validate_collection_schema(collection_schema, dataframe)
lambda x: x % 5 != 0)] if not rows_with_unexpected_buckets.empty: raise ColumnConstraintViolationException( constraint_name=self.name, constraint_description=self.error_description, column_name=column_name, offending_rows=rows_with_unexpected_buckets, ) CustomTripDataFrame = create_dagster_pandas_dataframe_type( name='CustomTripDataFrame', columns=[ PandasColumn('amount_paid', constraints=[ ColumnTypeConstraint('int64'), DivisibleByFiveConstraint() ]) ], ) @solid( output_defs=[ OutputDefinition(name='custom_trip_dataframe', dagster_type=CustomTripDataFrame) ], ) def load_custom_trip_dataframe(_) -> DataFrame: return read_csv( script_relative_path('./ebike_trips.csv'), parse_dates=['start_time', 'end_time'],