def test_in_range_value_column_constraint():
    """Check InRangeColumnConstraint against a column holding 1, 1, 2, 3.

    Bounds (1, 4) must validate cleanly (``validate`` returns None), while
    bounds (2, 3) must raise ConstraintViolationException.
    """
    frame = DataFrame({'foo': [1, 1, 2, 3]})

    wide_enough = InRangeColumnConstraint(1, 4, ignore_missing_vals=False)
    outcome = wide_enough.validate(frame, 'foo')
    assert outcome is None

    with pytest.raises(ConstraintViolationException):
        # Built inside the context manager, as in the original, so any
        # exception from construction is treated the same way.
        too_narrow = InRangeColumnConstraint(2, 3, ignore_missing_vals=False)
        too_narrow.validate(frame, 'foo')
def test_in_range_value_column_constraint_ignore_nan():
    """Repeat the in-range checks with one entry of NAN_VALUES appended to the column.

    With ``ignore_missing_vals=True``, bounds (1, 4) must still validate
    (``validate`` returns None) and bounds (2, 3) must still raise
    ConstraintViolationException, for every value in NAN_VALUES.
    """
    for missing_value in NAN_VALUES:
        frame = DataFrame({'foo': [1, 1, 2, 3, missing_value]})

        wide_enough = InRangeColumnConstraint(1, 4, ignore_missing_vals=True)
        outcome = wide_enough.validate(frame, 'foo')
        assert outcome is None

        with pytest.raises(ConstraintViolationException):
            too_narrow = InRangeColumnConstraint(2, 3, ignore_missing_vals=True)
            too_narrow.validate(frame, 'foo')
def datetime_column(
    name,
    min_datetime=Timestamp.min,
    max_datetime=Timestamp.max,
    non_nullable=False,
    unique=False,
    ignore_missing_vals=False,
    is_required=None,
    tz=None,
):
    """
    Simple constructor for PandasColumns that expresses datetime constraints on 'datetime64[ns]' dtypes.

    Args:
        name (str): Name of the column. This must match up with the column name in the dataframe you
            expect to receive.
        min_datetime (Optional[Timestamp]): The lower bound for values you expect in this column.
            Defaults to pandas.Timestamp.min.
        max_datetime (Optional[Timestamp]): The upper bound for values you expect in this column.
            Defaults to pandas.Timestamp.max.
        non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values
            in the column ought to be non null values.
        unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the
            column values.
        ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true,
            the constraint will only evaluate non-null data. Ignore_missing_vals and non_nullable
            cannot both be True.
        is_required (Optional[bool]): Flag indicating the optional/required presence of the column.
            Passed through to PandasColumn unchanged. Defaults to None; previously documented as
            defaulting to True, so None is presumably treated as required -- TODO confirm.
        tz (Optional[str]): Required timezone for values eg: tz='UTC', tz='Europe/Dublin',
            tz='US/Eastern'. Defaults to None, meaning naive datetime values.

    Returns:
        PandasColumn: A column carrying a dtype constraint, an in-range constraint and whatever
        _construct_keyword_constraints yields for the given flags.
    """
    if tz is None:
        # Naive column: bounds are passed through untouched.
        datetime_constraint = ColumnDTypeInSetConstraint({"datetime64[ns]"})
    else:
        datetime_constraint = ColumnDTypeInSetConstraint({f"datetime64[ns, {tz}]"})

        # One day more/less than absolute min/max to prevent OutOfBoundsDatetime errors
        # when converting min/max to be tz aware.
        # The bound is wrapped in Timestamp(...) first, like the tz checks below, so that
        # a bound given as a string does not break on `.replace(tzinfo=None)`.
        if Timestamp(min_datetime).replace(tzinfo=None) == Timestamp.min:
            min_datetime = Timestamp("1677-09-22 00:12:43.145225Z")
        if Timestamp(max_datetime).replace(tzinfo=None) == Timestamp.max:
            max_datetime = Timestamp("2262-04-10 23:47:16.854775807Z")

        # Convert bounds to same tz: naive bounds are localized to the column's timezone,
        # bounds that already carry a timezone are left as given.
        if Timestamp(min_datetime).tz is None:
            min_datetime = Timestamp(min_datetime).tz_localize(tz)
        if Timestamp(max_datetime).tz is None:
            max_datetime = Timestamp(max_datetime).tz_localize(tz)

    return PandasColumn(
        name=check.str_param(name, "name"),
        constraints=[
            datetime_constraint,
            InRangeColumnConstraint(
                min_datetime, max_datetime, ignore_missing_vals=ignore_missing_vals
            ),
        ]
        + _construct_keyword_constraints(
            non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals
        ),
        is_required=is_required,
    )
def datetime_column(
    name,
    min_datetime=Timestamp.min,
    max_datetime=Timestamp.max,
    non_nullable=False,
    unique=False,
):
    """Build a PandasColumn constrained to the 'datetime64[ns]' dtype and a datetime range.

    Args:
        name (str): Column name; validated with ``check.str_param``.
        min_datetime: Lower bound handed to InRangeColumnConstraint.
            Defaults to ``Timestamp.min``.
        max_datetime: Upper bound handed to InRangeColumnConstraint.
            Defaults to ``Timestamp.max``.
        non_nullable (bool): Forwarded to ``_construct_keyword_constraints``.
        unique (bool): Forwarded to ``_construct_keyword_constraints``.

    Returns:
        PandasColumn: Column with the dtype and range constraints followed by
        the keyword-derived constraints.
    """
    column_name = check.str_param(name, 'name')

    base_constraints = [
        ColumnTypeConstraint({'datetime64[ns]'}),
        InRangeColumnConstraint(min_datetime, max_datetime),
    ]
    keyword_constraints = _construct_keyword_constraints(
        non_nullable=non_nullable, unique=unique
    )

    return PandasColumn(
        name=column_name,
        constraints=base_constraints + keyword_constraints,
    )
def test_dataframe_description_generation_multi_constraints():
    """Check the generated description for a column carrying three constraints.

    The 'foo' column has a dtype-in-set constraint, an in-range constraint and
    a non-nullable constraint; the dataframe type's ``description`` must match
    the expected markdown exactly.
    """
    foo_column = PandasColumn(
        name="foo",
        constraints=[
            ColumnDTypeInSetConstraint({"int64"}),
            InRangeColumnConstraint(0, 100, ignore_missing_vals=False),
            NonNullableColumnConstraint(),
        ],
    )
    dataframe_type = create_dagster_pandas_dataframe_type(
        name="TestDataFrame",
        columns=[foo_column],
    )

    expected_description = (
        "\n### Columns\n**foo**: `int64`\n+ 0 < values < 100\n+ No Null values allowed.\n\n"
    )
    assert dataframe_type.description == expected_description
def test_dataframe_description_generation_multi_constraints():
    """Check the generated description for a column carrying three constraints.

    The 'foo' column has a type constraint, an in-range constraint and a
    non-nullable constraint; the dataframe type's ``description`` must match
    the expected markdown exactly.
    """
    foo_column = PandasColumn(
        name='foo',
        constraints=[
            ColumnTypeConstraint('int64'),
            InRangeColumnConstraint(0, 100),
            NonNullableColumnConstraint(),
        ],
    )
    dataframe_type = create_dagster_pandas_dataframe_type(
        name='TestDataFrame',
        columns=[foo_column],
    )

    expected_description = (
        "\n### Columns\n**foo**: `int64`\n+ 0 < values < 100\n+ No Null values allowed.\n\n"
    )
    assert dataframe_type.description == expected_description
def numeric_column(
    name,
    expected_dtypes,
    min_value=-float('inf'),
    max_value=float('inf'),
    non_nullable=False,
    unique=False,
):
    """Build a PandasColumn with a dtype constraint and a numeric range constraint.

    Args:
        name (str): Column name; validated with ``check.str_param``.
        expected_dtypes: Passed as-is to ColumnTypeConstraint.
        min_value: Lower bound; validated with ``check.numeric_param``.
            Defaults to ``-float('inf')``.
        max_value: Upper bound; validated with ``check.numeric_param``.
            Defaults to ``float('inf')``.
        non_nullable (bool): Forwarded to ``_construct_keyword_constraints``.
        unique (bool): Forwarded to ``_construct_keyword_constraints``.

    Returns:
        PandasColumn: Column with the dtype and range constraints followed by
        the keyword-derived constraints.
    """
    column_name = check.str_param(name, 'name')

    # Evaluation order mirrors the original expression: dtype constraint first,
    # then the two bound checks, then the keyword constraints.
    dtype_constraint = ColumnTypeConstraint(expected_dtypes)
    lower_bound = check.numeric_param(min_value, 'min_value')
    upper_bound = check.numeric_param(max_value, 'max_value')
    range_constraint = InRangeColumnConstraint(lower_bound, upper_bound)
    keyword_constraints = _construct_keyword_constraints(
        non_nullable=non_nullable, unique=unique
    )

    return PandasColumn(
        name=column_name,
        constraints=[dtype_constraint, range_constraint] + keyword_constraints,
    )
def datetime_column(
    cls,
    name,
    min_datetime=Timestamp.min,
    max_datetime=Timestamp.max,
    exists=False,
    unique=False,
):
    """Alternate constructor: a ``cls`` instance constrained to 'datetime64[ns]' values in a range.

    NOTE(review): takes ``cls`` as its first argument, so this is presumably
    decorated as a classmethod outside the visible code -- confirm.

    Args:
        name (str): Column name; validated with ``check.str_param``.
        min_datetime: Lower bound handed to InRangeColumnConstraint.
            Defaults to ``Timestamp.min``.
        max_datetime: Upper bound handed to InRangeColumnConstraint.
            Defaults to ``Timestamp.max``.
        exists (bool): Forwarded to ``cls.add_configurable_constraints``.
        unique (bool): Forwarded to ``cls.add_configurable_constraints``.

    Returns:
        Whatever ``cls(name=..., constraints=...)`` produces.
    """
    column_name = check.str_param(name, 'name')

    base_constraints = [
        ColumnTypeConstraint({'datetime64[ns]'}),
        InRangeColumnConstraint(min_datetime, max_datetime),
    ]
    all_constraints = cls.add_configurable_constraints(
        base_constraints,
        exists=exists,
        unique=unique,
    )

    return cls(name=column_name, constraints=all_constraints)
def datetime_column(
    name,
    min_datetime=Timestamp.min,
    max_datetime=Timestamp.max,
    non_nullable=False,
    unique=False,
    ignore_missing_vals=False,
    is_required=None,
):
    """
    Convenience constructor for a PandasColumn holding 'datetime64[ns]' values within a range.

    Args:
        name (str): Name of the column. It has to match the column name in the dataframe you
            expect to receive.
        min_datetime (Optional[Timestamp]): Lower bound for the values expected in this column.
            Defaults to pandas.Timestamp.min.
        max_datetime (Optional[Timestamp]): Upper bound for the values expected in this column.
            Defaults to pandas.Timestamp.max.
        non_nullable (Optional[bool]): When true, the column enforces that all of its values are
            non null.
        unique (Optional[bool]): When true, the column enforces uniqueness of its values.
        ignore_missing_vals (Optional[bool]): Flag handed to most constraints. When true, a
            constraint only evaluates non-null data. Ignore_missing_vals and non_nullable cannot
            both be True.
        is_required (Optional[bool]): Flag for the optional/required presence of the column. If the
            column exists, the validate function validates it. Passed through to PandasColumn
            unchanged; defaults to None (previously documented as defaulting to True, so None is
            presumably treated as required -- TODO confirm).
    """
    column_name = check.str_param(name, "name")

    dtype_constraint = ColumnDTypeInSetConstraint({"datetime64[ns]"})
    range_constraint = InRangeColumnConstraint(
        min_datetime,
        max_datetime,
        ignore_missing_vals=ignore_missing_vals,
    )
    keyword_constraints = _construct_keyword_constraints(
        non_nullable=non_nullable,
        unique=unique,
        ignore_missing_vals=ignore_missing_vals,
    )

    return PandasColumn(
        name=column_name,
        constraints=[dtype_constraint, range_constraint] + keyword_constraints,
        is_required=is_required,
    )
def float_column(
    name,
    min_value=-float("inf"),
    max_value=float("inf"),
    non_nullable=False,
    unique=False,
    ignore_missing_vals=False,
    is_required=None,
):
    """
    Convenience constructor for a PandasColumn holding float values within a numeric range.

    Args:
        name (str): Name of the column. It has to match the column name in the dataframe you
            expect to receive.
        min_value (Optional[Union[int,float]]): Lower bound for the values expected in this column.
            Defaults to -float('inf')
        max_value (Optional[Union[int,float]]): Upper bound for the values expected in this column.
            Defaults to float('inf')
        non_nullable (Optional[bool]): When true, the column enforces that all of its values are
            non null.
        unique (Optional[bool]): When true, the column enforces uniqueness of its values.
        ignore_missing_vals (Optional[bool]): Flag handed to most constraints. When true, a
            constraint only evaluates non-null data. Ignore_missing_vals and non_nullable cannot
            both be True.
        is_required (Optional[bool]): Flag for the optional/required presence of the column. If the
            column exists, the validate function validates it. Passed through to PandasColumn
            unchanged; defaults to None (previously documented as defaulting to True, so None is
            presumably treated as required -- TODO confirm).
    """
    column_name = check.str_param(name, "name")

    # Evaluation order mirrors the original expression: dtype constraint first,
    # then the two bound checks, then the keyword constraints.
    dtype_constraint = ColumnDTypeFnConstraint(is_float_dtype)
    lower_bound = check.numeric_param(min_value, "min_value")
    upper_bound = check.numeric_param(max_value, "max_value")
    range_constraint = InRangeColumnConstraint(
        lower_bound,
        upper_bound,
        ignore_missing_vals=ignore_missing_vals,
    )
    keyword_constraints = _construct_keyword_constraints(
        non_nullable=non_nullable,
        unique=unique,
        ignore_missing_vals=ignore_missing_vals,
    )

    return PandasColumn(
        name=column_name,
        constraints=[dtype_constraint, range_constraint] + keyword_constraints,
        is_required=is_required,
    )
def numeric_column(
    cls,
    name,
    expected_dtypes,
    min_value=-float('inf'),
    max_value=float('inf'),
    exists=False,
    unique=False,
):
    """Alternate constructor: a ``cls`` instance with a dtype constraint and a numeric range.

    NOTE(review): takes ``cls`` as its first argument, so this is presumably
    decorated as a classmethod outside the visible code -- confirm.

    Args:
        name (str): Column name; validated with ``check.str_param``.
        expected_dtypes: Passed as-is to ColumnTypeConstraint.
        min_value: Lower bound; validated with ``check.numeric_param``.
            Defaults to ``-float('inf')``.
        max_value: Upper bound; validated with ``check.numeric_param``.
            Defaults to ``float('inf')``.
        exists (bool): Forwarded to ``cls.add_configurable_constraints``.
        unique (bool): Forwarded to ``cls.add_configurable_constraints``.

    Returns:
        Whatever ``cls(name=..., constraints=...)`` produces.
    """
    column_name = check.str_param(name, 'name')

    # Same evaluation order as the original nested expression.
    dtype_constraint = ColumnTypeConstraint(expected_dtypes)
    lower_bound = check.numeric_param(min_value, 'min_value')
    upper_bound = check.numeric_param(max_value, 'max_value')
    range_constraint = InRangeColumnConstraint(lower_bound, upper_bound)

    all_constraints = cls.add_configurable_constraints(
        [dtype_constraint, range_constraint],
        exists=exists,
        unique=unique,
    )

    return cls(name=column_name, constraints=all_constraints)
def test_in_range_value_column_constraint():
    """Check InRangeColumnConstraint against a column holding 1, 1, 2, 3.

    Bounds (1, 4) must validate cleanly (``validate`` returns None), while
    bounds (2, 3) must raise ConstraintViolationException.
    """
    test_dataframe = DataFrame({'foo': [1, 1, 2, 3]})
    assert InRangeColumnConstraint(1, 4).validate(test_dataframe, 'foo') is None
    with pytest.raises(ConstraintViolationException):
        # No `assert` on this call: it is expected to raise, so an assert would
        # never be evaluated on the passing path, and on the failing path it
        # would replace pytest's "DID NOT RAISE" report with an AssertionError.
        InRangeColumnConstraint(2, 3).validate(test_dataframe, 'foo')