def test_is_valid_strict_true(self):
    """Test the ``GreaterThan.is_valid`` method with strict True.

    If strict is True, equal values should count as invalid.

    Input:
    - Table with a strictly valid row, a strictly invalid row and
      a row that has the same value for both high and low.
    Output:
    - True should be returned for the strictly valid row and False
      for the other two.
    """
    # Setup
    # The constraint must be strict: this is the scenario named by the test.
    instance = GreaterThan(low='a', high='b', strict=True)

    # Run
    table_data = pd.DataFrame({
        'a': [1, 2, 3],
        'b': [4, 2, 2],
        'c': [7, 8, 9],
    })
    out = instance.is_valid(table_data)

    # Assert
    # Row 0: 1 < 4 -> valid; row 1: 2 == 2 -> invalid when strict;
    # row 2: 3 > 2 -> invalid.
    expected_out = pd.Series([True, False, False])
    pd.testing.assert_series_equal(expected_out, out)
def test_transform_float(self):
    """Check ``GreaterThan.transform`` when the high column holds floats.

    The expected output replaces the high column with the logarithm of
    (high - low + 1) and leaves the remaining columns untouched.

    Input:
    - Table whose constrained columns are always exactly 3 apart, plus
      one unrelated dummy column.
    Output:
    - Same table with the high column equal to ``np.log(4)`` everywhere.
    """
    # Setup
    data = pd.DataFrame({
        'a': [1, 2, 3],
        'b': [4., 5., 6.],
        'c': [7, 8, 9],
    })
    constraint = GreaterThan(low='a', high='b', strict=True)

    # Run
    transformed = constraint.transform(data)

    # Assert
    log_distance = np.log(4)
    expected = pd.DataFrame({
        'a': [1, 2, 3],
        'b': [log_distance] * 3,
        'c': [7, 8, 9],
    })
    pd.testing.assert_frame_equal(transformed, expected)
def test_fit_float(self):
    """Check that ``GreaterThan.fit`` records a float dtype for ``high``.

    Input:
    - Table with an integer low column and a float high column.
    Side Effect:
    - ``_dtype`` has kind ``'f'`` after fitting, regardless of the dtype
      of the low column.
    """
    # Setup
    data = pd.DataFrame({
        'a': [1, 2, 3],
        'b': [4., 5., 6.],
        'c': [7, 8, 9],
    })
    constraint = GreaterThan(low='a', high='b')

    # Run
    constraint.fit(data)

    # Assert
    learned_kind = constraint._dtype.kind
    assert learned_kind == 'f'
def test_reverse_transform(self):
    """Check ``GreaterThan.reverse_transform`` on an integer table.

    The instance is fitted on the same table first because
    ``reverse_transform`` relies on the ``_dtype`` attribute set by ``fit``.

    Input:
    - Table data treated as already transformed (pandas.DataFrame).
    Output:
    - Table whose high column holds the reverted integer values; the
      expected numbers are consistent with ``exp(b) - 1 + a`` rounded
      to the nearest integer.
    """
    # Setup
    constraint = GreaterThan(low='a', high='b', strict=True)
    data = pd.DataFrame({
        'a': [1, 2, 3],
        'b': [4, 5, 6],
        'c': [7, 8, 9],
    })
    constraint.fit(data)

    # Run
    reverted = constraint.reverse_transform(data)

    # Assert
    expected = pd.DataFrame({
        'a': [1, 2, 3],
        'b': [55, 149, 405],
        'c': [7, 8, 9],
    })
    pd.testing.assert_frame_equal(reverted, expected)
def test_reverse_transform_int(self):
    """Check ``GreaterThan.reverse_transform`` when ``_dtype`` is an int.

    Expected steps, as reflected by the asserted output:
    - exponentiate the high column
    - subtract 1
    - add the low column
    - cast the result to integers

    Input:
    - Table whose high column is the constant ``np.log(4)``.
    Output:
    - Same table with the high column equal to low + 3, as integers.
    """
    # Setup
    constraint = GreaterThan(low='a', high='b', strict=True)
    # Use pandas' default integer dtype: its width (32 or 64) depends on OS.
    default_int_dtype = pd.Series([1]).dtype
    constraint._dtype = default_int_dtype

    # Run
    encoded = pd.DataFrame({
        'a': [1, 2, 3],
        'b': [np.log(4)] * 3,
        'c': [7, 8, 9],
    })
    decoded = constraint.reverse_transform(encoded)

    # Assert
    expected = pd.DataFrame({
        'a': [1, 2, 3],
        'b': [4, 5, 6],
        'c': [7, 8, 9],
    })
    pd.testing.assert_frame_equal(decoded, expected)
def test_reverse_transform_float(self):
    """Check ``GreaterThan.reverse_transform`` when ``_dtype`` is float.

    Expected steps, as reflected by the asserted output:
    - exponentiate the high column
    - subtract 1
    - add the low column
    - keep the result as float values

    Input:
    - Table whose high column is the constant ``np.log(4)``.
    Output:
    - Same table with the high column equal to low + 3, as floats.
    """
    # Setup
    constraint = GreaterThan(low='a', high='b', strict=True)
    constraint._dtype = np.dtype('float')

    # Run
    encoded = pd.DataFrame({
        'a': [1.1, 2.2, 3.3],
        'b': [np.log(4)] * 3,
        'c': [7, 8, 9],
    })
    decoded = constraint.reverse_transform(encoded)

    # Assert
    expected = pd.DataFrame({
        'a': [1.1, 2.2, 3.3],
        'b': [4.1, 5.2, 6.3],
        'c': [7, 8, 9],
    })
    pd.testing.assert_frame_equal(decoded, expected)
def test_transform(self):
    """Check ``GreaterThan.transform`` on a plain integer table.

    Input:
    - Table data (pandas.DataFrame) whose constrained columns are
      always 3 apart.
    Output:
    - Table data transformed (pandas.DataFrame): the high column holds
      approximately ``log(4)`` in every row.
    """
    # Setup
    data = pd.DataFrame({
        'a': [1, 2, 3],
        'b': [4, 5, 6],
    })
    constraint = GreaterThan(low='a', high='b', strict=True)

    # Run
    transformed = constraint.transform(data)

    # Assert
    expected = pd.DataFrame({
        'a': [1, 2, 3],
        'b': [1.3862944, 1.3862944, 1.3862944],
    })
    pd.testing.assert_frame_equal(transformed, expected)
def test_is_valid_false_not_strict(self):
    """Check ``GreaterThan.is_valid`` on invalid rows, non-strict mode.

    When no row satisfies the constraint, the result is a series made
    only of ``False`` values.

    Input:
    - Table data where every value of the ``low`` column is higher than
      the matching value of the ``high`` column (pandas.DataFrame).
    Output:
    - Series of ``False`` values (pandas.Series).
    """
    # Setup
    data = pd.DataFrame({
        'a': [1, 2, 3],
        'b': [0, 1, 2],
        'c': [7, 8, 9],
    })
    constraint = GreaterThan(low='a', high='b')

    # Run
    validity = constraint.is_valid(data)

    # Assert
    expected = pd.Series([False, False, False])
    pd.testing.assert_series_equal(expected, validity)
def test_transform_datetime(self):
    """Test the ``GreaterThan.transform`` method passing a high column of type datetime.

    If the columns are of type datetime, ``transform`` is expected
    to convert the timedelta distance into numeric before applying
    the +1 and logarithm.

    Input:
    - Table with values at a distance of exactly 1 second.
    Output:
    - Same table with the high column transformed into the logarithms
      of the distance in nanoseconds + 1, which is np.log(1_000_000_001).
    """
    # Setup
    instance = GreaterThan(low='a', high='b', strict=True)

    # Run
    table_data = pd.DataFrame({
        'a': pd.to_datetime(['2020-01-01T00:00:00', '2020-01-02T00:00:00']),
        'b': pd.to_datetime(['2020-01-01T00:00:01', '2020-01-02T00:00:01']),
        'c': [1, 2],
    })
    out = instance.transform(table_data)

    # Assert
    expected_out = pd.DataFrame({
        'a': pd.to_datetime(['2020-01-01T00:00:00', '2020-01-02T00:00:00']),
        'b': [np.log(1_000_000_001), np.log(1_000_000_001)],
        'c': [1, 2],
    })
    # Without this comparison the test could never fail on a wrong result.
    pd.testing.assert_frame_equal(out, expected_out)
def test_reverse_transform_datetime(self):
    """Test the ``GreaterThan.reverse_transform`` method for dtype datetime.

    The ``GreaterThan.reverse_transform`` method is expected to:
    - apply an exponential to the input
    - subtract 1
    - convert the distance to a timedelta
    - add the low column
    - convert the output to datetimes

    Input:
    - Table with a high column that contains the constant
      np.log(1_000_000_001).
    Output:
    - Same table with the high column replaced by the low one + one second.
    """
    # Setup
    instance = GreaterThan(low='a', high='b', strict=True)
    instance._dtype = np.dtype('<M8[ns]')

    # Run
    transformed = pd.DataFrame({
        'a': pd.to_datetime(['2020-01-01T00:00:00', '2020-01-02T00:00:00']),
        'b': [np.log(1_000_000_001), np.log(1_000_000_001)],
        'c': [1, 2],
    })
    out = instance.reverse_transform(transformed)

    # Assert
    # log(1_000_000_001) encodes a distance of 1e9 nanoseconds, i.e. one
    # second, so the high column must come back as low + 1 second.
    expected_out = pd.DataFrame({
        'a': pd.to_datetime(['2020-01-01T00:00:00', '2020-01-02T00:00:00']),
        'b': pd.to_datetime(['2020-01-01T00:00:01', '2020-01-02T00:00:01']),
        'c': [1, 2],
    })
    pd.testing.assert_frame_equal(out, expected_out)
def test_transform_not_all_columns_provided(self):
    """Check ``GreaterThan.transform`` when a constrained column is missing.

    Input:
    - Table data (pandas.DataFrame) that only contains the low column.
    Output:
    - Raises ``MissingConstraintColumnError``.
    """
    # Setup
    constraint = GreaterThan(low='a', high='b', strict=True)
    incomplete_data = pd.DataFrame({'a': ['a', 'b', 'c']})

    # Run/Assert
    with pytest.raises(MissingConstraintColumnError):
        constraint.transform(incomplete_data)
def test_fit_datetime(self):
    """Check that ``GreaterThan.fit`` records a datetime dtype for ``high``.

    Input:
    - Table that contains two constrained columns of datetimes.
    Side Effect:
    - ``_dtype`` has kind ``'M'`` (datetime) after fitting.
    """
    # Setup
    data = pd.DataFrame({
        'a': pd.to_datetime(['2020-01-01']),
        'b': pd.to_datetime(['2020-01-02']),
    })
    constraint = GreaterThan(low='a', high='b')

    # Run
    constraint.fit(data)

    # Assert
    learned_kind = constraint._dtype.kind
    assert learned_kind == 'M'
def test_fit(self):
    """Check that ``GreaterThan.fit`` stores the dtype of the high column.

    Input:
    - Table data (pandas.DataFrame).
    Side Effect:
    - ``_dtype`` equals the dtype of the ``high`` column.
    """
    # Setup
    data = pd.DataFrame({
        'a': [1, 2, 3],
        'b': [4, 5, 6],
        'c': [7, 8, 9],
    })
    constraint = GreaterThan(low='a', high='b')

    # Run
    constraint.fit(data)

    # Assert
    high_dtype = data['b'].dtype
    assert constraint._dtype == high_dtype
def test___init___strict_false(self):
    """Check ``GreaterThan.__init__`` when ``strict`` is not passed.

    The arguments must be kept as attributes and ``_strict`` must be False.

    Input:
    - low = 'a'
    - high = 'b'
    Side effects:
    - instance._low == 'a'
    - instance._high == 'b'
    - instance._strict == False
    """
    # NOTE(review): another test with this exact name is defined in this
    # file; if both live in the same class only the later one is collected.
    # Run
    constraint = GreaterThan(low='a', high='b')

    # Assert
    stored_columns = (constraint._low, constraint._high)
    assert stored_columns[0] == 'a'
    assert stored_columns[1] == 'b'
    assert constraint._strict is False
def test___init___strict_false(self):
    """Check ``GreaterThan.__init__`` with only ``low`` and ``high`` given.

    The constructor receives ``low`` and ``high``, the names of the
    columns that contain the low and high values, and the test verifies
    that they are stored and that ``_strict`` is False.

    Input:
    - low = 'a'
    - high = 'b'
    Side effects:
    - instance._low == 'a'
    - instance._high == 'b'
    - instance._strict == False
    """
    # NOTE(review): an earlier test in this file carries the same name; if
    # both live in the same class this definition shadows the earlier one.
    # Run
    constraint = GreaterThan(low='a', high='b')

    # Assert
    low_name = constraint._low
    high_name = constraint._high
    assert low_name == 'a'
    assert high_name == 'b'
    assert constraint._strict is False
def test___init___strict_true(self):
    """Check ``GreaterThan.__init__`` when ``strict=True`` is passed.

    The constructor receives ``low`` and ``high``, the names of the
    columns that contain the low and high values, plus ``strict``, a
    bool that requests a strict comparison. All three must be stored.

    Input:
    - low = 'a'
    - high = 'b'
    - strict = True
    Side effects:
    - instance._low == 'a'
    - instance._high == 'b'
    - instance._strict == True
    """
    # Run
    constraint = GreaterThan(low='a', high='b', strict=True)

    # Assert
    low_name = constraint._low
    high_name = constraint._high
    assert low_name == 'a'
    assert high_name == 'b'
    assert constraint._strict is True
def test_conditional_sampling_constraint_uses_columns_model_reject_sampling(
        column_model_mock):
    """Check that ``sample`` reject-samples invalid column-model rows.

    The model is built with a ``GreaterThan`` constraint that uses
    ``fit_columns_model=True`` and is then asked to sample with a
    condition on one of the constrained columns. The mocked column model
    returns batches that include rows violating the constraint, so the
    final result must only keep the valid rows, gathered over several
    calls to the column model.

    Setup:
    - ``column_model_mock`` stands in for the constraint's column model;
      its ``sample`` method yields three predefined batches in order.
    Input:
    - Conditions
    Side Effects:
    - The column model ``sample`` method is called three times and only
      rows with ``age_joined`` below ``age`` end up in the output.
    """
    # Setup
    columns = ['age_joined', 'age']
    training_data = pd.DataFrame({
        'age_joined': [22.0, 21.0, 15.0, 18.0, 29.0],
        'age': [27.0, 28.0, 26.0, 21.0, 30.0],
        'experience_years': [6.0, 7.0, 11.0, 3.0, 7.0],
    })
    constraint = GreaterThan(
        low='age_joined',
        high='age',
        handling_strategy='transform',
        fit_columns_model=True,
        drop='high',
    )
    model = GaussianCopula(constraints=[constraint])

    # Successive batches returned by the mocked column model; some rows
    # have age_joined >= age and must be rejected.
    first_batch = pd.DataFrame({
        'age_joined': [26.0, 18.0, 31.0, 29.0, 32.0],
        'age': [30.0, 30.0, 30.0, 30.0, 30.0],
    })
    second_batch = pd.DataFrame({
        'age_joined': [28.0, 33.0, 31.0],
        'age': [30.0, 30.0, 30.0],
    })
    third_batch = pd.DataFrame({
        'age_joined': [27.0],
        'age': [30.0],
    })
    sample_mock = column_model_mock.return_value.sample
    sample_mock.side_effect = [first_batch, second_batch, third_batch]
    model.fit(training_data)

    # Run
    sampled = model.sample(5, conditions={'age': 30.0})

    # Assert
    assert len(sample_mock.mock_calls) == 3
    expected = pd.DataFrame({
        'age_joined': [26.0, 18.0, 29.0, 28.0, 27.0],
        'age': [30.0, 30.0, 30.0, 30.0, 30.0],
    })
    pd.testing.assert_frame_equal(sampled[columns], expected[columns])