Example #1
0
    def test_is_valid_strict_true(self):
        """Test the ``GreaterThan.is_valid`` method with strict True.

        If strict is True, equal values should count as invalid.

        Input:
        - Table with a strictly valid row, a strictly invalid row and
          a row that has the same value for both high and low.
        Output:
        - True should be returned for the strictly valid row and False
          for the other two.
        """
        # Setup
        # The strict flag must be enabled here; otherwise this test would
        # exercise the non-strict comparison instead of the one it is named for.
        instance = GreaterThan(low='a', high='b', strict=True)

        # Run
        table_data = pd.DataFrame({
            'a': [1, 2, 3],
            'b': [4, 2, 2],
            'c': [7, 8, 9]
        })
        out = instance.is_valid(table_data)

        # Assert
        # Row 0: 1 < 4 is valid. Row 1: 2 == 2 is rejected because the
        # comparison is strict. Row 2: 3 > 2 is invalid.
        expected_out = pd.Series([True, False, False])
        pd.testing.assert_series_equal(expected_out, out)
Example #2
0
    def test_transform_float(self):
        """Check ``GreaterThan.transform`` when the high column holds floats.

        The transform should measure the distance between the high and the
        low column and overwrite the high column with ``log(distance + 1)``.

        Input:
        - Table whose constrained columns are always exactly 3 apart, plus
          an unrelated dummy column.
        Output:
        - The same table, except that every value of the high column
          becomes ``np.log(4)``.
        """
        # Setup
        original = pd.DataFrame({
            'a': [1, 2, 3],
            'b': [4., 5., 6.],
            'c': [7, 8, 9],
        })
        constraint = GreaterThan(low='a', high='b', strict=True)

        # Run
        transformed = constraint.transform(original)

        # Assert
        log_distance = np.log(4)
        expected = pd.DataFrame({
            'a': [1, 2, 3],
            'b': [log_distance] * 3,
            'c': [7, 8, 9],
        })
        pd.testing.assert_frame_equal(transformed, expected)
Example #3
0
    def test_fit_float(self):
        """Check that ``GreaterThan.fit`` records a float dtype.

        ``fit`` is expected to learn the ``dtype`` of the ``high`` column
        and keep it in the ``_dtype`` attribute.

        Input:
        - Table with two constrained columns, where the high column is
          made of floats and the low column of integers.
        Side Effect:
        - ``_dtype`` ends up being a float dtype, regardless of the dtype
          of the low column.
        """
        # Setup
        training_data = pd.DataFrame({
            'a': [1, 2, 3],
            'b': [4., 5., 6.],
            'c': [7, 8, 9]
        })
        constraint = GreaterThan(low='a', high='b')

        # Run
        constraint.fit(training_data)

        # Asserts
        learned_kind = constraint._dtype.kind
        assert learned_kind == 'f'
Example #4
0
    def test_reverse_transform(self):
        """Check ``GreaterThan.reverse_transform`` after fitting on int data.

        The instance is fitted on the table first, because
        ``reverse_transform`` relies on the ``_dtype`` attribute learned
        by ``fit``. The same table is then passed to ``reverse_transform``
        as if it were transformed data.

        Input:
        - Transformed table data (pandas.DataFrame)
        Output:
        - Table data with the high column reverted (pandas.DataFrame)
        """
        # Setup
        data = pd.DataFrame({
            'a': [1, 2, 3],
            'b': [4, 5, 6],
            'c': [7, 8, 9]
        })
        constraint = GreaterThan(low='a', high='b', strict=True)
        constraint.fit(data)

        # Run
        reverted = constraint.reverse_transform(data)

        # Assert
        expected = pd.DataFrame({
            'a': [1, 2, 3],
            'b': [55, 149, 405],
            'c': [7, 8, 9],
        })
        pd.testing.assert_frame_equal(reverted, expected)
Example #5
0
    def test_reverse_transform_int(self):
        """Check ``GreaterThan.reverse_transform`` with an integer dtype.

        ``reverse_transform`` is expected to:
            - exponentiate the high column
            - subtract 1
            - add the low column
            - cast the result to integers

        Input:
        - Table whose high column is the constant ``np.log(4)``.
        Output:
        - The same table with the high column equal to low + 3, as int.
        """
        # Setup
        constraint = GreaterThan(low='a', high='b', strict=True)
        # Take the dtype from a real Series: whether it is a 32 or 64 bit
        # integer depends on the OS.
        constraint._dtype = pd.Series([1]).dtype

        log_distance = np.log(4)
        transformed = pd.DataFrame({
            'a': [1, 2, 3],
            'b': [log_distance] * 3,
            'c': [7, 8, 9]
        })

        # Run
        reverted = constraint.reverse_transform(transformed)

        # Assert
        expected = pd.DataFrame({
            'a': [1, 2, 3],
            'b': [4, 5, 6],
            'c': [7, 8, 9],
        })
        pd.testing.assert_frame_equal(reverted, expected)
Example #6
0
    def test_reverse_transform_float(self):
        """Check ``GreaterThan.reverse_transform`` with a float dtype.

        ``reverse_transform`` is expected to:
            - exponentiate the high column
            - subtract 1
            - add the low column
            - cast the result to float values

        Input:
        - Table whose high column is the constant ``np.log(4)``.
        Output:
        - The same table with the high column equal to low + 3, as floats.
        """
        # Setup
        constraint = GreaterThan(low='a', high='b', strict=True)
        constraint._dtype = np.dtype('float')

        log_distance = np.log(4)
        transformed = pd.DataFrame({
            'a': [1.1, 2.2, 3.3],
            'b': [log_distance] * 3,
            'c': [7, 8, 9]
        })

        # Run
        reverted = constraint.reverse_transform(transformed)

        # Assert
        expected = pd.DataFrame({
            'a': [1.1, 2.2, 3.3],
            'b': [4.1, 5.2, 6.3],
            'c': [7, 8, 9],
        })
        pd.testing.assert_frame_equal(reverted, expected)
Example #7
0
    def test_transform(self):
        """Check the basic behavior of ``GreaterThan.transform``.

        The high column of the input is expected to be replaced by its
        transformed representation, while the low column stays untouched.

        Input:
        - Table data (pandas.DataFrame)
        Output:
        - Transformed table data (pandas.DataFrame)
        """
        # Setup
        original = pd.DataFrame({
            'a': [1, 2, 3],
            'b': [4, 5, 6],
        })
        constraint = GreaterThan(low='a', high='b', strict=True)

        # Run
        transformed = constraint.transform(original)

        # Assert
        expected = pd.DataFrame({
            'a': [1, 2, 3],
            'b': [1.3862944] * 3
        })
        pd.testing.assert_frame_equal(transformed, expected)
Example #8
0
    def test_is_valid_false_not_strict(self):
        """Check ``GreaterThan.is_valid`` on invalid rows, non-strict comparison.

        When no row satisfies the constraint, the result must be a series
        made only of ``False`` values.

        Input:
        - Table data where every value of the ``low`` column is higher
          than the matching value of the ``high`` column (pandas.DataFrame)
        Output:
        - Series of ``False`` values (pandas.Series)
        """
        # Setup
        invalid_data = pd.DataFrame({
            'a': [1, 2, 3],
            'b': [0, 1, 2],
            'c': [7, 8, 9]
        })
        constraint = GreaterThan(low='a', high='b')

        # Run
        validity = constraint.is_valid(invalid_data)

        # Assert
        all_invalid = pd.Series([False] * 3)
        pd.testing.assert_series_equal(all_invalid, validity)
Example #9
0
    def test_transform_datetime(self):
        """Test the ``GreaterThan.transform`` method passing a high column of type datetime.

        If the columns are of type datetime, ``transform`` is expected
        to convert the timedelta distance into numeric before applying
        the +1 and logarithm.

        Input:
        - Table with values at a distance of exactly 1 second.
        Output:
        - Same table with the high column transformed into the logarithms
          of the distance in nanoseconds + 1, which is np.log(1_000_000_001).
        """
        # Setup
        instance = GreaterThan(low='a', high='b', strict=True)

        # Run
        table_data = pd.DataFrame({
            'a':
            pd.to_datetime(['2020-01-01T00:00:00', '2020-01-02T00:00:00']),
            'b':
            pd.to_datetime(['2020-01-01T00:00:01', '2020-01-02T00:00:01']),
            'c': [1, 2],
        })
        out = instance.transform(table_data)

        # Assert
        expected_out = pd.DataFrame({
            'a':
            pd.to_datetime(['2020-01-01T00:00:00', '2020-01-02T00:00:00']),
            'b': [np.log(1_000_000_001),
                  np.log(1_000_000_001)],
            'c': [1, 2],
        })
        # Without this comparison the test would pass no matter what
        # ``transform`` returned.
        pd.testing.assert_frame_equal(out, expected_out)
Example #10
0
    def test_reverse_transform_datetime(self):
        """Test the ``GreaterThan.reverse_transform`` method for dtype datetime.

        The ``GreaterThan.reverse_transform`` method is expected to:
            - apply an exponential to the input
            - subtract 1
            - convert the distance to a timedelta
            - add the low column
            - convert the output to datetimes

        Input:
        - Table with a high column that contains the constant np.log(1_000_000_001).
        Output:
        - Same table with the high column replaced by the low one + one second.
        """
        # Setup
        instance = GreaterThan(low='a', high='b', strict=True)
        instance._dtype = np.dtype('<M8[ns]')

        # Run
        transformed = pd.DataFrame({
            'a':
            pd.to_datetime(['2020-01-01T00:00:00', '2020-01-02T00:00:00']),
            'b': [np.log(1_000_000_001),
                  np.log(1_000_000_001)],
            'c': [1, 2]
        })
        out = instance.reverse_transform(transformed)

        # Assert
        # The high column must come back as the low column plus one second,
        # as described in the docstring above.
        expected_out = pd.DataFrame({
            'a':
            pd.to_datetime(['2020-01-01T00:00:00', '2020-01-02T00:00:00']),
            'b':
            pd.to_datetime(['2020-01-01T00:00:01', '2020-01-02T00:00:01']),
            'c': [1, 2]
        })
        pd.testing.assert_frame_equal(out, expected_out)
Example #11
0
    def test_transform_not_all_columns_provided(self):
        """Check ``GreaterThan.transform`` when a constraint column is missing.

        Calling ``transform`` on a table that lacks one of the constrained
        columns must raise a ``MissingConstraintColumnError``.

        Input:
        - Table data that only contains the low column (pandas.DataFrame)
        Output:
        - Raises ``MissingConstraintColumnError``.
        """
        # Setup
        incomplete_table = pd.DataFrame({'a': ['a', 'b', 'c']})
        constraint = GreaterThan(low='a', high='b', strict=True)

        # Run/Assert
        with pytest.raises(MissingConstraintColumnError):
            constraint.transform(incomplete_table)
Example #12
0
    def test_fit_datetime(self):
        """Check that ``GreaterThan.fit`` records a datetime dtype.

        ``fit`` is expected to learn the ``dtype`` of the ``high`` column
        and keep it in the ``_dtype`` attribute.

        Input:
        - Table with two constrained columns made of datetimes.
        Side Effect:
        - ``_dtype`` ends up being a datetime dtype.
        """
        # Setup
        training_data = pd.DataFrame({
            'a': pd.to_datetime(['2020-01-01']),
            'b': pd.to_datetime(['2020-01-02'])
        })
        constraint = GreaterThan(low='a', high='b')

        # Run
        constraint.fit(training_data)

        # Asserts
        learned_kind = constraint._dtype.kind
        assert learned_kind == 'M'
Example #13
0
    def test_fit(self):
        """Check that ``GreaterThan.fit`` learns the dtype of the high column.

        After fitting, the ``_dtype`` attribute must be equal to the dtype
        of the ``high`` column of the table that was passed.

        Input:
        - Table data (pandas.DataFrame)
        Side Effect:
        - ``_dtype`` holds the dtype of the ``high`` column.
        """
        # Setup
        training_data = pd.DataFrame({
            'a': [1, 2, 3],
            'b': [4, 5, 6],
            'c': [7, 8, 9]
        })
        constraint = GreaterThan(low='a', high='b')

        # Run
        constraint.fit(training_data)

        # Asserts
        high_dtype = training_data['b'].dtype
        assert constraint._dtype == high_dtype
Example #14
0
    def test___init___strict_false(self):
        """Check ``GreaterThan.__init__`` without passing ``strict``.

        The column names must be stored as attributes and the strict
        flag must be ``False`` when it is not given.

        Input:
        - low = 'a'
        - high = 'b'
        Side effects:
        - instance._low == 'a'
        - instance._high == 'b'
        - instance._strict == False
        """
        # Run
        constraint = GreaterThan(low='a', high='b')

        # Asserts
        assert constraint._low == 'a'
        assert constraint._high == 'b'
        assert constraint._strict is False
Example #15
0
    def test___init___strict_false(self):
        """Check the defaults set by ``GreaterThan.__init__``.

        A new constraint is created from ``low`` and ``high``, the names
        of the columns that hold the low and the high values. Since
        ``strict`` is not passed, the instance must not be strict.

        Input:
        - low = 'a'
        - high = 'b'
        Side effects:
        - instance._low == 'a'
        - instance._high == 'b'
        - instance._strict == False
        """
        # Run
        created = GreaterThan(low='a', high='b')

        # Asserts
        assert created._low == 'a'
        assert created._high == 'b'
        assert created._strict is False
Example #16
0
    def test___init___strict_true(self):
        """Check ``GreaterThan.__init__`` when ``strict`` is passed as True.

        A new constraint is created from ``low`` and ``high``, the names
        of the columns that hold the low and the high values, together
        with ``strict``, a bool that asks for a strict comparison.

        Input:
        - low = 'a'
        - high = 'b'
        - strict = True
        Side effects:
        - instance._low == 'a'
        - instance._high == 'b'
        - instance._strict == True
        """
        # Run
        constraint = GreaterThan(low='a', high='b', strict=True)

        # Asserts
        assert constraint._low == 'a'
        assert constraint._high == 'b'
        assert constraint._strict is True
Example #17
0
def test_conditional_sampling_constraint_uses_columns_model_reject_sampling(
        column_model_mock):
    """Check that ``sample`` reject-samples the output of the column model.

    With ``fit_columns_model`` set to True, ``sample`` is expected to fill
    in the constraint columns missing from the conditions by sampling
    them from the constraint's column model. Rows produced by that model
    which violate the constraint must be discarded and sampled again, so
    only valid rows reach the final output.

    Setup:
    - The model receives a ``GreaterThan`` constraint and is asked to
    sample with a single condition on one of the constraint columns.
    The column model is mocked so that it returns three successive
    batches, some of them containing invalid rows, which forces the
    reject sampling to kick in.

    Input:
    - Conditions
    Side Effects:
    - The mocked column model is sampled three times and only the valid
    rows of each batch are kept, in order.
    """
    # Setup
    training_data = pd.DataFrame({
        'age_joined': [22.0, 21.0, 15.0, 18.0, 29.0],
        'age': [27.0, 28.0, 26.0, 21.0, 30.0],
        'experience_years': [6.0, 7.0, 11.0, 3.0, 7.0],
    })
    greater_than = GreaterThan(
        low='age_joined',
        high='age',
        handling_strategy='transform',
        fit_columns_model=True,
        drop='high',
    )
    model = GaussianCopula(constraints=[greater_than])

    # Successive batches handed out by the mocked column model. Rows where
    # ``age_joined`` exceeds ``age`` are the invalid ones.
    first_batch = pd.DataFrame({
        'age_joined': [26.0, 18.0, 31.0, 29.0, 32.0],
        'age': [30.0, 30.0, 30.0, 30.0, 30.0]
    })
    second_batch = pd.DataFrame({
        'age_joined': [28.0, 33.0, 31.0],
        'age': [30.0, 30.0, 30.0]
    })
    third_batch = pd.DataFrame({
        'age_joined': [27.0],
        'age': [30.0]
    })
    column_model_sample = column_model_mock.return_value.sample
    column_model_sample.side_effect = [first_batch, second_batch, third_batch]

    model.fit(training_data)

    # Run
    sampled = model.sample(5, conditions={'age': 30.0})

    # Assert
    assert len(column_model_sample.mock_calls) == 3

    constraint_columns = ['age_joined', 'age']
    expected = pd.DataFrame({
        'age_joined': [26.0, 18.0, 29.0, 28.0, 27.0],
        'age': [30.0, 30.0, 30.0, 30.0, 30.0]
    })
    pd.testing.assert_frame_equal(
        sampled[constraint_columns],
        expected[constraint_columns],
    )