Esempio n. 1
0
 def detect(self, constraints, VerificationClass=Verification,
            outpath=None, write_all=False, per_constraint=False,
            output_fields=None, rownumber=False, in_place=False,
            **kwargs):
     """
     Run this object's verifiers against *constraints* in detection mode.

     Delegates to the ``verify`` function (the bare name, not a method of
     this object) with ``detect=True``, supplying this object's column
     names, its verifiers and its ``write_detected_records`` method as
     the ``detected_records_writer``.

     Each detection option is forwarded under a ``detect_``-prefixed
     keyword: ``outpath``, ``write_all``, ``per_constraint``,
     ``output_fields``, ``rownumber`` and ``in_place``.  Any further
     keyword arguments are passed through unchanged.

     Returns whatever ``verify`` returns.
     """
     column_names = self.get_column_names()
     field_verifiers = self.verifiers()
     return verify(
         constraints, column_names, field_verifiers,
         VerificationClass=VerificationClass,
         detect=True,
         detect_outpath=outpath,
         detect_write_all=write_all,
         detect_per_constraint=per_constraint,
         detect_output_fields=output_fields,
         detect_rownumber=rownumber,
         detect_in_place=in_place,
         detected_records_writer=self.write_detected_records,
         **kwargs
     )
Esempio n. 2
0
 def verify(self, constraints, VerificationClass=Verification, **kwargs):
     """
     Check *constraints* with this object's verifiers, for reporting.

     Delegates to the ``verify`` function (the bare name, not this
     method), passing this object's column names, its verifiers and its
     ``write_detected_records`` method as ``detected_records_writer``.
     Extra keyword arguments are forwarded unchanged, and the result of
     that call is returned.
     """
     names = self.get_column_names()
     checks = self.verifiers()
     writer = self.write_detected_records
     return verify(constraints, names, checks,
                   VerificationClass=VerificationClass,
                   detected_records_writer=writer,
                   **kwargs)
Esempio n. 3
0
 def verify(self, constraints, VerificationClass=Verification, **kwargs):
     """
     Apply this object's verifiers to a set of constraints and return the
     verification produced, for reporting purposes.

     The work is done by the ``verify`` function (the bare name, not this
     method).  It receives the constraints, the column names and
     verifiers obtained from this object, the requested
     ``VerificationClass``, this object's ``write_detected_records``
     method as ``detected_records_writer``, and any extra keyword
     arguments unchanged.
     """
     positional = (constraints, self.get_column_names(), self.verifiers())
     return verify(*positional,
                   VerificationClass=VerificationClass,
                   detected_records_writer=self.write_detected_records,
                   **kwargs)
Esempio n. 4
0
def verify_df(df, constraints_path, epsilon=None, type_checking=None,
              **kwargs):
    """
    Verify that (i.e. check whether) the Pandas DataFrame provided
    satisfies the constraints in the JSON .tdda file provided.

    Mandatory Inputs:

        *df*:
                            A Pandas DataFrame, to be checked.

        *constraints_path*:
                            The path to a JSON .tdda file (possibly
                            generated by the discover_constraints
                            function, below) containing constraints
                            to be checked.

    Optional Inputs:

        *epsilon*:
                            When checking minimum and maximum values
                            for numeric fields, this provides a
                            tolerance. The tolerance is a proportion
                            of the constraint value by which the
                            constraint can be exceeded without causing
                            a constraint violation to be issued.
                            With the default value of epsilon
                            (:py:const:`EPSILON_DEFAULT` = 0.01, i.e. 1%),
                            values can be up to 1% larger than a max
                            constraint without generating a constraint
                            failure, and minimum values can be up to 1%
                            smaller than the minimum constraint value
                            without generating a constraint failure.
                            (These are modified, as appropriate, for
                            negative values.)

                            NOTE: A consequence of the fact that these
                            are proportionate is that min/max values
                            of zero do not have any tolerance, i.e.
                            the wrong sign always generates a failure.

        *type_checking*:
                            'strict' or 'sloppy'.
                            Because Pandas silently, routinely and
                            automatically "promotes" integer and boolean
                            columns to reals and objects respectively
                            if they contain nulls, strict type checking
                            can be problematical in Pandas. For this reason,
                            type_checking defaults to 'sloppy', meaning
                            that type changes that could plausibly be
                            attributed to Pandas type promotion will not
                            generate constraint failures.

                            If this is set to strict, a Pandas "float"
                            column c will only be allowed to satisfy
                            an "int" type constraint if:

                                `c.dropnulls().astype(int) == c.dropnulls()`

                            Similarly, Object fields will satisfy a
                            'bool' constraint only if:

                                `c.dropnulls().astype(bool) == c.dropnulls()`

        *report*:
                            'all' or 'fields'.
                            (Not a named parameter of this function: it
                            is accepted through ``**kwargs``, which are
                            forwarded unchanged to the underlying
                            verification call.)
                            This controls the behaviour of the
                            :py:meth:`~PandasVerification.__str__` method on
                            the resulting :py:class:`~PandasVerification`
                            object (but not its content).

                            The default is 'all', which means that
                            all fields are shown, together with the
                            verification status of each constraint
                            for that field.

                            If report is set to 'fields', only fields for
                            which at least one constraint failed are shown.

                            NOTE: The method also accepts two further
                            parameters to control (not yet implemented)
                            behaviour. 'constraints' will be used to
                            indicate that only failing constraints for
                            failing fields should be shown.
                            'one_per_line' will indicate that each constraint
                            failure should be reported on a separate line.

    Returns:

        :py:class:`~PandasVerification` object.

        This object has attributes:

            - *passes*      --- Number of passing constraints
            - *failures*    --- Number of failing constraints

        It also has a :py:meth:`~PandasVerification.to_frame()` method for
        converting the results of the verification to a Pandas DataFrame,
        and a :py:meth:`~PandasVerification.__str__` method to print
        both the detailed and summary results of the verification.

    Example usage::

        import numpy as np
        import pandas as pd
        from tdda.constraints.pdconstraints import verify_df

        df = pd.DataFrame({'a': [0, 1, 2, 10, np.nan],
                           'b': ['one', 'one', 'two', 'three', np.nan]})
        v = verify_df(df, 'example_constraints.tdda')

        print('Passes:', v.passes)
        print('Failures: %d\\n' % v.failures)
        print(str(v))
        print(v.to_frame())

    See *simple_verification.py* in the :ref:`constraint_examples`
    for a slightly fuller example.

    """
    # Build the verifier first, then load the constraints, so that any
    # error raised by either step surfaces in the same order as before.
    frame_verifier = PandasConstraintVerifier(
        df,
        epsilon=epsilon,
        type_checking=type_checking,
    )
    loaded_constraints = DatasetConstraints(loadpath=constraints_path)
    field_verifiers = frame_verifier.verifiers()
    return verify(loaded_constraints, field_verifiers,
                  VerificationClass=PandasVerification, **kwargs)
Esempio n. 5
0
    def testFieldVerification(self):
        """
        Verify constraints on the single field 'i', checking both the
        string report and the DataFrame form of each verification.

        Scenarios, in order: six constraints that all pass; six
        constraints of which only the type check passes under the
        default type checking; the same six with type_checking='strict',
        where the type check fails too; a lone type constraint; and that
        lone constraint reported with ascii=True.
        """
        kinds = ('type', 'min', 'max', 'sign', 'max_nulls', 'no_duplicates')

        def frame_for(n_failures, n_passes, outcomes):
            # One-row frame of expected results.  OrderedDict keeps the
            # columns in the order listed here; zip() stops at the
            # shorter input, so a short outcomes list yields only the
            # leading constraint kinds.
            columns = [('field', ['i']),
                       ('failures', [n_failures]),
                       ('passes', [n_passes])]
            columns.extend((kind, [ok]) for kind, ok in zip(kinds, outcomes))
            return pd.DataFrame(OrderedDict(columns))

        def as_frame(results):
            return pdc.PandasVerification.verification_to_dataframe(results)

        good_df = pd.DataFrame({
            'b': [True, False] * 2,
            'i': range(1, 5),
            'r': [float(x) for x in range(1, 5)],
            's': ['S%s' % x for x in range(1, 5)],
            'd': [datetime.datetime(2016, 1, x) for x in range(1, 5)]
        })
        passing_constraints = FieldConstraints('i', [
            TypeConstraint('int'),
            MinConstraint(0),
            MaxConstraint(10),
            SignConstraint('positive'),
            MaxNullsConstraint(0),
            NoDuplicatesConstraint()
        ])
        failing_constraints = FieldConstraints('i', [
            TypeConstraint('bool'),
            MinConstraint(2),
            MaxConstraint(3),
            SignConstraint('negative'),
            MaxNullsConstraint(0),
            NoDuplicatesConstraint()
        ])

        # Scenario 1: every constraint is satisfied.
        passing_set = DatasetConstraints([passing_constraints])
        good_verifier = pdc.PandasConstraintVerifier(good_df)
        all_pass = verify(passing_set, list(good_df),
                          good_verifier.verifiers())
        report = (
            'FIELDS:\n\n'
            'i: 0 failures  6 passes  '
            'type ✓  min ✓  max ✓  sign ✓  '
            'max_nulls ✓  no_duplicates ✓\n\n'
            'SUMMARY:\n\nConstraints passing: 6\nConstraints failing: 0')
        self.assertEqual(str(all_pass), report)
        self.assertTrue(
            as_frame(all_pass).equals(frame_for(0, 6, [True] * 6)))

        # Scenario 2: default type checking.
        # expect the boolean->real type constraint to pass with sloppy types
        bad_df = pd.DataFrame({'i': [1, 2, 2, 6, np.nan]})
        failing_set = DatasetConstraints([failing_constraints])
        sloppy_verifier = pdc.PandasConstraintVerifier(bad_df)
        sloppy = verify(failing_set, list(bad_df),
                        sloppy_verifier.verifiers())
        report = (
            'FIELDS:\n\n'
            'i: 5 failures  1 pass  '
            'type ✓  min ✗  max ✗  sign ✗  '
            'max_nulls ✗  no_duplicates ✗\n\n'
            'SUMMARY:\n\nConstraints passing: 1\nConstraints failing: 5')
        self.assertEqual(str(sloppy), report)
        self.assertTrue(
            as_frame(sloppy).equals(frame_for(5, 1, [True] + [False] * 5)))

        # Scenario 3: strict type checking on the same data.
        # expect the boolean->real type constraint to fail with strict types
        strict_verifier = pdc.PandasConstraintVerifier(
            bad_df, type_checking='strict')
        strict = verify(failing_set, list(bad_df),
                        strict_verifier.verifiers())
        report = (
            'FIELDS:\n\n'
            'i: 6 failures  0 passes  '
            'type ✗  min ✗  max ✗  sign ✗  '
            'max_nulls ✗  no_duplicates ✗\n\n'
            'SUMMARY:\n\nConstraints passing: 0\nConstraints failing: 6')
        self.assertEqual(str(strict), report)
        self.assertTrue(
            as_frame(strict).equals(frame_for(6, 0, [False] * 6)))

        # Scenario 4: a single type constraint against the first frame.
        type_only_set = DatasetConstraints(
            [FieldConstraints('i', [TypeConstraint('int')])])
        type_verifier = pdc.PandasConstraintVerifier(good_df)
        type_only = verify(type_only_set, list(good_df),
                           type_verifier.verifiers())
        report = (
            'FIELDS:\n\n'
            'i: 0 failures  1 pass  type ✓\n\n'
            'SUMMARY:\n\nConstraints passing: 1\nConstraints failing: 0')
        self.assertEqual(str(type_only), report)
        self.assertTrue(
            as_frame(type_only).equals(frame_for(0, 1, [True])))

        # Scenario 5: same check with a fresh verifier and ascii=True.
        type_verifier = pdc.PandasConstraintVerifier(good_df)
        type_only = verify(type_only_set, list(good_df),
                           type_verifier.verifiers(), ascii=True)
        report = (
            'FIELDS:\n\n'
            'i: 0 failures  1 pass  type OK\n\n'
            'SUMMARY:\n\nConstraints passing: 1\nConstraints failing: 0')
        self.assertEqual(str(type_only), report)
Esempio n. 6
0
    def testFieldVerification(self):
        """
        Verify constraints on the single field 'i', checking both the
        string report and the DataFrame form of each verification.

        Scenarios, in order: six constraints that all pass; six
        constraints that all fail; and a lone type constraint that
        passes.
        """
        kinds = ('type', 'min', 'max', 'sign', 'max_nulls', 'no_duplicates')

        def frame_for(n_failures, n_passes, outcomes):
            # One-row frame of expected results.  OrderedDict keeps the
            # columns in the order listed here; zip() stops at the
            # shorter input, so a short outcomes list yields only the
            # leading constraint kinds.
            columns = [('field', ['i']),
                       ('failures', [n_failures]),
                       ('passes', [n_passes])]
            columns.extend((kind, [ok]) for kind, ok in zip(kinds, outcomes))
            return pd.DataFrame(OrderedDict(columns))

        good_df = pd.DataFrame({
            'b': [True, False] * 2,
            'i': range(1, 5),
            'r': [float(x) for x in range(1, 5)],
            's': ['S%s' % x for x in range(1, 5)],
            'd': [datetime.datetime(2016, 1, x) for x in range(1, 5)]
        })
        passing_constraints = FieldConstraints('i', [
            TypeConstraint('int'),
            MinConstraint(0),
            MaxConstraint(10),
            SignConstraint('positive'),
            MaxNullsConstraint(0),
            NoDuplicatesConstraint()
        ])
        failing_constraints = FieldConstraints('i', [
            TypeConstraint('bool'),
            MinConstraint(2),
            MaxConstraint(3),
            SignConstraint('negative'),
            MaxNullsConstraint(0),
            NoDuplicatesConstraint()
        ])

        # Scenario 1: every constraint is satisfied.
        passing_set = DatasetConstraints([passing_constraints])
        good_verifier = pdc.PandasConstraintVerifier(good_df)
        all_pass = verify(passing_set, good_verifier.verifiers())
        report = ('FIELDS:\n\n'
                  'i: 0 failures  6 passes  '
                  'type ✓  min ✓  max ✓  sign ✓  '
                  'max_nulls ✓  no_duplicates ✓\n\n'
                  'SUMMARY:\n\nPasses: 6\nFailures: 0')
        self.assertEqual(str(all_pass), report)
        wanted = frame_for(0, 6, [True] * 6)
        self.assertTrue(
            pdc.verification_to_dataframe(all_pass).equals(wanted))

        # Scenario 2: every constraint is violated.
        bad_df = pd.DataFrame({'i': [1, 2, 2, 6, np.nan]})
        failing_set = DatasetConstraints([failing_constraints])
        bad_verifier = pdc.PandasConstraintVerifier(bad_df)
        all_fail = verify(failing_set, bad_verifier.verifiers())
        report = ('FIELDS:\n\n'
                  'i: 6 failures  0 passes  '
                  'type ✗  min ✗  max ✗  sign ✗  '
                  'max_nulls ✗  no_duplicates ✗\n\n'
                  'SUMMARY:\n\nPasses: 0\nFailures: 6')
        self.assertEqual(str(all_fail), report)
        wanted = frame_for(6, 0, [False] * 6)
        self.assertTrue(
            pdc.verification_to_dataframe(all_fail).equals(wanted))

        # Scenario 3: a single type constraint against the first frame.
        type_only_set = DatasetConstraints(
            [FieldConstraints('i', [TypeConstraint('int')])])
        type_verifier = pdc.PandasConstraintVerifier(good_df)
        type_only = verify(type_only_set, type_verifier.verifiers())
        report = ('FIELDS:\n\n'
                  'i: 0 failures  1 pass  type ✓\n\n'
                  'SUMMARY:\n\nPasses: 1\nFailures: 0')
        self.assertEqual(str(type_only), report)
        wanted = frame_for(0, 1, [True])
        self.assertTrue(
            pdc.verification_to_dataframe(type_only).equals(wanted))