import pandas as pd

from tdda.constraints.base import DatasetConstraints


def vectorize_categorical_columns(X, constraints):
    """
    Vectorize columns by first converting them to categorical and then to
    one-hot encoding, then drop the original categorical columns from the
    output.

    :param X: data (a Pandas DataFrame)
    :param constraints: path to a JSON file with TDDA constraints
    :return: data with one-hot columns
    """
    cons = DatasetConstraints(loadpath=constraints)
    n_cat_cols = 0
    n_cats = 0
    initial_n_cols = X.shape[1]
    for key, value in cons.to_dict()['fields'].items():
        if value['type'] != 'string':
            continue
        allowed_values = value.get('allowed_values')
        # Treat low-cardinality string fields (fewer than 20 allowed values)
        # as categorical.
        if allowed_values and len(allowed_values) < 20:
            n_cat_cols += 1
            n_cats += len(allowed_values)
            X[key] = pd.Categorical(X[key], categories=allowed_values)
            X = X.join(pd.get_dummies(X[key], prefix=key))
            X = X.drop(key, axis=1)
    # Each categorical column is replaced by one column per allowed value.
    expected_n_cols = initial_n_cols + n_cats - n_cat_cols
    actual_n_cols = X.shape[1]
    if actual_n_cols != expected_n_cols:
        raise ValueError(
            'Shape mismatch after vectorizing: expected {} columns, '
            'got {}'.format(expected_n_cols, actual_n_cols))
    return X
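A minimal usage sketch for the helper above; the DataFrame, column names and constraints filename are made up, and the ``.tdda`` file is assumed to carry allowed_values constraints for the low-cardinality string fields.

import pandas as pd

df = pd.DataFrame({'colour': ['red', 'green', 'red', 'blue'],
                   'size': [1, 2, 3, 4]})
# 'example_constraints.tdda' is a placeholder; it is assumed to define
# 'colour' as a string field with an allowed_values list.
vectorized = vectorize_categorical_columns(df, 'example_constraints.tdda')
print(list(vectorized.columns))
# e.g. ['size', 'colour_blue', 'colour_green', 'colour_red']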
def testload(self):
    path = os.path.join(TESTDATA_DIR, 'ddd.tdda')
    constraints = DatasetConstraints(loadpath=path)
    constraints.sort_fields()
    actual = constraints.to_json()
    with open(path) as f:
        expected = json.dumps(sort_constraint_dict(json.loads(f.read())),
                              indent=4) + '\n'
    self.assertEqual(actual, expected)
def get_columns_format_violations(attribute_id, column_values):
    attribute_record = Attribute.objects.get(id=attribute_id)
    constraint_dict = json.loads(attribute_record.format_specification)
    if 'allowed_values' in constraint_dict['fields']['column']:
        constraint_dict['fields']['column']['allowed_values'] = json.loads(
            constraint_dict['fields']['column']['allowed_values'])
    df = pd.DataFrame({'column': column_values})
    if constraint_dict['fields']['column']['type'] == 'int':
        # If the column contains even one None, pandas promotes the whole
        # column to np.float64 instead of np.int64, which breaks the int
        # type check, so drop nulls and cast back to int64.
        df = df[df['column'].notnull()]
        df = df.astype('int64')
    if constraint_dict['fields']['column']['type'] == 'real':
        # JavaScript only has a single numeric type, so floating-point
        # numbers may arrive from the JSON looking like integers; coerce
        # any int values to float64 before verifying.
        is_int = df['column'].apply(lambda x: type(x) == int)
        df[is_int] = df[is_int].astype('float64')
    pdv = PandasConstraintVerifier(df, epsilon=None, type_checking=None)
    constraints = DatasetConstraints()
    constraints.initialize_from_dict(constraint_dict)
    pdv.repair_field_types(constraints)
    detection = pdv.detect(constraints,
                           VerificationClass=PandasDetection,
                           outpath=None, write_all=False,
                           per_constraint=False, output_fields=None,
                           index=False, in_place=False,
                           rownumber_is_index=True, boolean_ints=False,
                           report='records')
    violation_df = detection.detected()
    if violation_df is None:
        return []
    return [int(row_nb) for row_nb in violation_df.index.values]
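A usage sketch for the view helper above, assuming a Django ``Attribute`` record whose ``format_specification`` stores a single-field TDDA constraint dictionary keyed on ``'column'``; the id and the values are invented.

# Hypothetical: Attribute 42 might store
#   {"fields": {"column": {"type": "int", "min": 0, "max": 100}}}
violating_rows = get_columns_format_violations(42, [5, 250, None, 17, -3])
print(violating_rows)   # row numbers of the values that break the constraints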
def testDetectDuplicates(self):
    iconstraints = FieldConstraints('i', [NoDuplicatesConstraint()])
    sconstraints = FieldConstraints('s', [NoDuplicatesConstraint()])
    constraints = DatasetConstraints([iconstraints, sconstraints])

    df1 = pd.DataFrame({
        'i': [1, 2, 3, 4, np.nan],
        's': ['one', 'two', 'three', 'four', np.nan]
    })
    verifier1 = pdc.PandasConstraintVerifier(df1)
    v1 = verifier1.detect(constraints, VerificationClass=pdc.PandasDetection)
    self.assertEqual(v1.passes, 2)
    self.assertEqual(v1.failures, 0)
    ddf1 = v1.detected()
    self.assertIsNone(ddf1)

    df2 = pd.DataFrame({
        'i': [1, 2, 3, 2, np.nan],
        's': ['one', 'two', 'three', 'two', np.nan]
    })
    verifier2 = pdc.PandasConstraintVerifier(df2)
    v2 = verifier2.detect(constraints, VerificationClass=pdc.PandasDetection,
                          per_constraint=True, output_fields=['i', 's'])
    self.assertEqual(v2.passes, 0)
    self.assertEqual(v2.failures, 2)
    ddf2 = v2.detected()
    self.assertStringCorrect(ddf2.to_string(), 'detect_dups.df')
def testload(self):
    path = os.path.join(TESTDATA_DIR, 'ddd.tdda')
    constraints = DatasetConstraints(loadpath=path)
    fields = ['index', 'evennulls', 'oddnulls', 'evens', 'odds',
              'evenreals', 'oddreals', 'evenstr', 'oddstr', 'elevens',
              'greek', 'binnedindex', 'binnedodds', 'basedate', 'evendates']
    constraints.sort_fields(fields)
    self.assertStringCorrect(constraints.to_json(), 'ddd.tdda', rstrip=True,
                             ignore_substrings=['"as_at":', '"local_time":',
                                                '"utc_time":', '"creator":',
                                                '"host":', '"user":',
                                                '"tddafile":'])
def discover(self):
    field_constraints = []
    for col in self.get_column_names():
        constraints = self.discover_field_constraints(col)
        if constraints:
            field_constraints.append(constraints)
    if field_constraints:
        return DatasetConstraints(field_constraints)
    else:
        return None
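The method above is a template: a concrete discoverer only has to supply ``get_column_names()`` and ``discover_field_constraints()``. The toy class below is a sketch of that pattern, not part of the library; the class name, its trivial per-field logic, and the import path for the constraint classes are assumptions.

from tdda.constraints.base import (DatasetConstraints, FieldConstraints,
                                   TypeConstraint, MaxNullsConstraint)
# import path assumed; these classes appear in the tests in this section


class DictOfListsDiscoverer:
    """Toy discoverer: column data held as a dict of lists (sketch only)."""

    def __init__(self, data):
        self.data = data                      # e.g. {'a': [1, 2, None]}

    def get_column_names(self):
        return list(self.data)

    def discover_field_constraints(self, col):
        values = [v for v in self.data[col] if v is not None]
        n_nulls = len(self.data[col]) - len(values)
        ctype = 'int' if all(isinstance(v, int) for v in values) else 'string'
        constraints = [TypeConstraint(ctype)]
        if n_nulls == 0:
            constraints.append(MaxNullsConstraint(0))
        return FieldConstraints(col, constraints)

    def discover(self):
        # Same logic as the template method above.
        field_constraints = [self.discover_field_constraints(col)
                             for col in self.get_column_names()]
        field_constraints = [c for c in field_constraints if c]
        return (DatasetConstraints(field_constraints)
                if field_constraints else None)


disc = DictOfListsDiscoverer({'a': [1, 2, 3], 'b': ['x', None, 'y']})
print(disc.discover().to_json())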
def detect_df(df, constraints_path, epsilon=None, type_checking=None, outpath=None, write_all=False, per_constraint=False, output_fields=None, index=False, in_place=False, rownumber_is_index=True, boolean_ints=False, repair=True, report='records', **kwargs): """ Check the records from the Pandas DataFrame provided, to detect records that fail any of the constraints in the JSON ``.tdda`` file provided. This is anomaly detection. Mandatory Inputs: *df*: A Pandas DataFrame, to be checked. *constraints_path*: The path to a JSON ``.tdda`` file (possibly generated by the discover_df function, below) containing constraints to be checked. Or, alternatively, an in-memory dictionary containing the structured contents of a ``.tdda`` file. Optional Inputs: *epsilon*: When checking minimum and maximum values for numeric fields, this provides a tolerance. The tolerance is a proportion of the constraint value by which the constraint can be exceeded without causing a constraint violation to be issued. For example, with epsilon set to 0.01 (i.e. 1%), values can be up to 1% larger than a max constraint without generating constraint failure, and minimum values can be up to 1% smaller that the minimum constraint value without generating a constraint failure. (These are modified, as appropriate, for negative values.) If not specified, an *epsilon* of 0 is used, so there is no tolerance. NOTE: A consequence of the fact that these are proportionate is that min/max values of zero do not have any tolerance, i.e. the wrong sign always generates a failure. *type_checking*: ``strict`` or ``sloppy``. Because Pandas silently, routinely and automatically "promotes" integer and boolean columns to reals and objects respectively if they contain nulls, strict type checking can be problematical in Pandas. For this reason, ``type_checking`` defaults to ``sloppy``, meaning that type changes that could plausibly be attributed to Pandas type promotion will not generate constraint values. If this is set to strict, a Pandas ``float`` column ``c`` will only be allowed to satisfy a an ``int`` type constraint if:: c.dropnulls().astype(int) == c.dropnulls() Similarly, Object fields will satisfy a ``bool`` constraint only if:: c.dropnulls().astype(bool) == c.dropnulls() *outpath*: This specifies that the verification process should detect records that violate any constraints, and write them out to this CSV (or feather) file. By default, only failing records are written out to file, but this can be overridden with the ``write_all`` parameter. By default, the columns in the detection output file will be a boolean ``ok`` field for each constraint on each field, an and ``n_failures`` field containing the total number of constraints that failed for each row. This behavious can be overridden with the ``per_constraint``, ``output_fields`` and ``index`` parameters. *write_all*: Include passing records in the detection output file when detecting. *per_constraint*: Write one column per failing constraint, as well as the ``n_failures`` total. *output_fields*: Specify original columns to write out when detecting. If passed in as an empty list (rather than None), all original columns will be included. *index*: Boolean to specify whether to include a row-number index in the output file when detecting. This is automatically enabled if no output field names are specified. Rows are numbered from 0. *in_place*: Detect failing constraints by adding columns to the input DataFrame. 
If ``outpath`` is also specified, then failing records will also be written to file. *rownumber_is_index*: ``False`` if the DataFrame originated from a CSV file (and therefore any detection output file should refer to row numbers from the file, rather than items from the DataFrame index). *boolean_ints*: If ``True``, write out all boolean values to CSV file as integers (1 for true, and 0 for false), rather than as ``true`` and ``false`` values. *repair*: A boolean to specify whether to try to use the information in the constraints to attempt to repair potentially-incorrect type inferrences made when constructing the dataframe. When the dataframe has been loaded from a .csv file, this can often be useful (but should not be used with dataframes that have come from a more reliable source). The *report* parameter from :py:func:`verify_df` can also be used, in which case a verification report will also be produced in addition to the detection results. Returns: :py:class:`~tdda.constraints.pd.constraints.PandasDetection` object. This object has a :py:meth:`~PandasDetection.detected()` method for obtaining the Pandas DataFrame containing the detection results. Example usage:: import pandas as pd from tdda.constraints import detect_df df = pd.DataFrame({'a': [0, 1, 2, 10, pd.np.NaN], 'b': ['one', 'one', 'two', 'three', pd.np.NaN]}) v = detect_df(df, 'example_constraints.tdda') detection_df = v.detected() print(detection_df.to_string()) """ pdv = PandasConstraintVerifier(df, epsilon=epsilon, type_checking=type_checking) if isinstance(constraints_path, dict): constraints = DatasetConstraints() constraints.initialize_from_dict(native_definite(constraints_path)) else: constraints = DatasetConstraints(loadpath=constraints_path) if repair: pdv.repair_field_types(constraints) return pdv.detect(constraints, VerificationClass=PandasDetection, outpath=outpath, write_all=write_all, per_constraint=per_constraint, output_fields=output_fields, index=index, in_place=in_place, rownumber_is_index=rownumber_is_index, boolean_ints=boolean_ints, report=report, **kwargs)
def verify_df(df, constraints_path, epsilon=None, type_checking=None, repair=True, report='all', **kwargs): """ Verify that (i.e. check whether) the Pandas DataFrame provided satisfies the constraints in the JSON ``.tdda`` file provided. Mandatory Inputs: *df*: A Pandas DataFrame, to be checked. *constraints_path*: The path to a JSON ``.tdda`` file (possibly generated by the discover_df function, below) containing constraints to be checked. Or, alternatively, an in-memory dictionary containing the structured contents of a ``.tdda`` file. Optional Inputs: *epsilon*: When checking minimum and maximum values for numeric fields, this provides a tolerance. The tolerance is a proportion of the constraint value by which the constraint can be exceeded without causing a constraint violation to be issued. For example, with epsilon set to 0.01 (i.e. 1%), values can be up to 1% larger than a max constraint without generating constraint failure, and minimum values can be up to 1% smaller that the minimum constraint value without generating a constraint failure. (These are modified, as appropriate, for negative values.) If not specified, an *epsilon* of 0 is used, so there is no tolerance. NOTE: A consequence of the fact that these are proportionate is that min/max values of zero do not have any tolerance, i.e. the wrong sign always generates a failure. *type_checking*: ``strict`` or ``sloppy``. Because Pandas silently, routinely and automatically "promotes" integer and boolean columns to reals and objects respectively if they contain nulls, strict type checking can be problematical in Pandas. For this reason, ``type_checking`` defaults to ``sloppy``, meaning that type changes that could plausibly be attributed to Pandas type promotion will not generate constraint values. If this is set to strict, a Pandas ``float`` column ``c`` will only be allowed to satisfy a an ``int`` type constraint if:: c.dropnulls().astype(int) == c.dropnulls() Similarly, Object fields will satisfy a ``bool`` constraint only if:: c.dropnulls().astype(bool) == c.dropnulls() *repair*: A boolean to specify whether to try to use the information in the constraints to attempt to repair potentially-incorrect type inferrences made when constructing the dataframe. When the dataframe has been loaded from a .csv file, this can often be useful (but should not be used with dataframes that have come from a more reliable source). *report*: ``all`` or ``fields``. This controls the behaviour of the :py:meth:`~tdda.constraints.pd.constraints.PandasVerification.__str__` method on the resulting :py:class:`~tdda.constraints.pd.constraints.PandasVerification` object (but not its content). The default is ``all``, which means that all fields are shown, together with the verification status of each constraint for that field. If report is set to ``fields``, only fields for which at least one constraint failed are shown. Returns: :py:class:`~tdda.constraints.pd.constraints.PandasVerification` object. This object has attributes: - *passes* --- Number of passing constriants - *failures* --- Number of failing constraints It also has a :py:meth:`~tdda.constraints.pd.constraints.PandasVerification.to_frame()` method for converting the results of the verification to a Pandas DataFrame, and a :py:meth:`~tdda.constraints.pd.constraints.PandasVerification.__str__` method to print both the detailed and summary results of the verification. 
Example usage:: import pandas as pd from tdda.constraints import verify_df df = pd.DataFrame({'a': [0, 1, 2, 10, pd.np.NaN], 'b': ['one', 'one', 'two', 'three', pd.np.NaN]}) v = verify_df(df, 'example_constraints.tdda') print('Constraints passing: %d\\n' % v.passes) print('Constraints failing: %d\\n' % v.failures) print(str(v)) print(v.to_frame()) See *simple_verification.py* in the :ref:`constraint_examples` for a slightly fuller example. """ pdv = PandasConstraintVerifier(df, epsilon=epsilon, type_checking=type_checking) if isinstance(constraints_path, dict): constraints = DatasetConstraints() constraints.initialize_from_dict(native_definite(constraints_path)) else: constraints = DatasetConstraints(loadpath=constraints_path) if repair: pdv.repair_field_types(constraints) return pdv.verify(constraints, VerificationClass=PandasVerification, report=report, **kwargs)
def discover_constraints(df): """ Automatically discover potentially useful constraints that characterize the Pandas DataFrame provided. Input: *df*: any Pandas DataFrame. Possible return values: - :py:class:`~tdda.constraints.base.DatasetConstraints` object - ``None`` --- (if no constraints were found). This function goes through each column in the DataFrame and, where appropriate, generates constraints that describe (and are satisified by) this dataframe. Assuming it generates at least one constraint for at least one field it returns a :py:class:`tdda.constraints.base.DatasetConstraints` object. This includes a 'fields' attribute, keyed on the column name. The returned :py:class:`~tdda.constraints.base.DatasetConstraints` object includes a :py:meth:`~tdda.constraints.base.DatasetContraints.to_json` method, which converts the constraints into JSON for saving as a tdda constraints file. By convention, such JSON files use a '.tdda' extension. The JSON constraints file can be used to check whether other datasets also satisfy the constraints. The kinds of constraints (potentially) generated for each field (column) are: *type*: the (coarse, TDDA) type of the field. One of 'bool', 'int', 'real', 'string' or 'date'. *min*: for non-string fields, the minimum value in the column. Not generated for all-null columns. *max*: for non-string fields, the maximum value in the column. Not generated for all-null columns. *min_length*: For string fields, the length of the shortest string(s) in the field. N.B. In Python3, this is of course, a unicode string length; in Python2, it is an encoded string length, which may be less meaningful. *max_length*: For string fields, the length of the longest string(s) in the field. N.B. In Python3, this is of course, a unicode string length; in Python2, it is an encoded string length, which may be less meaningful. *sign*: If all the values in a numeric field have consistent sign, a sign constraint will be written with a value chosen from: - positive --- For all values *v* in field: `v > 0` - non-negative --- For all values *v* in field: `v >= 0` - zero --- For all values *v* in field: `v == 0` - non-positive --- For all values *v* in field: `v <= 0` - negative --- For all values *v* in field: `v < 0` - null --- For all values *v* in field: `v is null` *max_nulls*: The maximum number of nulls allowed in the field. - If the field has no nulls, a constraint will be written with max_nulls set to zero. - If the field has a single null, a constraint will be written with max_nulls set to one. - If the field has more than 1 null, no constraint will be generated. *no_duplicates*: For string fields (only, for now), if every non-null value in the field is different, this constraint will be generated (with value ``True``); otherwise no constraint will be generated. So this constraint indicates that all the **non-null** values in a string field are distinct (unique). *allowed_values*: For string fields only, if there are :py:const:`MAX_CATEGORIES` or fewer distinct string values in the dataframe, an AllowedValues constraint listing them will be generated. :py:const:`MAX_CATEGORIES` is currently "hard-wired" to 20. 
Example usage:: import pandas as pd from tdda.constraints.pdconstraints import discover_constraints df = pd.DataFrame({'a': [1, 2, 3], 'b': ['one', 'two', pd.np.NaN]}) constraints = discover_constraints(df) with open('example_constraints.tdda', 'w') as f: f.write(constraints.to_json()) See *simple_generation.py* in the :ref:`constraint_examples` for a slightly fuller example. """ field_constraints = [] for col in df: constraints = discover_field_constraints(df[col]) if constraints: field_constraints.append(constraints) if field_constraints: return DatasetConstraints(field_constraints) else: return None
def verify_df(df, constraints_path, epsilon=None, type_checking=None,
              **kwargs):
    """
    Verify that (i.e. check whether) the Pandas DataFrame provided satisfies
    the constraints in the JSON .tdda file provided.

    Mandatory Inputs:

        *df*:
            A Pandas DataFrame, to be checked.

        *constraints_path*:
            The path to a JSON .tdda file (possibly generated by the
            discover_constraints function, below) containing constraints
            to be checked.

    Optional Inputs:

        *epsilon*:
            When checking minimum and maximum values for numeric fields,
            this provides a tolerance. The tolerance is a proportion of the
            constraint value by which the constraint can be exceeded without
            causing a constraint violation to be issued.

            With the default value of epsilon (:py:const:`EPSILON_DEFAULT`
            = 0.01, i.e. 1%), values can be up to 1% larger than a max
            constraint without generating a constraint failure, and minimum
            values can be up to 1% smaller than the minimum constraint value
            without generating a constraint failure. (These are modified,
            as appropriate, for negative values.)

            NOTE: A consequence of the fact that these are proportionate is
            that min/max values of zero do not have any tolerance, i.e. the
            wrong sign always generates a failure.

        *type_checking*:
            'strict' or 'sloppy'. Because Pandas silently, routinely and
            automatically "promotes" integer and boolean columns to reals
            and objects respectively if they contain nulls, strict type
            checking can be problematical in Pandas. For this reason,
            type_checking defaults to 'sloppy', meaning that type changes
            that could plausibly be attributed to Pandas type promotion
            will not generate constraint values.

            If this is set to strict, a Pandas "float" column c will only
            be allowed to satisfy an "int" type constraint if:

                `c.dropnulls().astype(int) == c.dropnulls()`

            Similarly, Object fields will satisfy a 'bool' constraint
            only if:

                `c.dropnulls().astype(bool) == c.dropnulls()`

        *report*:
            'all' or 'fields'. This controls the behaviour of the
            :py:meth:`~PandasVerification.__str__` method on the resulting
            :py:class:`~PandasVerification` object (but not its content).

            The default is 'all', which means that all fields are shown,
            together with the verification status of each constraint for
            that field.

            If report is set to 'fields', only fields for which at least
            one constraint failed are shown.

            NOTE: The method also accepts two further parameters to control
            (not yet implemented) behaviour. 'constraints' will be used to
            indicate that only failing constraints for failing fields
            should be shown. 'one_per_line' will indicate that each
            constraint failure should be reported on a separate line.

    Returns:

        :py:class:`~PandasVerification` object. This object has attributes:

        - *passes* --- Number of passing constraints
        - *failures* --- Number of failing constraints

        It also has a :py:meth:`~PandasVerification.to_frame()` method for
        converting the results of the verification to a Pandas DataFrame,
        and a :py:meth:`~PandasVerification.__str__` method to print both
        the detailed and summary results of the verification.

    Example usage::

        import pandas as pd
        from tdda.constraints.pdconstraints import verify_df

        df = pd.DataFrame({'a': [0, 1, 2, 10, pd.np.NaN],
                           'b': ['one', 'one', 'two', 'three', pd.np.NaN]})
        v = verify_df(df, 'example_constraints.tdda')

        print('Passes:', v.passes)
        print('Failures: %d\\n' % v.failures)
        print(str(v))
        print(v.to_frame())

    See *simple_verification.py* in the :ref:`constraint_examples` for a
    slightly fuller example.
    """
    pdv = PandasConstraintVerifier(df, epsilon=epsilon,
                                   type_checking=type_checking)
    constraints = DatasetConstraints(loadpath=constraints_path)
    return verify(constraints, pdv.verifiers(),
                  VerificationClass=PandasVerification, **kwargs)
def testload(self):
    path = os.path.join(TESTDATA_DIR, 'ddd.tdda')
    constraints = DatasetConstraints(loadpath=path)
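The loader above reads constraints from a ``.tdda`` file; the same constraints can also be rebuilt from an in-memory dictionary, which is the path detect_df and verify_df take when given a dict instead of a path. A small round-trip sketch, with 'ddd.tdda' standing in for any constraints file and the import path assumed:

import json

from tdda.constraints.base import DatasetConstraints   # import path assumed

loaded = DatasetConstraints(loadpath='ddd.tdda')        # placeholder filename
as_dict = json.loads(loaded.to_json())                  # plain-dict form

rebuilt = DatasetConstraints()
rebuilt.initialize_from_dict(as_dict)                   # same constraints, built from the dict
print(sorted(rebuilt.to_dict()['fields']))              # field names survive the round trip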
def testFieldVerification(self): df1 = pd.DataFrame({ 'b': [True, False] * 2, 'i': range(1, 5), 'r': [float(x) for x in range(1, 5)], 's': ['S%s' % x for x in range(1, 5)], 'd': [datetime.datetime(2016, 1, x) for x in range(1, 5)] }) ic1 = FieldConstraints('i', [ TypeConstraint('int'), MinConstraint(0), MaxConstraint(10), SignConstraint('positive'), MaxNullsConstraint(0), NoDuplicatesConstraint() ]) ic2 = FieldConstraints('i', [ TypeConstraint('bool'), MinConstraint(2), MaxConstraint(3), SignConstraint('negative'), MaxNullsConstraint(0), NoDuplicatesConstraint() ]) dfc1 = [ic1] dsc1 = DatasetConstraints(dfc1) pdcv1 = pdc.PandasConstraintVerifier(df1) results1 = verify(dsc1, list(df1), pdcv1.verifiers()) expected = ( 'FIELDS:\n\n' 'i: 0 failures 6 passes ' 'type ✓ min ✓ max ✓ sign ✓ ' 'max_nulls ✓ no_duplicates ✓\n\n' 'SUMMARY:\n\nConstraints passing: 6\nConstraints failing: 0') self.assertEqual(str(results1), expected) expected = pd.DataFrame( OrderedDict(( ('field', ['i']), ('failures', [0]), ('passes', [6]), ('type', [True]), ('min', [True]), ('max', [True]), ('sign', [True]), ('max_nulls', [True]), ('no_duplicates', [True]), ))) vdf = pdc.PandasVerification.verification_to_dataframe(results1) self.assertTrue(vdf.equals(expected)) df2 = pd.DataFrame({'i': [1, 2, 2, 6, np.nan]}) dfc2 = [ic2] dsc2 = DatasetConstraints(dfc2) pdcv2 = pdc.PandasConstraintVerifier(df2) results2 = verify(dsc2, list(df2), pdcv2.verifiers()) # expect the boolean->real type constraint to pass with sloppy types expected = ( 'FIELDS:\n\n' 'i: 5 failures 1 pass ' 'type ✓ min ✗ max ✗ sign ✗ ' 'max_nulls ✗ no_duplicates ✗\n\n' 'SUMMARY:\n\nConstraints passing: 1\nConstraints failing: 5') self.assertEqual(str(results2), expected) expected = pd.DataFrame( OrderedDict(( ('field', ['i']), ('failures', [5]), ('passes', [1]), ('type', [True]), ('min', [False]), ('max', [False]), ('sign', [False]), ('max_nulls', [False]), ('no_duplicates', [False]), ))) vdf = pdc.PandasVerification.verification_to_dataframe(results2) self.assertTrue(vdf.equals(expected)) pdcv2strict = pdc.PandasConstraintVerifier(df2, type_checking='strict') results2strict = verify(dsc2, list(df2), pdcv2strict.verifiers()) # expect the boolean->real type constraint to fail with strict types expected = ( 'FIELDS:\n\n' 'i: 6 failures 0 passes ' 'type ✗ min ✗ max ✗ sign ✗ ' 'max_nulls ✗ no_duplicates ✗\n\n' 'SUMMARY:\n\nConstraints passing: 0\nConstraints failing: 6') self.assertEqual(str(results2strict), expected) expected = pd.DataFrame( OrderedDict(( ('field', ['i']), ('failures', [6]), ('passes', [0]), ('type', [False]), ('min', [False]), ('max', [False]), ('sign', [False]), ('max_nulls', [False]), ('no_duplicates', [False]), ))) vdf = pdc.PandasVerification.verification_to_dataframe(results2strict) self.assertTrue(vdf.equals(expected)) ic3 = FieldConstraints('i', [TypeConstraint('int')]) df3 = df1 dfc3 = [ic3] dsc3 = DatasetConstraints(dfc3) pdcv3 = pdc.PandasConstraintVerifier(df3) results3 = verify(dsc3, list(df3), pdcv3.verifiers()) expected = ( 'FIELDS:\n\n' 'i: 0 failures 1 pass type ✓\n\n' 'SUMMARY:\n\nConstraints passing: 1\nConstraints failing: 0') self.assertEqual(str(results3), expected) expected = pd.DataFrame( OrderedDict(( ('field', ['i']), ('failures', [0]), ('passes', [1]), ('type', [True]), ))) vdf = pdc.PandasVerification.verification_to_dataframe(results3) self.assertTrue(vdf.equals(expected)) pdcv3 = pdc.PandasConstraintVerifier(df3) results3 = verify(dsc3, list(df3), pdcv3.verifiers(), ascii=True) expected = ( 'FIELDS:\n\n' 'i: 0 
failures 1 pass type OK\n\n' 'SUMMARY:\n\nConstraints passing: 1\nConstraints failing: 0') self.assertEqual(str(results3), expected)
def testFieldVerification(self): df1 = pd.DataFrame({ 'b': [True, False] * 2, 'i': range(1, 5), 'r': [float(x) for x in range(1, 5)], 's': ['S%s' % x for x in range(1, 5)], 'd': [datetime.datetime(2016, 1, x) for x in range(1, 5)] }) ic1 = FieldConstraints('i', [ TypeConstraint('int'), MinConstraint(0), MaxConstraint(10), SignConstraint('positive'), MaxNullsConstraint(0), NoDuplicatesConstraint() ]) ic2 = FieldConstraints('i', [ TypeConstraint('bool'), MinConstraint(2), MaxConstraint(3), SignConstraint('negative'), MaxNullsConstraint(0), NoDuplicatesConstraint() ]) dfc1 = [ic1] dsc1 = DatasetConstraints(dfc1) pdcv1 = pdc.PandasConstraintVerifier(df1) results1 = verify(dsc1, pdcv1.verifiers()) expected = ('FIELDS:\n\n' 'i: 0 failures 6 passes ' 'type ✓ min ✓ max ✓ sign ✓ ' 'max_nulls ✓ no_duplicates ✓\n\n' 'SUMMARY:\n\nPasses: 6\nFailures: 0') self.assertEqual(str(results1), expected) expected = pd.DataFrame( OrderedDict(( ('field', ['i']), ('failures', [0]), ('passes', [6]), ('type', [True]), ('min', [True]), ('max', [True]), ('sign', [True]), ('max_nulls', [True]), ('no_duplicates', [True]), ))) self.assertTrue( pdc.verification_to_dataframe(results1).equals(expected)) df2 = pd.DataFrame({'i': [1, 2, 2, 6, np.nan]}) dfc2 = [ic2] dsc2 = DatasetConstraints(dfc2) pdcv2 = pdc.PandasConstraintVerifier(df2) results2 = verify(dsc2, pdcv2.verifiers()) expected = ('FIELDS:\n\n' 'i: 6 failures 0 passes ' 'type ✗ min ✗ max ✗ sign ✗ ' 'max_nulls ✗ no_duplicates ✗\n\n' 'SUMMARY:\n\nPasses: 0\nFailures: 6') self.assertEqual(str(results2), expected) expected = pd.DataFrame( OrderedDict(( ('field', ['i']), ('failures', [6]), ('passes', [0]), ('type', [False]), ('min', [False]), ('max', [False]), ('sign', [False]), ('max_nulls', [False]), ('no_duplicates', [False]), ))) self.assertTrue( pdc.verification_to_dataframe(results2).equals(expected)) ic3 = FieldConstraints('i', [TypeConstraint('int')]) df3 = df1 dfc3 = [ic3] dsc3 = DatasetConstraints(dfc3) pdcv3 = pdc.PandasConstraintVerifier(df3) results3 = verify(dsc3, pdcv3.verifiers()) expected = ('FIELDS:\n\n' 'i: 0 failures 1 pass type ✓\n\n' 'SUMMARY:\n\nPasses: 1\nFailures: 0') self.assertEqual(str(results3), expected) expected = pd.DataFrame( OrderedDict(( ('field', ['i']), ('failures', [0]), ('passes', [1]), ('type', [True]), ))) self.assertTrue( pdc.verification_to_dataframe(results3).equals(expected))
def verify_db_table(dbtype, db, tablename, constraints_path, epsilon=None,
                    type_checking='strict', testing=False, report='all',
                    **kwargs):
    """
    Verify that (i.e. check whether) the database table provided satisfies
    the constraints in the JSON .tdda file provided.

    Mandatory Inputs:

        *dbtype*:
            Type of database.
        *db*:
            A database object.
        *tablename*:
            A database table name, to be checked.
        *constraints_path*:
            The path to a JSON .tdda file (possibly generated by the
            discover_constraints function, below) containing constraints
            to be checked.

    Optional Inputs:

        *epsilon*:
            When checking minimum and maximum values for numeric fields,
            this provides a tolerance. The tolerance is a proportion of the
            constraint value by which the constraint can be exceeded without
            causing a constraint violation to be issued.

            For example, with epsilon set to 0.01 (i.e. 1%), values can be
            up to 1% larger than a max constraint without generating a
            constraint failure, and minimum values can be up to 1% smaller
            than the minimum constraint value without generating a
            constraint failure. (These are modified, as appropriate, for
            negative values.)

            If not specified, an *epsilon* of 0 is used, so there is no
            tolerance.

            NOTE: A consequence of the fact that these are proportionate is
            that min/max values of zero do not have any tolerance, i.e. the
            wrong sign always generates a failure.

        *type_checking*:
            ``strict`` or ``sloppy``. For databases (unlike Pandas
            DataFrames), this defaults to ``strict``.

            If this is set to ``sloppy``, a database ``real`` column ``c``
            will be allowed to satisfy an ``int`` type constraint.

        *report*:
            ``all`` or ``fields``. This controls the behaviour of the
            :py:meth:`~tdda.constraints.db.constraints.DatabaseVerification.__str__`
            method on the resulting
            :py:class:`~tdda.constraints.db.constraints.DatabaseVerification`
            object (but not its content).

            The default is ``all``, which means that all fields are shown,
            together with the verification status of each constraint for
            that field.

            If report is set to ``fields``, only fields for which at least
            one constraint failed are shown.

        *testing*:
            Boolean flag. Should only be set to ``True`` when being run as
            part of an automated test. It suppresses type-compatibility
            warnings.

    Returns:

        :py:class:`~tdda.constraints.db.constraints.DatabaseVerification`
        object. This object has attributes:

        - *passes* --- Number of passing constraints
        - *failures* --- Number of failing constraints

    Example usage::

        import pgdb
        from tdda.constraints import verify_db_table

        dbspec = 'localhost:databasename:username:password'
        tablename = 'schemaname.tablename'
        db = pgdb.connect(dbspec)
        v = verify_db_table('postgres', db, tablename, 'myconstraints.tdda')

        print('Constraints passing:', v.passes)
        print('Constraints failing: %d\\n' % v.failures)
        print(str(v))
    """
    dbv = DatabaseConstraintVerifier(dbtype, db, tablename, epsilon=epsilon,
                                     type_checking=type_checking,
                                     testing=testing)
    if not dbv.check_table_exists(tablename):
        print('No table %s' % tablename, file=sys.stderr)
        sys.exit(1)
    constraints = DatasetConstraints(loadpath=constraints_path)
    return dbv.verify(constraints, VerificationClass=DatabaseVerification,
                      report=report, **kwargs)
def verify_directory_from_file(path, constraints_path, **kwargs):
    fv = FilesConstraintVerifier(path, **kwargs)
    constraints = DatasetConstraints(loadpath=constraints_path)
    return fv.verify(constraints)
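A usage sketch for the directory verifier above; the paths are placeholders, and the returned verification object is assumed to expose the same passes/failures counts as the other verifiers in this section.

v = verify_directory_from_file('/data/incoming',               # hypothetical directory
                               'directory_constraints.tdda')   # hypothetical .tdda file
print('Constraints passing:', v.passes)
print('Constraints failing:', v.failures)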