def test_constraint_repr(self):
    """
    Check ``repr()`` of each constraint kind against its expected text.
    """
    # Each entry pairs a constraint instance with the exact string its
    # repr() is expected to produce.
    cases = [
        (MinConstraint(7),
         'MinConstraint(value=7, precision=None)'),
        (MinConstraint('a'),
         "MinConstraint(value='a', precision=None)"),
        (MinConstraint('a', precision='closed'),
         "MinConstraint(value='a', precision='closed')"),
        (MinLengthConstraint(3),
         "MinLengthConstraint(value=3)"),
        (MaxConstraint(-3),
         'MaxConstraint(value=-3, precision=None)'),
        (MaxConstraint('KJ'),
         "MaxConstraint(value='KJ', precision=None)"),
        (MaxConstraint(4.2, precision='closed'),
         "MaxConstraint(value=4.2, precision='closed')"),
        (MaxLengthConstraint(0),
         "MaxLengthConstraint(value=0)"),
        (SignConstraint('positive'),
         "SignConstraint(value='positive')"),
        (MaxNullsConstraint(0),
         "MaxNullsConstraint(value=0)"),
        (NoDuplicatesConstraint(),
         "NoDuplicatesConstraint(value=True)"),
        (TypeConstraint('int'),
         "TypeConstraint(value='int')"),
        (TypeConstraint(['int', 'real']),
         "TypeConstraint(value=['int', 'real'])"),
        (AllowedValuesConstraint(['a', 'b']),
         "AllowedValuesConstraint(value=['a', 'b'])"),
    ]
    for constraint, expected in cases:
        self.assertEqual(repr(constraint), expected)
def test_verify_min_max_length_constraints(self):
    """
    Run min-length and max-length verification over string columns whose
    value lengths are known, for bounds listed as passing and as failing.
    """
    frame = pd.DataFrame({
        'zero': [''] * 4,
        'zeroOne': ['', 'a', '1', None],
        'one': ['α', 'b', 'c', None],          # non-ASCII; min/max len 1
        'oneTwo': ['a', 'aa', 'bb', None],
        'two': ['αα', 'αα', 'ββ', 'ββ'],       # non-ASCII; min/max len 2
    })

    # (column, min length, max length): both results are checked with
    # isTrue().
    passing = [
        ('zero', 0, 0),
        ('zero', 0, 10),
        ('zeroOne', 0, 1),
        ('zeroOne', 0, 5),
        ('one', 1, 1),
        ('one', 0, 1),
        ('one', 1, 4),
        ('one', 0, 10),
        ('oneTwo', 1, 2),
        ('oneTwo', 0, 2),
        ('oneTwo', 1, 4),
        ('oneTwo', 0, 10),
        ('two', 2, 2),
        ('two', 0, 2),
        ('two', 2, 8),
        ('two', 0, 7),
    ]

    # (column, min length, max length): results are checked with
    # isFalse(). A max of None means no max-length check is made.
    failing = [
        ('zero', 1, None),
        ('zeroOne', 2, 0),
        ('one', 2, 0),
        ('oneTwo', 3, 0),
        ('two', 3, 1),
    ]

    tester = ConstraintVerificationTester(self, frame)

    for column, shortest, longest in passing:
        tester.verify_min_length_constraint(
            column, MinLengthConstraint(shortest)).isTrue()
        tester.verify_max_length_constraint(
            column, MaxLengthConstraint(longest)).isTrue()

    for column, shortest, longest in failing:
        tester.verify_min_length_constraint(
            column, MinLengthConstraint(shortest)).isFalse()
        if longest is None:
            continue
        tester.verify_max_length_constraint(
            column, MaxLengthConstraint(longest)).isFalse()
def discover_field_constraints(self, fieldname):
    """
    Discover constraints for a single field.

    Input:

        *fieldname*:
            Name of the field. It is passed to the ``calc_*`` methods
            and ``find_rexes``, and used to label the result.

    Returns:

        - a ``FieldConstraints`` object for *fieldname* holding every
          constraint that was generated (the type constraint is always
          among them);
        - ``None`` if ``calc_tdda_type`` reports the type ``'other'``.
    """
    min_constraint = max_constraint = None
    min_length_constraint = max_length_constraint = None
    sign_constraint = no_duplicates_constraint = None
    max_nulls_constraint = allowed_values_constraint = None
    rex_constraint = None

    type_ = self.calc_tdda_type(fieldname)
    if type_ == 'other':
        return None         # Unrecognized or complex
    type_constraint = TypeConstraint(type_)
    length = self.get_nrecords()

    if length > 0:  # Things are not very interesting when there is no data
        nNull = self.calc_null_count(fieldname)
        nNonNull = self.calc_non_null_count(fieldname)
        assert nNull + nNonNull == length
        # A nulls limit is only recorded when at most one null was seen.
        if nNull < 2:
            max_nulls_constraint = MaxNullsConstraint(nNull)

        # Useful info:
        uniqs = None
        n_unique = -1   # won't equal number of non-nulls later on
        if type_ in ('string', 'int'):
            n_unique = self.calc_nunique(fieldname)
            if type_ == 'string':
                if n_unique <= MAX_CATEGORIES:
                    uniqs = self.calc_unique_values(fieldname,
                                                    include_nulls=False)
                if uniqs:
                    allowed_values_constraint = AllowedValuesConstraint(uniqs)

        if nNonNull > 0:
            if type_ == 'string':
                # We don't generate min, max or sign constraints for
                # strings. But we do generate min and max length
                # constraints.
                if uniqs is None and n_unique > 0:
                    # There were too many for us to have bothered getting
                    # them all before, but we need them now.
                    uniqs = self.calc_unique_values(fieldname,
                                                    include_nulls=False)
                if uniqs:
                    # Measure length in characters. Text values are used
                    # directly; anything else is decoded from UTF-8.
                    # The check is made per value (not just on the first
                    # one) and accepts subclasses of the text type, which
                    # have no need for (and may lack) a decode method.
                    lengths = [
                        len(v if isinstance(v, unicode_string)
                            else v.decode('UTF-8'))
                        for v in uniqs
                    ]
                    min_length_constraint = MinLengthConstraint(min(lengths))
                    max_length_constraint = MaxLengthConstraint(max(lengths))
            else:
                # Non-string fields all potentially get min and max values
                m = self.calc_min(fieldname)
                M = self.calc_max(fieldname)
                if not self.is_null(m):
                    min_constraint = MinConstraint(m)
                if not self.is_null(M):
                    max_constraint = MaxConstraint(M)

                # Non-date fields potentially get a sign constraint too.
                if min_constraint and max_constraint and type_ != 'date':
                    if m == M == 0:
                        sign_constraint = SignConstraint('zero')
                    elif m >= 0:
                        sign = 'positive' if m > 0 else 'non-negative'
                        sign_constraint = SignConstraint(sign)
                    elif M <= 0:
                        sign = 'negative' if M < 0 else 'non-positive'
                        sign_constraint = SignConstraint(sign)
                    # Otherwise the values are of mixed sign: no constraint.
                elif self.is_null(m) and type_ != 'date':
                    sign_constraint = SignConstraint('null')

        # n_unique stays at -1 unless the type is 'string' or 'int', so
        # only those types can receive a no-duplicates constraint.
        if n_unique == nNonNull and n_unique > 1 and type_ != 'real':
            no_duplicates_constraint = NoDuplicatesConstraint()

        # uniqs is only bound when there is data, so regular-expression
        # discovery is kept inside this branch; for a field with no
        # records there are no values to characterise.
        if type_ == 'string' and self.inc_rex:
            rex_constraint = RexConstraint(
                self.find_rexes(fieldname, values=uniqs))

    candidates = [
        type_constraint,
        min_constraint,
        max_constraint,
        min_length_constraint,
        max_length_constraint,
        sign_constraint,
        max_nulls_constraint,
        no_duplicates_constraint,
        allowed_values_constraint,
        rex_constraint,
    ]
    constraints = [c for c in candidates if c is not None]
    return FieldConstraints(fieldname, constraints)
def discover_field_constraints(field):
    """
    Discover constraints for a single field (column) from a Pandas
    DataFrame.

    Input:

        *field*:
            a single field (column; Series) object, usually from
            a Pandas DataFrame.

    Returns:

        - :py:class:`tdda.base.FieldConstraints` object labelled with
          ``field.name`` and holding every constraint that was generated
          (the type constraint is always among them);
        - ``None`` if ``tdda_type`` reports the type ``'other'``.
    """
    min_constraint = max_constraint = None
    min_length_constraint = max_length_constraint = None
    sign_constraint = no_duplicates_constraint = None
    max_nulls_constraint = allowed_values_constraint = None

    type_ = tdda_type(field)
    if type_ == 'other':
        return None         # Unrecognized or complex
    type_constraint = TypeConstraint(type_)
    length = len(field)

    if length > 0:  # Things are not very interesting when there is no data
        nNull = int(field.isnull().sum().astype(int))
        nNonNull = int(field.notnull().sum().astype(int))
        assert nNull + nNonNull == length
        # A nulls limit is only recorded when at most one null was seen.
        if nNull < 2:
            max_nulls_constraint = MaxNullsConstraint(nNull)

        # Useful info:
        uniqs = None
        n_unique = -1   # won't equal number of non-nulls later on
        if type_ in ('string', 'int'):
            n_unique = field.nunique()          # excludes NaN
            if type_ == 'string':
                if n_unique <= MAX_CATEGORIES:
                    uniqs = list(field.dropna().unique())
                if uniqs:
                    allowed_values_constraint = AllowedValuesConstraint(uniqs)

        if nNonNull > 0:
            if type_ == 'string':
                # We don't generate min, max or sign constraints for strings
                # But we do generate min and max length constraints
                if uniqs is None and n_unique > 0:
                    # There were too many for us to have bothered getting
                    # them all before, but we need them now.
                    uniqs = list(field.dropna().unique())
                if uniqs:
                    # Compute the lengths once and reuse them for both
                    # bounds.
                    lengths = [len(v) for v in uniqs]
                    min_length_constraint = MinLengthConstraint(min(lengths))
                    max_length_constraint = MaxLengthConstraint(max(lengths))
            else:
                # Non-string fields all potentially get min and max values
                m = field.min()
                M = field.max()
                if type_ == 'date':
                    if pd.notnull(m):
                        m = m.to_pydatetime()
                    if pd.notnull(M):
                        M = M.to_pydatetime()
                else:
                    # Convert NumPy scalars to native Python values.
                    # A reduction that already produced a plain Python
                    # value has no item() method and is used unchanged.
                    if hasattr(m, 'item'):
                        m = m.item()
                    if hasattr(M, 'item'):
                        M = M.item()
                if pd.notnull(m):
                    min_constraint = MinConstraint(m)
                if pd.notnull(M):
                    max_constraint = MaxConstraint(M)

                # Non-date fields potentially get a sign constraint too.
                if min_constraint and max_constraint and type_ != 'date':
                    if m == M == 0:
                        sign_constraint = SignConstraint('zero')
                    elif m >= 0:
                        sign = 'positive' if m > 0 else 'non-negative'
                        sign_constraint = SignConstraint(sign)
                    elif M <= 0:
                        sign = 'negative' if M < 0 else 'non-positive'
                        sign_constraint = SignConstraint(sign)
                    # Otherwise the values are of mixed sign: no constraint.
                elif pd.isnull(m) and type_ != 'date':
                    sign_constraint = SignConstraint('null')

        # n_unique stays at -1 unless the type is 'string' or 'int', so
        # only those types can receive a no-duplicates constraint.
        if n_unique == nNonNull and n_unique > 1 and type_ != 'real':
            no_duplicates_constraint = NoDuplicatesConstraint()

    candidates = [
        type_constraint,
        min_constraint,
        max_constraint,
        min_length_constraint,
        max_length_constraint,
        sign_constraint,
        max_nulls_constraint,
        no_duplicates_constraint,
        allowed_values_constraint,
    ]
    constraints = [c for c in candidates if c is not None]
    return FieldConstraints(field.name, constraints)