Beispiel #1
0
 def test_constraint_repr(self):
     # Each pair is (constraint instance, exact text repr() must produce).
     # The expected strings cover the default precision (None), an explicit
     # precision, and both scalar and list-valued constraints.
     expectations = [
         (MinConstraint(7),
          'MinConstraint(value=7, precision=None)'),
         (MinConstraint('a'),
          "MinConstraint(value='a', precision=None)"),
         (MinConstraint('a', precision='closed'),
          "MinConstraint(value='a', precision='closed')"),
         (MinLengthConstraint(3),
          "MinLengthConstraint(value=3)"),
         (MaxConstraint(-3),
          'MaxConstraint(value=-3, precision=None)'),
         (MaxConstraint('KJ'),
          "MaxConstraint(value='KJ', precision=None)"),
         (MaxConstraint(4.2, precision='closed'),
          "MaxConstraint(value=4.2, precision='closed')"),
         (MaxLengthConstraint(0),
          "MaxLengthConstraint(value=0)"),
         (SignConstraint('positive'),
          "SignConstraint(value='positive')"),
         (MaxNullsConstraint(0),
          "MaxNullsConstraint(value=0)"),
         (NoDuplicatesConstraint(),
          "NoDuplicatesConstraint(value=True)"),
         (TypeConstraint('int'),
          "TypeConstraint(value='int')"),
         (TypeConstraint(['int', 'real']),
          "TypeConstraint(value=['int', 'real'])"),
         (AllowedValuesConstraint(['a', 'b']),
          "AllowedValuesConstraint(value=['a', 'b'])"),
     ]
     for constraint, expected_text in expectations:
         self.assertEqual(repr(constraint), expected_text)
Beispiel #2
0
 def test_verify_min_max_length_constraints(self):
     # Columns are named after the string lengths they contain, so the
     # expected outcome of each (min length, max length) pair below can be
     # read straight off the column name.
     frame = pd.DataFrame({
         'zero': ['', '', '', ''],
         'zeroOne': ['', 'a', '1', None],
         # Non-ASCII values: every non-null string is one character long.
         'one': ['α', 'b', 'c', None],
         'oneTwo': ['a', 'aa', 'bb', None],
         # Non-ASCII values: every string is two characters long.
         'two': ['αα', 'αα', 'ββ', 'ββ'],
     })

     # (column, min length, max length) triples where both constraints
     # are expected to verify as true.
     satisfied = [
         ('zero', 0, 0),
         ('zero', 0, 10),
         ('zeroOne', 0, 1),
         ('zeroOne', 0, 5),
         ('one', 1, 1),
         ('one', 0, 1),
         ('one', 1, 4),
         ('one', 0, 10),
         ('oneTwo', 1, 2),
         ('oneTwo', 0, 2),
         ('oneTwo', 1, 4),
         ('oneTwo', 0, 10),
         ('two', 2, 2),
         ('two', 0, 2),
         ('two', 2, 8),
         ('two', 0, 7),
     ]

     # Triples where the constraints are expected to verify as false.
     # A max length of None means "do not check a max length constraint".
     violated = [
         ('zero', 1, None),
         ('zeroOne', 2, 0),
         ('one', 2, 0),
         ('oneTwo', 3, 0),
         ('two', 3, 1),
     ]

     tester = ConstraintVerificationTester(self, frame)

     for column, shortest, longest in satisfied:
         tester.verify_min_length_constraint(
             column, MinLengthConstraint(shortest)).isTrue()
         tester.verify_max_length_constraint(
             column, MaxLengthConstraint(longest)).isTrue()

     for column, shortest, longest in violated:
         tester.verify_min_length_constraint(
             column, MinLengthConstraint(shortest)).isFalse()
         if longest is None:
             continue
         tester.verify_max_length_constraint(
             column, MaxLengthConstraint(longest)).isFalse()
Beispiel #3
0
    def discover_field_constraints(self, fieldname):
        """
        Discover constraints for a single named field.

        Input:

            *fieldname*:
                name of the field to examine; it is passed unchanged to
                this object's ``calc_*`` / ``find_rexes`` helper methods.

        Returns:

            - a ``FieldConstraints`` object containing the type constraint
              plus every other constraint that could be established.
            - ``None`` if the field's type is reported as ``'other'``.

        """
        min_constraint = max_constraint = None
        min_length_constraint = max_length_constraint = None
        sign_constraint = no_duplicates_constraint = None
        max_nulls_constraint = allowed_values_constraint = None
        rex_constraint = None

        # These are bound up front (not inside the "length > 0" branch)
        # because the regular-expression step at the end reads ``uniqs``
        # even when there are no records at all.
        uniqs = None
        n_unique = -1  # won't equal number of non-nulls later on

        type_ = self.calc_tdda_type(fieldname)
        if type_ == 'other':
            return None  # Unrecognized or complex
        type_constraint = TypeConstraint(type_)
        length = self.get_nrecords()

        if length > 0:  # Things are not very interesting when there is no data
            nNull = self.calc_null_count(fieldname)
            nNonNull = self.calc_non_null_count(fieldname)
            assert nNull + nNonNull == length
            # Only a null count of 0 or 1 is turned into a constraint.
            if nNull < 2:
                max_nulls_constraint = MaxNullsConstraint(nNull)

            if type_ in ('string', 'int'):
                n_unique = self.calc_nunique(fieldname)
                if type_ == 'string':
                    if n_unique <= MAX_CATEGORIES:
                        uniqs = self.calc_unique_values(fieldname,
                                                        include_nulls=False)
                    if uniqs:
                        avc = AllowedValuesConstraint(uniqs)
                        allowed_values_constraint = avc

            if nNonNull > 0:
                if type_ == 'string':
                    # We don't generate a min, max or sign constraints for
                    # strings. But we do generate min and max length
                    # constraints
                    if (uniqs is None and n_unique > 0):
                        # There were too many for us to have bothered getting
                        # them all before, but we need them now.
                        uniqs = self.calc_unique_values(fieldname,
                                                        include_nulls=False)
                    if uniqs:
                        # Lengths are counted in characters: text values
                        # are measured directly, anything else is decoded
                        # from UTF-8 first. The check is made per value so
                        # a mixed list is handled consistently.
                        lengths = [
                            len(v) if isinstance(v, unicode_string)
                            else len(v.decode('UTF-8'))
                            for v in uniqs
                        ]
                        min_length_constraint = MinLengthConstraint(
                            min(lengths))
                        max_length_constraint = MaxLengthConstraint(
                            max(lengths))
                else:
                    # Non-string fields all potentially get min and max values
                    m = self.calc_min(fieldname)
                    M = self.calc_max(fieldname)
                    if not self.is_null(m):
                        min_constraint = MinConstraint(m)
                    if not self.is_null(M):
                        max_constraint = MaxConstraint(M)

                    # Non-date fields potentially get a sign constraint too.
                    if min_constraint and max_constraint and type_ != 'date':
                        if m == M == 0:
                            sign_constraint = SignConstraint('zero')
                        elif m >= 0:
                            sign = 'positive' if m > 0 else 'non-negative'
                            sign_constraint = SignConstraint(sign)
                        elif M <= 0:
                            sign = 'negative' if M < 0 else 'non-positive'
                            sign_constraint = SignConstraint(sign)
                        # Otherwise the values have mixed signs and no
                        # sign constraint is generated.
                    elif self.is_null(m) and type_ != 'date':
                        sign_constraint = SignConstraint('null')

            # Real-valued fields never get a no-duplicates constraint;
            # n_unique stays at -1 for them, so the test cannot match.
            if n_unique == nNonNull and n_unique > 1 and type_ != 'real':
                no_duplicates_constraint = NoDuplicatesConstraint()

        if type_ == 'string' and self.inc_rex:
            # NOTE(review): with no records ``uniqs`` is None here;
            # presumably find_rexes accepts values=None -- confirm.
            rex_constraint = RexConstraint(
                self.find_rexes(fieldname, values=uniqs))

        candidates = [
            type_constraint, min_constraint, max_constraint,
            min_length_constraint, max_length_constraint, sign_constraint,
            max_nulls_constraint, no_duplicates_constraint,
            allowed_values_constraint, rex_constraint,
        ]
        constraints = [c for c in candidates if c is not None]
        return FieldConstraints(fieldname, constraints)
Beispiel #4
0
def discover_field_constraints(field):
    """
    Discover constraints for a single field (column) from a Pandas DataFrame.

    Input:

        *field*:
            a single field (column; Series) object, usually from
            a Pandas DataFrame.

    Returns:

        - :py:class:`tdda.base.FieldConstraints` object containing the
          type constraint plus every other constraint that could be
          established for the field.
        - ``None`` if the field's type is reported as ``'other'``.

    """
    min_constraint = max_constraint = None
    min_length_constraint = max_length_constraint = None
    sign_constraint = no_duplicates_constraint = None
    max_nulls_constraint = allowed_values_constraint = None

    type_ = tdda_type(field)
    if type_ == 'other':
        return None         # Unrecognized or complex
    type_constraint = TypeConstraint(type_)
    length = len(field)

    if length > 0:  # Things are not very interesting when there is no data
        nNull = int(field.isnull().sum())
        nNonNull = int(field.notnull().sum())
        assert nNull + nNonNull == length
        # Only a null count of 0 or 1 is turned into a constraint.
        if nNull < 2:
            max_nulls_constraint = MaxNullsConstraint(nNull)

        # Useful info:
        uniqs = None
        n_unique = -1   # won't equal number of non-nulls later on
        if type_ in ('string', 'int'):
            n_unique = field.nunique()          # excludes NaN
            if type_ == 'string':
                if n_unique <= MAX_CATEGORIES:
                    uniqs = list(field.dropna().unique())
                if uniqs:
                    allowed_values_constraint = AllowedValuesConstraint(uniqs)

        if nNonNull > 0:
            if type_ == 'string':
                # We don't generate a min, max or sign constraints for strings
                # But we do generate min and max length constraints
                if uniqs is None and n_unique > 0:
                    # There were too many for us to have bothered getting
                    # them all before, but we need them now.
                    uniqs = list(field.dropna().unique())
                if uniqs:
                    lengths = [len(v) for v in uniqs]
                    min_length_constraint = MinLengthConstraint(min(lengths))
                    max_length_constraint = MaxLengthConstraint(max(lengths))
            else:
                # Non-string fields all potentially get min and max values
                m = field.min()
                M = field.max()
                if type_ == 'date':
                    # Convert non-null pandas timestamps to plain datetimes.
                    if pd.notnull(m):
                        m = m.to_pydatetime()
                    if pd.notnull(M):
                        M = M.to_pydatetime()
                else:
                    # Unwrap NumPy scalars to native Python values. The
                    # reduction can also hand back a plain Python value
                    # (which has no .item()), so only unwrap when possible.
                    if hasattr(m, 'item'):
                        m = m.item()
                    if hasattr(M, 'item'):
                        M = M.item()
                if pd.notnull(m):
                    min_constraint = MinConstraint(m)
                if pd.notnull(M):
                    max_constraint = MaxConstraint(M)

                # Non-date fields potentially get a sign constraint too.
                if min_constraint and max_constraint and type_ != 'date':
                    if m == M == 0:
                        sign_constraint = SignConstraint('zero')
                    elif m >= 0:
                        sign = 'positive' if m > 0 else 'non-negative'
                        sign_constraint = SignConstraint(sign)
                    elif M <= 0:
                        sign = 'negative' if M < 0 else 'non-positive'
                        sign_constraint = SignConstraint(sign)
                    # Otherwise the values have mixed signs and no sign
                    # constraint is generated.
                elif pd.isnull(m) and type_ != 'date':
                    sign_constraint = SignConstraint('null')

        # Real-valued fields never get a no-duplicates constraint;
        # n_unique stays at -1 for them, so the test cannot match.
        if n_unique == nNonNull and n_unique > 1 and type_ != 'real':
            no_duplicates_constraint = NoDuplicatesConstraint()

    candidates = [
        type_constraint,
        min_constraint, max_constraint,
        min_length_constraint, max_length_constraint,
        sign_constraint, max_nulls_constraint,
        no_duplicates_constraint,
        allowed_values_constraint,
    ]
    constraints = [c for c in candidates if c is not None]
    return FieldConstraints(field.name, constraints)