Esempio n. 1
0
    def test_string_nonregex_from_json_dict(self):
        """Exercise a pattern-free string spec built from a JSON-style dict.

        First the spec is built without length limits, then with
        'minLength'/'maxLength' added, and finally with a missing-value
        sentinel in the hashing section.
        """
        string_format = {
            # 'minLength' and 'maxLength' are deliberately left out here.
            'type': 'string',
            'encoding': 'utf-8',
            'description': 'bar',
        }
        spec_dict = {
            'identifier': 'noRegex',
            'format': string_format,
            'hashing': {'ngram': 1, 'positional': True, 'weight': 0},
        }

        spec = field_formats.spec_from_json_dict(spec_dict)

        # Without explicit limits both bounds are expected to be None.
        self.assertIsNone(spec.min_length)
        self.assertIsNone(spec.max_length)

        # No length limits, so both extremes must validate.
        for unbounded in ('', 'doggo' * 10000):
            spec.validate(unbounded)

        # Now add the limits (same nested dict object that spec_dict holds)
        # and rebuild the spec.
        spec_dict['format']['minLength'] = 5
        spec_dict['format']['maxLength'] = 8
        spec = field_formats.spec_from_json_dict(spec_dict)

        # Too short or too long: these must all be rejected now.
        for out_of_range in ('', 'dogs', 'doggodogs', 'doggo' * 10000):
            with self.assertRaises(field_formats.InvalidEntryError):
                spec.validate(out_of_range)

        # Lengths 5, 6 and 8 are within the limits.
        for in_range in ('doggo', 'doggos', 'doggies!'):
            spec.validate(in_range)

        # Non-ASCII text is acceptable because the encoding is utf-8.
        spec.validate(u'doggøs')

        # Metadata carried over from the dict.
        self.assertEqual(spec.identifier, 'noRegex')
        self.assertEqual(spec.description, 'bar')

        # Hashing settings carried over from the dict.
        hashing = spec.hashing_properties
        self.assertEqual(hashing.ngram, 1)
        self.assertIs(hashing.positional, True)
        self.assertEqual(hashing.weight, 0)

        # With a missing-value sentinel configured, the sentinel itself
        # must validate (even though it is shorter than minLength).
        spec_dict['hashing']['missingValue'] = {'sentinel': 'N/A'}
        spec = field_formats.spec_from_json_dict(spec_dict)
        spec.validate('N/A')
 def test_ignored(self):
     """Check how the 'ignored' flag steers spec_from_json_dict.

     Covers four dicts: ignored with nothing else, not ignored with
     nothing else (must be rejected), ignored with format/hashing
     present, and not ignored with format/hashing present.
     """
     # ignored=True and nothing else: an Ignore spec is produced.
     bare_ignored = dict(identifier='testingIgnored', ignored=True)
     spec = field_formats.spec_from_json_dict(bare_ignored)
     self.assertIsInstance(spec, field_formats.Ignore)
     self.assertEqual(spec.identifier, 'testingIgnored')

     # ignored=False without format/hashing is not a usable schema.
     bare_not_ignored = dict(identifier='testingIgnored', ignored=False)
     with self.assertRaises(field_formats.InvalidSchemaError):
         field_formats.spec_from_json_dict(bare_not_ignored)

     # ignored=True still yields Ignore when format/hashing are given.
     ignored_dates = dict(
         identifier='ignoredDates',
         ignored=True,
         format=dict(type='date', format='%Y-%m-%d'),
         hashing=dict(ngram=0, strategy=dict(k=20)),
     )
     spec = field_formats.spec_from_json_dict(ignored_dates)
     self.assertIsInstance(spec, field_formats.Ignore)
     self.assertEqual(spec.identifier, 'ignoredDates')

     # ignored=False with format/hashing yields the regular date spec.
     active_dates = dict(
         identifier='notIgnoredDates',
         ignored=False,
         format=dict(type='date', format='%Y-%m-%d'),
         hashing=dict(ngram=0, strategy=dict(k=20)),
     )
     spec = field_formats.spec_from_json_dict(active_dates)
     self.assertIsNotNone(spec.hashing_properties)
     self.assertIsInstance(spec, field_formats.DateSpec)
     self.assertEqual(spec.identifier, 'notIgnoredDates')
Esempio n. 3
0
    def test_string_regex(self):
        """Exercise a pattern-based string spec with ASCII encoding.

        Checks that a malformed pattern is rejected at construction time,
        that entries are matched against a valid pattern, and that the
        missing-value sentinel and its optional replacement behave as
        asserted below.
        """
        regex_spec = {
            'identifier': 'regex',
            'format': {
                'type': 'string',
                'encoding': 'ascii',
                'pattern': r'[5-9',  # Unterminated character class.
                'description': 'foo',
            },
            'hashing': {'ngram': 1},
        }

        # A syntactically broken pattern must be refused.
        with self.assertRaises(field_formats.InvalidSchemaError):
            field_formats.spec_from_json_dict(regex_spec)

        # Swap in a valid pattern; construction must now succeed.
        regex_spec['format']['pattern'] = r'dog(.dog)*'
        spec = field_formats.spec_from_json_dict(regex_spec)

        # Entries matching the pattern.
        for matching in ('dog', 'dogodog'):
            spec.validate(matching)

        # Entries that do not match the pattern.
        for mismatching in ('dogs', 'hot dog', 'hot dogs'):
            with self.assertRaises(field_formats.InvalidEntryError):
                spec.validate(mismatching)

        # 'ø' cannot be represented in ASCII, the configured encoding.
        with self.assertRaises(field_formats.InvalidEntryError):
            spec.validate(u'dogødog')

        # Metadata carried over from the dict.
        self.assertEqual(spec.identifier, 'regex')
        self.assertEqual(spec.description, 'foo')

        # Hashing settings; 'positional' and 'weight' were not supplied.
        hashing = spec.hashing_properties
        self.assertEqual(hashing.ngram, 1)
        self.assertIs(hashing.positional, False)
        self.assertEqual(hashing.weight, 1)

        # Configure a sentinel for missing values and rebuild.
        regex_spec['hashing']['missingValue'] = {'sentinel': 'null'}
        spec = field_formats.spec_from_json_dict(regex_spec)
        # The sentinel validates although it does not match the pattern.
        spec.validate('null')
        self.assertTrue(spec.is_missing_value('null'))
        self.assertFalse(spec.is_missing_value('dog'))
        # Without 'replaceWith', values pass through unchanged.
        self.assertEqual(
            'null', spec.hashing_properties.replace_missing_value('null'))
        self.assertEqual(
            'dog', spec.hashing_properties.replace_missing_value('dog'))

        # With 'replaceWith', the sentinel is swapped for the replacement.
        regex_spec['hashing']['missingValue']['replaceWith'] = 'cat'
        spec = field_formats.spec_from_json_dict(regex_spec)
        self.assertEqual(
            'cat', spec.hashing_properties.replace_missing_value('null'))
Esempio n. 4
0
    def test_enum(self):
        """Exercise an enum spec whose hashing section carries a weight.

        Verifies membership validation, metadata, hashing settings and the
        missing-value sentinel with a replacement.
        """
        spec_dict = {
            'identifier': 'testingAllTheEnums',
            'format': {
                'type': 'enum',
                'values': ['dogs', 'cats', u'fërrets'],
                'description': 'fizz',
            },
            'hashing': {'ngram': 2, 'positional': False, 'weight': 2.57},
        }

        spec = field_formats.spec_from_json_dict(spec_dict)

        # Every listed value validates, including the non-ASCII one.
        for member in ('dogs', 'cats', u'fërrets'):
            spec.validate(member)

        # Anything outside the listed values is rejected.
        for outsider in ('', 'mice', 'snakes', 'dogsdogs'):
            with self.assertRaises(field_formats.InvalidEntryError):
                spec.validate(outsider)

        # Metadata carried over from the dict.
        self.assertEqual(spec.identifier, 'testingAllTheEnums')
        self.assertEqual(spec.description, 'fizz')

        # Hashing settings carried over from the dict.
        hashing = spec.hashing_properties
        self.assertEqual(hashing.ngram, 2)
        self.assertIs(hashing.positional, False)
        self.assertEqual(hashing.weight, 2.57)

        # Declare the empty string as the missing-value sentinel.
        spec_dict['hashing']['missingValue'] = {
            'sentinel': '', 'replaceWith': 'omg'}
        spec = field_formats.spec_from_json_dict(spec_dict)
        # The empty string was rejected above; as sentinel it validates.
        spec.validate('')
        # Missing-value helpers on the spec and its hashing properties.
        self.assertTrue(spec.is_missing_value(''))
        self.assertFalse(spec.is_missing_value('no WAY'))
        configured_replacement = (
            spec.hashing_properties.missing_value.replace_with)
        self.assertEqual(
            configured_replacement,
            spec.hashing_properties.replace_missing_value(''))
    def test_enum(self):
        """Exercise an enum spec whose hashing section uses a k strategy.

        Verifies membership validation, metadata, hashing settings
        (including ``k``) and the missing-value sentinel with a
        replacement.
        """
        spec_dict = dict(
            identifier='testingAllTheEnums',
            format=dict(
                type='enum',
                values=['dogs', 'cats', u'fërrets'],
                description='fizz'),
            hashing=dict(ngram=2, strategy=dict(k=20)))

        spec = field_formats.spec_from_json_dict(spec_dict)

        # Every listed value validates, including the non-ASCII one.
        for member in ['dogs', 'cats', u'fërrets']:
            spec.validate(member)

        # Anything outside the listed values is rejected.
        for outsider in ['', 'mice', 'snakes', 'dogsdogs']:
            with self.assertRaises(field_formats.InvalidEntryError):
                spec.validate(outsider)

        # Metadata carried over from the dict.
        self.assertEqual(spec.identifier, 'testingAllTheEnums')
        self.assertEqual(spec.description, 'fizz')

        # Hashing settings; 'positional' was not supplied in the dict.
        hashing = spec.hashing_properties
        self.assertEqual(hashing.ngram, 2)
        self.assertIs(hashing.positional, False)
        self.assertEqual(hashing.k, 20)

        # Declare the empty string as the missing-value sentinel.
        spec_dict['hashing']['missingValue'] = {
            'sentinel': '', 'replaceWith': 'omg'}
        spec = field_formats.spec_from_json_dict(spec_dict)
        # The empty string was rejected above; as sentinel it validates.
        spec.validate('')
        # Missing-value helpers on the spec and its hashing properties.
        self.assertTrue(spec.is_missing_value(''))
        self.assertFalse(spec.is_missing_value('no WAY'))
        configured_replacement = (
            spec.hashing_properties.missing_value.replace_with)
        self.assertEqual(
            configured_replacement,
            spec.hashing_properties.replace_missing_value(''))
Esempio n. 6
0
def from_json_dict(dct, validate=True):
    # type: (Dict[str, Any], bool) -> Schema
    """ Build a Schema from a schema dictionary.

    The dictionary is first passed through `convert_to_latest_version`,
    so a dict written against an older schema version is upgraded before
    it is read.

    :param dct: The schema dictionary. After conversion it is read
            through its `'features'` key (a sequence of per-field
            specs) and its `'clkConfig'` key (which must provide `'l'`
            and a `'kdf'` dict with at least `'type'`).
            NOTE(review): the keys required *before* conversion (e.g.
            `'version'`) are enforced by the helpers, not here.
    :param validate: (default True) When true, `validate_schema_dict`
            is run on the dict both before and after conversion; it is
            expected to raise if the dict is not a valid schema.
    :raises SchemaError: If one or more feature specs cannot be
            parsed. The exception is given the list of the individual
            `InvalidSchemaError`s, each annotated with the index and
            the JSON of the offending feature.
    :return: the Schema
    """
    def _decode_optional(encoded):
        # base64-decode `encoded`, passing None through untouched.
        return None if encoded is None else base64.b64decode(encoded)

    if validate:
        # Check the dict as supplied (possibly an older version).
        validate_schema_dict(dct)
    dct = convert_to_latest_version(dct)
    if validate:
        # Check again now that the dict has been converted.
        validate_schema_dict(dct)

    clk_config = dct['clkConfig']
    l_param = clk_config['l']
    xor_folds = clk_config.get('xor_folds', 0)

    kdf = clk_config['kdf']
    kdf_type = kdf['type']
    kdf_hash = kdf.get('hash', 'SHA256')
    # 'info' and 'salt' are optional and stored base64-encoded.
    kdf_info = _decode_optional(kdf.get('info'))
    kdf_salt = _decode_optional(kdf.get('salt'))
    kdf_key_size = kdf.get('keySize', DEFAULT_KDF_KEY_SIZE)

    # Parse every feature, collecting failures instead of stopping at
    # the first one so that all of them can be reported together.
    parsed_features = []
    problems = []
    for index, feature_dict in enumerate(dct['features']):
        try:
            parsed = spec_from_json_dict(feature_dict)
        except InvalidSchemaError as err:
            err.field_spec_index = index
            err.json_field_spec = feature_dict
            problems.append(err)
        else:
            parsed_features.append(parsed)

    if problems:
        raise SchemaError("Schema was invalid", problems)

    return Schema(parsed_features, l_param, xor_folds, kdf_type, kdf_hash,
                  kdf_info, kdf_salt, kdf_key_size)
Esempio n. 7
0
    def test_date_output_formatting(self):
        """A date spec re-emits values in ``DateSpec.OUTPUT_FORMAT``.

        Today's date is rendered in the spec's (unusual) input format,
        passed through ``format_value``, and compared with the same date
        rendered in ``DateSpec.OUTPUT_FORMAT``.
        """
        from datetime import date
        from clkhash.field_formats import DateSpec

        date_spec = dict(
            identifier='dates',
            format=dict(
                type='date',
                format='%Y:%m-%d'),
            hashing=dict(ngram=0))

        spec = field_formats.spec_from_json_dict(date_spec)
        today = date.today()
        as_input = today.strftime(date_spec['format']['format'])

        # Use the TestCase assertion rather than a bare `assert`: bare
        # asserts are removed under `python -O` (the test would then pass
        # without checking anything) and give no diff on failure.
        self.assertEqual(spec.format_value(as_input),
                         today.strftime(DateSpec.OUTPUT_FORMAT))
Esempio n. 8
0
    def test_string_default_encoding_nonregex(self):
        """A pattern-free string spec without 'encoding' uses utf-8."""
        spec_dict = {
            'identifier': 'stringWithoutEncoding',
            'format': {'type': 'string'},
            'hashing': {'ngram': 1, 'positional': True, 'weight': 0},
        }

        spec = field_formats.spec_from_json_dict(spec_dict)

        # All of these validate; the last one needs a non-ASCII encoding.
        for text in ('dogs', 'cats', u'fërrets'):
            spec.validate(text)

        # The encoding that was filled in because none was given.
        self.assertEqual(spec.hashing_properties.encoding, 'utf-8')
Esempio n. 9
0
    def test_string_default_encoding_regex(self):
        """A pattern-based string spec without 'encoding' uses utf-8."""
        spec_dict = {
            'identifier': 'stringWithoutEncoding',
            'format': {'type': 'string', 'pattern': 'f.+'},
            'hashing': {'ngram': 1, 'positional': True, 'weight': 0},
        }

        spec = field_formats.spec_from_json_dict(spec_dict)

        # Both match 'f.+'; the second one needs a non-ASCII encoding.
        for matching in ('fur', u'fërrets'):
            spec.validate(matching)

        # Neither of these starts with 'f', so the pattern rejects them.
        for mismatching in ('cats', 'dogs'):
            with self.assertRaises(field_formats.InvalidEntryError):
                spec.validate(mismatching)

        # The encoding that was filled in because none was given.
        self.assertEqual(spec.hashing_properties.encoding, 'utf-8')
Esempio n. 10
0
    def test_date(self):
        """Exercise a '%Y-%m-%d' date spec whose hashing carries a weight.

        Verifies acceptance of real dates, rejection of out-of-range
        components, malformed strings and impossible calendar dates, plus
        metadata, hashing settings and behaviour with a broken format.
        """
        date_spec = {
            'identifier': 'dates',
            'format': {
                'type': 'date',
                'format': '%Y-%m-%d',
                'description': 'phoenix dactylifera',
            },
            'hashing': {'ngram': 0, 'positional': False, 'weight': 1},
        }

        spec = field_formats.spec_from_json_dict(date_spec)

        # Well-formed, existing dates.
        for good in ('1946-06-14', '1977-12-31', '1981-10-30',
                     '2006-03-20'):
            spec.validate(good)

        # Right shape, but a year, month or day component out of range.
        out_of_range = ('0000-03-20', '2006-00-20', '2006-13-20',
                        '2006-03-00', '2006-03-52')
        for bad in out_of_range:
            with self.assertRaises(field_formats.InvalidEntryError):
                spec.validate(bad)

        # Strings that do not follow the '%Y-%m-%d' layout at all.
        malformed = ('194-06-14', '1946--06-14', '194606-14',
                     '1946-06--14', '1946-0614', '2006-3-20d',
                     'd2006-3-20', '', 'asdfghjkl', '20-03-2006')
        for bad in malformed:
            with self.assertRaises(field_formats.InvalidEntryError):
                spec.validate(bad)

        # Month-end and leap-day dates that do exist.
        for good in ('2017-12-31', '2017-02-28', '2017-03-30',
                     '2016-02-29', '2000-02-29'):
            spec.validate(good)

        # Month-end and leap-day dates that do not exist.
        for bad in ('2017-11-31', '2017-02-29', '2016-02-30',
                    '1900-02-29'):
            with self.assertRaises(field_formats.InvalidEntryError):
                spec.validate(bad)

        # Metadata carried over from the dict.
        self.assertEqual(spec.identifier, 'dates')
        self.assertEqual(spec.description, 'phoenix dactylifera')

        # Hashing settings carried over from the dict.
        hashing = spec.hashing_properties
        self.assertEqual(hashing.ngram, 0)
        self.assertIs(hashing.positional, False)
        self.assertEqual(hashing.weight, 1)

        # A broken format string must surface as InvalidEntryError
        # rather than some other exception type.
        spec.format = 'invalid%'
        with self.assertRaises(field_formats.InvalidEntryError):
            spec.validate('2018-01-23')
Esempio n. 11
0
    def test_integer(self):
        """Exercise an integer spec whose hashing relies on default weight.

        Runs first without bounds, then with 'minimum'/'maximum' added,
        and finally with a missing-value sentinel and replacement.
        """
        integer_spec = {
            'identifier': 'Z',
            'format': {
                # 'minimum' and 'maximum' are deliberately left out.
                'type': 'integer',
                'description': 'buzz',
            },
            'hashing': {'ngram': 1, 'positional': True},
        }

        spec = field_formats.spec_from_json_dict(integer_spec)

        # Without explicit bounds both are expected to be None.
        self.assertIsNone(spec.minimum)
        self.assertIsNone(spec.maximum)

        # Unbounded: negative, zero, small and enormous integers all pass.
        for unbounded in ('-31', '0', '1', '10', str(10 ** 321)):
            spec.validate(unbounded)

        # Floating-point renderings are not integers.
        for floating in (str(math.pi), str(-math.pi)):
            with self.assertRaises(field_formats.InvalidEntryError):
                spec.validate(floating)

        # Padding and an explicit plus sign are tolerated and formatted
        # to the same canonical '10'.
        for variant in ('  10', '10  ', '+10', ' +10 '):
            spec.validate(variant)
            self.assertEqual('10', spec.format_value(variant))

        # Now add the bounds and rebuild the spec.
        integer_spec['format']['minimum'] = 8
        integer_spec['format']['maximum'] = 12
        spec = field_formats.spec_from_json_dict(integer_spec)

        # Below the minimum (or not an integer at all), then far above
        # the maximum.
        rejected = ('-1', str(-math.pi), '0', '1', str(10 ** 321))
        for bad in rejected:
            with self.assertRaises(field_formats.InvalidEntryError):
                spec.validate(bad)

        # Both bounds and a value in between are accepted.
        for within in ('8', '9', '12'):
            spec.validate(within)

        # Metadata carried over from the dict.
        self.assertEqual(spec.identifier, 'Z')
        self.assertEqual(spec.description, 'buzz')

        # Hashing settings; 'weight' was not supplied in the dict.
        hashing = spec.hashing_properties
        self.assertEqual(hashing.ngram, 1)
        self.assertIs(hashing.positional, True)
        self.assertEqual(hashing.weight, 1)

        # Configure a sentinel with a replacement and rebuild.
        integer_spec['hashing']['missingValue'] = {
            'sentinel': 'None', 'replaceWith': '42'}
        spec = field_formats.spec_from_json_dict(integer_spec)
        # The sentinel validates although it is not an integer string.
        spec.validate('None')
        self.assertEqual(
            '42', spec.hashing_properties.replace_missing_value('None'))
    def test_date(self):
        """Exercise a '%Y-%m-%d' date spec whose hashing uses a k strategy.

        Verifies acceptance of real dates, rejection of out-of-range
        components, malformed strings and impossible calendar dates, plus
        metadata, hashing settings and behaviour with a broken format.
        """
        json_spec = dict(
            identifier='dates',
            format=dict(
                type='date',
                format='%Y-%m-%d',
                description='phoenix dactylifera'),
            hashing=dict(ngram=0, strategy=dict(k=20)))

        spec = field_formats.spec_from_json_dict(json_spec)

        # Well-formed, existing dates.
        for good in ['1946-06-14', '1977-12-31', '1981-10-30',
                     '2006-03-20']:
            spec.validate(good)

        # Right shape, but a year, month or day component out of range.
        out_of_range = ['0000-03-20', '2006-00-20', '2006-13-20',
                        '2006-03-00', '2006-03-52']
        for bad in out_of_range:
            with self.assertRaises(field_formats.InvalidEntryError):
                spec.validate(bad)

        # Strings that do not follow the '%Y-%m-%d' layout at all.
        malformed = ['194-06-14', '1946--06-14', '194606-14',
                     '1946-06--14', '1946-0614', '2006-3-20d',
                     'd2006-3-20', '', 'asdfghjkl', '20-03-2006']
        for bad in malformed:
            with self.assertRaises(field_formats.InvalidEntryError):
                spec.validate(bad)

        # Month-end and leap-day dates that do exist.
        for good in ['2017-12-31', '2017-02-28', '2017-03-30',
                     '2016-02-29', '2000-02-29']:
            spec.validate(good)

        # Month-end and leap-day dates that do not exist.
        for bad in ['2017-11-31', '2017-02-29', '2016-02-30',
                    '1900-02-29']:
            with self.assertRaises(field_formats.InvalidEntryError):
                spec.validate(bad)

        # Metadata carried over from the dict.
        self.assertEqual(spec.identifier, 'dates')
        self.assertEqual(spec.description, 'phoenix dactylifera')

        # Hashing settings; 'positional' was not supplied in the dict.
        hashing = spec.hashing_properties
        self.assertEqual(hashing.ngram, 0)
        self.assertIs(hashing.positional, False)
        self.assertEqual(hashing.k, 20)

        # A broken format string must surface as InvalidEntryError
        # rather than some other exception type.
        spec.format = 'invalid%'
        with self.assertRaises(field_formats.InvalidEntryError):
            spec.validate('2018-01-23')
    def test_integer(self):
        """Exercise an integer spec whose hashing uses a k strategy.

        Runs first without bounds, then with 'minimum'/'maximum' added,
        and finally with a missing-value sentinel and replacement.
        """
        json_spec = dict(
            identifier='Z',
            format=dict(
                # 'minimum' and 'maximum' are deliberately left out.
                type='integer',
                description='buzz'),
            hashing=dict(
                ngram=1,
                strategy=dict(k=20),
                positional=True))

        spec = field_formats.spec_from_json_dict(json_spec)

        # Without explicit bounds both are expected to be None.
        self.assertIsNone(spec.minimum)
        self.assertIsNone(spec.maximum)

        # Unbounded: negative, zero, small and enormous integers all pass.
        for unbounded in ['-31', '0', '1', '10', str(10 ** 321)]:
            spec.validate(unbounded)

        # Floating-point renderings and plain words are not integers.
        for not_an_int in [str(math.pi), str(-math.pi), 'boom']:
            with self.assertRaises(field_formats.InvalidEntryError):
                spec.validate(not_an_int)
        # Formatting (as opposed to validating) a non-integer raises
        # ValueError.
        with self.assertRaises(ValueError):
            spec.format_value('boom')

        # Padding and an explicit plus sign are tolerated and formatted
        # to the same canonical '10'.
        for variant in ['  10', '10  ', '+10', ' +10 ']:
            spec.validate(variant)
            self.assertEqual('10', spec.format_value(variant))

        # Now add the bounds and rebuild the spec.
        json_spec['format']['minimum'] = 8
        json_spec['format']['maximum'] = 12
        spec = field_formats.spec_from_json_dict(json_spec)

        # Below the minimum (or not an integer at all), then far above
        # the maximum.
        rejected = ['-1', str(-math.pi), '0', '1', str(10 ** 321)]
        for bad in rejected:
            with self.assertRaises(field_formats.InvalidEntryError):
                spec.validate(bad)

        # Both bounds and a value in between are accepted.
        for within in ['8', '9', '12']:
            spec.validate(within)

        # Metadata carried over from the dict.
        self.assertEqual(spec.identifier, 'Z')
        self.assertEqual(spec.description, 'buzz')

        # Hashing settings carried over from the dict.
        hashing = spec.hashing_properties
        self.assertEqual(hashing.ngram, 1)
        self.assertIs(hashing.positional, True)
        self.assertEqual(hashing.k, 20)

        # Configure a sentinel with a replacement and rebuild.
        json_spec['hashing']['missingValue'] = {
            'sentinel': 'None', 'replaceWith': '42'}
        spec = field_formats.spec_from_json_dict(json_spec)
        # The sentinel validates although it is not an integer string.
        spec.validate('None')
        self.assertEqual(
            '42', spec.hashing_properties.replace_missing_value('None'))