Example 1
from clkhash import schema
from clkhash.clk import generate_clks


def test_missing_value_integration():
    # We create two CLKs: one from PII containing the 'replaceWith' values,
    # and one containing the sentinels. If everything goes right, the two
    # CLKs will be identical.
    schema_dict = dict(
        version=1,
        clkConfig=dict(l=1024,
                       k=20,
                       hash=dict(type='doubleHash'),
                       kdf=dict(type='HKDF')),
        features=[
            dict(identifier='name',
                 format=dict(type='string', encoding='utf-8'),
                 hashing=dict(ngram=2,
                              missingValue=dict(sentinel='null',
                                                replaceWith='Bob'))),
            dict(identifier='age',
                 format=dict(type='integer'),
                 hashing=dict(ngram=1,
                              missingValue=dict(sentinel='NA',
                                                replaceWith='42')))
        ])
    s = schema.from_json_dict(schema_dict)

    pii = [['Bob', '42'], ['null', 'NA']]

    clks = generate_clks(pii, schema=s, keys=('sec1', 'sec2'))
    assert len(clks) == 2
    assert clks[0] == clks[1]
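
If a feature's raw value equals its 'sentinel', clkhash substitutes the
'replaceWith' value before tokenisation, so the row ['null', 'NA'] is encoded
exactly like ['Bob', '42']; that is what the final equality assertion checks.
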
Example 2
import json

from clkhash import schema
from clkhash.clk import generate_clks


def test_missing_value_integration():
    # We create two CLKs: one from PII containing the 'replaceWith' values,
    # and one containing the sentinels. If everything goes right, the two
    # CLKs will be identical. This variant uses a version 2 schema.

    schema_json = """
    {
      "version": 2,
      "clkConfig": {
        "l": 1024,
        "kdf": {
          "type": "HKDF"
        }
      },
      "features": [
        {
          "identifier": "name",
          "format": {
            "type": "string",
            "encoding": "utf-8"
          },
          "hashing": {
            "ngram": 2,
            "strategy": {
              "k": 20
            },
            "missingValue": {
              "sentinel": "null",
              "replaceWith": "Bob"
            }
          }
        },
        {
          "identifier": "age",
          "format": {
            "type": "integer"
          },
          "hashing": {
            "ngram": 1,
            "strategy": {
              "k": 20
            },
            "missingValue": {
              "sentinel": "NA",
              "replaceWith": "42"
            }
          }
        }
      ]
    }
    """
    schema_dict = json.loads(schema_json)
    s = schema.from_json_dict(schema_dict)

    pii = [['Bob', '42'], ['null', 'NA']]

    clks = generate_clks(pii, schema=s, keys=('sec1', 'sec2'))
    assert len(clks) == 2
    assert clks[0] == clks[1]
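
Note the difference from Example 1: in a version 1 schema, 'k' and the 'hash'
type sit in the top-level clkConfig, whereas in this version 2 schema each
feature carries its own hashing 'strategy' (here 'k': 20 per feature) and
there is no global 'hash' entry.
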
Example 3
    def test_validation_of_illdefined_not_ignored_feature(self):
        # 'ignored' has to be true if 'format' and 'hashing' are missing
        schema_dict = {
            'version': 2,
            'clkConfig': {
                'l': 1024,
                'kdf': {
                    'type': 'HKDF'
                }
            },
            'features': [{
                'identifier': 'rec_id',
                'ignored': False
            }]
        }
        with self.assertRaises(Exception) as contextmanager:
            schema.from_json_dict(schema_dict)

        exception = contextmanager.exception
        self.assertIsInstance(exception, SchemaError)
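
For contrast, a minimal sketch of the complementary, valid case: with
'ignored' set to True a feature may omit both 'format' and 'hashing', and the
schema parses (assuming the same clkhash imports as in the earlier examples).

from clkhash import schema

schema_dict = {
    'version': 2,
    'clkConfig': {'l': 1024, 'kdf': {'type': 'HKDF'}},
    'features': [{'identifier': 'rec_id', 'ignored': True}],
}

# Expected to parse without raising SchemaError.
schema.from_json_dict(schema_dict)
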
Example 4
    def test_expected_number_of_encodings_returned(self):
        loaded_schema = schema.from_json_dict(self.SCHEMA_DICT)

        results = clk.generate_clk_from_csv(io.StringIO(self.CSV_INPUT),
                                            self.SECRET,
                                            loaded_schema,
                                            validate=True,
                                            header=True,
                                            progress_bar=False)

        assert len(results) == 3
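
The class fixtures are elided from this excerpt. The following stand-ins are
hypothetical (not the originals), merely one shape consistent with the
assertion: a two-feature version 2 schema, a header row plus three data rows,
and a single secret.

# Hypothetical fixtures for illustration only.
SCHEMA_DICT = {
    'version': 2,
    'clkConfig': {'l': 1024, 'kdf': {'type': 'HKDF'}},
    'features': [
        {'identifier': 'name',
         'format': {'type': 'string', 'encoding': 'utf-8'},
         'hashing': {'ngram': 2, 'strategy': {'k': 20}}},
        {'identifier': 'age',
         'format': {'type': 'integer'},
         'hashing': {'ngram': 1, 'strategy': {'k': 20}}},
    ],
}
CSV_INPUT = 'name,age\nAlice,30\nBob,42\nCarol,57\n'  # header + 3 data rows
SECRET = 'secret'
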
Example 5
    def test_encoding_regression(self):
        loaded_schema = schema.from_json_dict(self.SCHEMA_DICT)

        results = clk.generate_clk_from_csv(
            io.StringIO(self.CSV_INPUT),
            self.KEYS,
            loaded_schema,
            validate=True,
            header=True,
            progress_bar=False)

        assert results[0] == 'THHkzVWFYtzMJzmWobTLN8k8VwRN8+na10bN3N9I9oDPGuRZLGpV/QXZYtRZ6/wc+K3W9wvmDA2KpHmOTlVAY9jDblysQ9zlR86OMSbBn+uG3Qxi8EDpUN6nSI5FfOK1Zt77J0ye8P3wifF6QdkFfm3UXNGWil7CPNnUa/fHG0w='
        assert results[1] == '/r76/u//7+1O/3bG//7N5t3evpe/Wt7+v/f/Xt/+9rpXW//f/p7/v//3/vv7v/7/fv7X//vf3Vf/9vP//nd/3t93dt7/dPr/fj7f1z5B3/7W1u/qr+b3//q6729n6/au7772TPz+2s3u/n/88/9OTG/PxvrOh/7Hb89cz+Z3vmo='
Example 6
    def test_encoding_regression(self):
        loaded_schema = schema.from_json_dict(self.SCHEMA_DICT)

        results = clk.generate_clk_from_csv(io.StringIO(self.CSV_INPUT),
                                            self.SECRET,
                                            loaded_schema,
                                            validate=True,
                                            header=True,
                                            progress_bar=False)

        assert results[0] == 'SU9+/O/Jzzi0sfzH8K2l3+qfhn8Ky3jVI21DVdH9j2fXE++JH8GcQGSeYxDZFxALCAT8CHwYJyQcRT3MhUQOFWcOf5fWdr6ofh6DYy8iv////weyunbMahfV9RMWkRwQmBL3fjreUVOCS9D9kAbQC2XgULidKCTHd9ZpbPJ91eE='
        assert results[1] == 'Pfl1/d7/31/+9u9x9zv//76/83//0v1Xt/dX/3X/e79XP7vd+Xfkf//2/9Xb/7Fd73e9f/n0f/c7Vb99B/X29d8997Pz/vJ87X/X/vcX9vt1d+/+5bP1fvfevnfX8d/f/j0XPL7f999kc/28/3d4c7t/9b/+Pf411/f2+3z1d/s='
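
Examples 5 and 6 are regression tests: they pin the exact Base64-encoded CLK
output, so any unintended change to tokenisation, hashing, or key derivation
surfaces as a failed byte-for-byte comparison.
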
Example 7
    def test_issue_111(self):
        schema_dict = {
            'version': 1,
            'clkConfig': {
                'l': 1024,
                'k': 20,
                'hash': {
                    'type': 'doubleHash'
                },
                'kdf': {
                    'type': 'HKDF'
                }
            },
            'features': [{
                'identifier': 'rec_id',
                'ignored': True
            }, {
                'identifier': 'given_name',
                'format': {
                    'type': 'string',
                    'encoding': 'utf-8'
                },
                'hashing': {
                    'ngram': 2,
                    'weight': 1
                }
            }, {
                'identifier': 'surname',
                'format': {
                    'type': 'string',
                    'encoding': 'utf-8'
                },
                'hashing': {
                    'ngram': 2,
                    'weight': 1
                }
            }, {
                'identifier': 'street_number',
                'format': {
                    'type': 'integer'
                },
                'hashing': {
                    'ngram': 1,
                    'positional': True,
                    'weight': 1
                }
            }, {
                'identifier': 'address_1',
                'format': {
                    'type': 'string',
                    'encoding': 'utf-8'
                },
                'hashing': {
                    'ngram': 2,
                    'weight': 1
                }
            }, {
                'identifier': 'address_2',
                'format': {
                    'type': 'string',
                    'encoding': 'utf-8'
                },
                'hashing': {
                    'ngram': 2,
                    'weight': 1
                }
            }, {
                'identifier': 'suburb',
                'format': {
                    'type': 'string',
                    'encoding': 'utf-8'
                },
                'hashing': {
                    'ngram': 2,
                    'weight': 1
                }
            }, {
                'identifier': 'postcode',
                'format': {
                    'type': 'integer',
                    'minimum': 1000,
                    'maximum': 9999
                },
                'hashing': {
                    'ngram': 1,
                    'positional': True,
                    'weight': 1
                }
            }, {
                'identifier': 'state',
                'format': {
                    'type': 'string',
                    'encoding': 'utf-8',
                    'maxLength': 3
                },
                'hashing': {
                    'ngram': 2,
                    'weight': 1
                }
            }, {
                'identifier': 'day_of_birth',
                'format': {
                    'type': 'integer'
                },
                'hashing': {
                    'ngram': 1,
                    'positional': True,
                    'weight': 1
                }
            }, {
                'identifier': 'soc_sec_id',
                'ignored': True
            }]
        }

        # This used to fail (issue #111). Now it shouldn't.
        schema.from_json_dict(schema_dict)
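
The same dictionary can also be loaded from disk. A minimal sketch using
schema.from_json_file (which takes an open file object); the file name
'schema.json' is an assumption:

import json

from clkhash import schema

# Write the dictionary out, then load it back as a Schema object.
with open('schema.json', 'w') as f:
    json.dump(schema_dict, f)

with open('schema.json') as f:
    s = schema.from_json_file(f)
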
Example 8
class NameList:
    """ Randomly generated PII records.
    """

    randomname_schema_bytes = pkgutil.get_data('clkhash',
                                               'data/randomnames-schema.json')
    if randomname_schema_bytes is None:
        raise Exception(
            "Couldn't locate package data. Please file a bug report.")
    randomname_schema = json.loads(randomname_schema_bytes.decode())
    SCHEMA = schema.from_json_dict(randomname_schema)

    def __init__(self, n):
        # type: (int) -> None
        # Declare the distributions before loading them; they must be
        # populated before generate_random_person() samples from them.
        self.all_male_first_names = None  # type: Optional[Distribution]
        self.all_female_first_names = None  # type: Optional[Distribution]
        self.all_last_names = None  # type: Optional[Distribution]
        self.all_ages = None  # type: Optional[Distribution]

        self.load_data()

        self.year = date.today().year - 1

        self.names = list(self.generate_random_person(n))

    @property
    def schema_types(self):
        # type: () -> Sequence[FieldSpec]
        return self.SCHEMA.fields

    def generate_random_person(self, n):
        # type: (int) -> Iterable[Tuple[str, str, str, str]]
        """
        Generator that yields details on a person with plausible name, sex and age.

        :yields: Generated data for one person
            tuple - (id: str, name: str('First Last'), birthdate: str('DD/MM/YYYY'), sex: str('M' | 'F') )
        """
        assert self.all_male_first_names is not None
        assert self.all_female_first_names is not None
        assert self.all_last_names is not None
        assert self.all_ages is not None  # random_date samples from this below
        for i in range(n):
            sex = 'M' if random.random() > 0.5 else 'F'
            dob = random_date(self.year, self.all_ages).strftime("%Y/%m/%d")
            first_name = (self.all_male_first_names.generate() if sex == 'M'
                          else self.all_female_first_names.generate())
            last_name = self.all_last_names.generate()

            yield (str(i), first_name + ' ' + last_name, dob, sex)

    def load_data(self):
        # type: () -> None
        """ Loads databases from package data

        Uses data files sourced from
        http://www.quietaffiliate.com/free-first-name-and-last-name-databases-csv-and-sql/
        https://www.census.gov/topics/population/genealogy/data/2010_surnames.html
        https://www.abs.gov.au/AUSSTATS/[email protected]/DetailsPage/3101.0Jun%202016
        """

        self.all_male_first_names = Distribution('data/male-first-names.csv')
        self.all_female_first_names = Distribution(
            'data/female-first-names.csv')
        self.all_last_names = Distribution('data/last-names.csv')
        self.all_ages = Distribution('data/ages.csv')

    def generate_subsets(self, sz, overlap=0.8, subsets=2):
        # type: (int, float, int) -> Tuple[List, ...]
        """ Return random subsets with nonempty intersection.

        The random subsets are of specified size. If an element is
        common to two subsets, then it is common to all subsets.
        This overlap is controlled by a parameter.

        :param sz: size of subsets to generate
        :param overlap: size of the intersection, as fraction of the
            subset length
        :param subsets: number of subsets to generate

        :raises ValueError: if there aren't sufficiently many names
            in the list to satisfy the request; more precisely,
            raises if (1 - subsets) * floor(overlap * sz) + subsets * sz > len(self.names).

        :return: tuple of subsets
        """
        overlap_sz = int(math.floor(overlap * sz))
        unique_sz = sz - overlap_sz  # Unique names per subset
        total_unique_sz = unique_sz * subsets  # Uniques in all subsets
        total_sz = overlap_sz + total_unique_sz

        if total_sz > len(self.names):
            msg = 'insufficient names for requested size and overlap'
            raise ValueError(msg)

        sset = random.sample(self.names, total_sz)

        # Overlapping subset, pool of unique names
        sset_overlap, sset_unique = sset[:overlap_sz], sset[overlap_sz:]
        assert len(sset_unique) == subsets * unique_sz

        # Split pool of unique names into `subsets` chunks
        uniques = (sset_unique[p * unique_sz:(p + 1) * unique_sz]
                   for p in range(subsets))

        return tuple(sset_overlap + u for u in uniques)
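
A worked usage sketch of generate_subsets (assuming clkhash's package data is
available): with sz=100 and overlap=0.8, overlap_sz = floor(0.8 * 100) = 80,
so the two subsets share 80 records and each adds 20 unique ones, for a total
draw of 80 + 2 * 20 = 120 names.

names = NameList(200)

a, b = names.generate_subsets(100, overlap=0.8)
assert len(a) == len(b) == 100
# Records are tuples, so set intersection counts the shared portion directly.
assert len(set(a) & set(b)) == 80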