Example #1
0
 def test_different_weights(self):
     """Build a version-1 schema with a single weighted string field.

     Uses the legacy ``Schema(version=..., hashing_globals=..., fields=...)``
     constructor with ``GlobalHashingProperties``. This snippet only
     constructs the schema; the assertions presumably follow elsewhere —
     TODO confirm against the full test.
     """
     schema = Schema(
         version=1,
         hashing_globals=GlobalHashingProperties(
             k=30,
             kdf_hash='SHA256',
             # base64 of 'schema_example' — HKDF info/context string
             kdf_info=base64.b64decode('c2NoZW1hX2V4YW1wbGU='),
             kdf_key_size=64,
             kdf_salt=base64.b64decode(
                 'SCbL2zHNnmsckfzchsNkZY9XoHk96P/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA=='
             ),
             kdf_type='HKDF',
             l=1024,
             hash_type='blakeHash',
             xor_folds=0,
         ),
         fields=[
             StringSpec(
                 identifier='some info',
                 hashing_properties=FieldHashingProperties(
                     encoding=FieldHashingProperties._DEFAULT_ENCODING,
                     ngram=2,  # bigram tokenization
                     positional=False,
                     weight=1),
                 description=None,
                 case=StringSpec._DEFAULT_CASE,
                 min_length=0,
                 max_length=None)
         ])
Example #2
0
    def test_compare_to_legacy(self):
        """Compare bloom filters built with HKDF vs. the legacy KDF.

        Hashes four identical field values ('Bobby'). The legacy KDF gives
        every field the same keys, so all four map onto the same bit
        positions; HKDF derives distinct keys per field, spreading the
        bits out. The assertions pin those relative bit counts.
        """
        # Identifier: 'ANY freetext'

        fhp = FieldHashingProperties(ngram=2, hash_type='doubleHash', k=10)

        schema = Schema(
            l=1024,
            # base64 of 'schema_example' — HKDF info/context string
            kdf_info=base64.b64decode('c2NoZW1hX2V4YW1wbGU='),
            kdf_key_size=64,
            kdf_salt=base64.b64decode(
                'SCbL2zHNnmsckfzchsNkZY9XoHk96P'
                '/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA=='),
            fields=[
                StringSpec(identifier='ANY text {}'.format(i + 1),
                           hashing_properties=fhp) for i in range(4)
            ])

        row = ['Bobby', 'Bobby', 'Bobby', 'Bobby']
        master_secrets = [
            'No, I am your father'.encode(),
            "No... that's not true! That's impossible!".encode()
        ]
        keys_hkdf = generate_key_lists(master_secrets, len(row), kdf='HKDF')
        keys_legacy = generate_key_lists(master_secrets,
                                         len(row),
                                         kdf='legacy')
        bloom_hkdf = next(stream_bloom_filters([row], keys_hkdf, schema))
        bloom_legacy = next(stream_bloom_filters([row], keys_legacy, schema))
        hkdf_count = bloom_hkdf[0].count()
        legacy_count = bloom_legacy[0].count()
        # legacy will map the 4 Bobbys to the same bits, whereas hkdf will
        # map each Bobby to different bits.
        self.assertLessEqual(legacy_count, fhp.k * 6)  # 6 bi-grams
        self.assertLess(legacy_count, hkdf_count)
        self.assertLessEqual(hkdf_count, len(row) * legacy_count)
Example #3
0
 def test_different_weights(self):
     """Build a schema whose single field uses BitsPerTokenStrategy.

     Uses the newer flat ``Schema(...)`` constructor (KDF parameters
     passed directly) together with the comparator/strategy style of
     ``FieldHashingProperties``. This snippet only constructs the schema;
     the assertions presumably follow elsewhere — TODO confirm.
     """
     schema = Schema(
         l=1024,
         xor_folds=0,
         kdf_hash='SHA256',
         # base64 of 'schema_example' — HKDF info/context string
         kdf_info=base64.b64decode('c2NoZW1hX2V4YW1wbGU='),
         kdf_key_size=64,
         kdf_salt=base64.b64decode(
             'SCbL2zHNnmsckfzchsNkZY9XoHk96P'
             '/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA=='),
         kdf_type='HKDF',
         fields=[
             StringSpec(
                 identifier='some info',
                 hashing_properties=FieldHashingProperties(
                     encoding=FieldHashingProperties._DEFAULT_ENCODING,
                     comparator=bigram_tokenizer,
                     strategy=BitsPerTokenStrategy(20)
                 ),
                 description=None,
                 case=StringSpec._DEFAULT_CASE,
                 min_length=0,
                 max_length=None
             )
         ]
     )
def test_missing_value_integration():
    """End-to-end check of missingValue handling.

    Encodes two records: one holding the 'replaceWith' values directly,
    the other holding the sentinel values. If sentinel substitution works,
    both records must produce identical CLKs.
    """
    schema_dict = {
        'version': 1,
        'clkConfig': {
            'l': 1024,
            'k': 20,
            'hash': {'type': 'doubleHash'},
            'kdf': {'type': 'HKDF'},
        },
        'features': [
            {
                'identifier': 'name',
                'format': {'type': 'string', 'encoding': 'utf-8'},
                'hashing': {
                    'ngram': 2,
                    'missingValue': {'sentinel': 'null',
                                     'replaceWith': 'Bob'},
                },
            },
            {
                'identifier': 'age',
                'format': {'type': 'integer'},
                'hashing': {
                    'ngram': 1,
                    'missingValue': {'sentinel': 'NA',
                                     'replaceWith': '42'},
                },
            },
        ],
    }
    schema = Schema.from_json_dict(schema_dict)

    # First row: replacement values. Second row: sentinels only.
    pii = [['Bob', '42'], ['null', 'NA']]

    clks = generate_clks(pii,
                         schema=schema,
                         keys=('sec1', 'sec2'),
                         validate=True,
                         callback=None)
    assert len(clks) == 2
    assert clks[0] == clks[1]
Example #5
0
 def mkSchema(hashing_properties):
     """Build a minimal single-field Schema around *hashing_properties*.

     :param hashing_properties: FieldHashingProperties applied to the
         one 'name' string field.
     :return: a 1024-bit Schema with one XOR fold and HKDF key derivation.
     """
     # base64 of 'schema_example' — HKDF info/context string.
     kdf_info = base64.b64decode('c2NoZW1hX2V4YW1wbGU=')
     kdf_salt = base64.b64decode(
         'SCbL2zHNnmsckfzchsNkZY9XoHk96P'
         '/G5nUBrM7ybymlEFsMV6PAeDZCNp3r'
         'fNUPCtLDMOGQHG4pCQpfhiHCyA==')
     name_field = StringSpec(identifier='name',
                             hashing_properties=hashing_properties,
                             description=None,
                             case=StringSpec._DEFAULT_CASE,
                             min_length=1,
                             max_length=50)
     return Schema(l=1024,
                   xor_folds=1,
                   kdf_type='HKDF',
                   kdf_hash='SHA256',
                   kdf_salt=kdf_salt,
                   kdf_info=kdf_info,
                   kdf_key_size=64,
                   fields=[name_field])
Example #6
0
    def test_xor_folding_integration(self):
        """End-to-end check of XOR folding in stream_bloom_filters.

        Clones the random-names schema with one XOR fold and half the
        filter length, then asserts the folded filter equals the XOR of
        the two halves of the unfolded filter.
        """
        namelist = randomnames.NameList(1)
        schema_0 = namelist.SCHEMA
        assert schema_0.hashing_globals.xor_folds == 0

        # Copy the globals so mutating xor_folds/l below does not alter
        # the shared NameList.SCHEMA used by schema_0.
        schema_1 = Schema(version=schema_0.version,
                          hashing_globals=copy(schema_0.hashing_globals),
                          fields=schema_0.fields)
        schema_1.hashing_globals.xor_folds = 1
        schema_1.hashing_globals.l //= 2

        key_lists = generate_key_lists(('secret', 'sshh'),
                                       len(namelist.schema_types))
        bf_original, _, _ = next(
            bloomfilter.stream_bloom_filters(namelist.names, key_lists,
                                             schema_0))
        bf_folded, _, _ = next(
            bloomfilter.stream_bloom_filters(namelist.names, key_lists,
                                             schema_1))

        # One fold: the filter is split in half and the halves are XORed.
        self.assertEqual(
            bf_folded, bf_original[:len(bf_original) // 2]
            ^ bf_original[len(bf_original) // 2:],
            'Folded filter is not an XOR of the two halves of the original.')
Example #7
0
# Linkage schema for the PII columns: id columns are ignored; name and
# date-of-birth fields get 200 bits each, the remaining fields 100 bits.
fields = [
    Ignore('rec_id'),
    StringSpec('given_name', FieldHashingProperties(comparator=NgramComparison(2), strategy=BitsPerFeatureStrategy(200))),
    StringSpec('surname', FieldHashingProperties(comparator=NgramComparison(2), strategy=BitsPerFeatureStrategy(200))),
    IntegerSpec('street_number', FieldHashingProperties(comparator=NgramComparison(1, True), strategy=BitsPerFeatureStrategy(100), missing_value=MissingValueSpec(sentinel=''))),
    StringSpec('address_1', FieldHashingProperties(comparator=NgramComparison(2), strategy=BitsPerFeatureStrategy(100))),
    StringSpec('address_2', FieldHashingProperties(comparator=NgramComparison(2), strategy=BitsPerFeatureStrategy(100))),
    StringSpec('suburb', FieldHashingProperties(comparator=NgramComparison(2), strategy=BitsPerFeatureStrategy(100))),
    IntegerSpec('postcode', FieldHashingProperties(comparator=NgramComparison(1, True), strategy=BitsPerFeatureStrategy(100))),
    StringSpec('state', FieldHashingProperties(comparator=NgramComparison(2), strategy=BitsPerFeatureStrategy(100))),
    IntegerSpec('date_of_birth', FieldHashingProperties(comparator=NgramComparison(1, True), strategy=BitsPerFeatureStrategy(200), missing_value=MissingValueSpec(sentinel=''))),
    Ignore('soc_sec_id')
]

secret = 'secret'
schema = Schema(fields, 1024)  # 1024-bit CLKs

# Encode dataset A. NOTE(review): a_csv presumably holds dataset A as
# CSV, written further up — not visible in this snippet.
a_csv.seek(0)
clks_a = generate_clk_from_csv(a_csv, secret, schema)

print(len(clks_a))

# Encode dataset B from the dfB dataframe.
b_csv = io.StringIO()
dfB.to_csv(b_csv)
b_csv.seek(0)
clks_b = generate_clk_from_csv(b_csv, secret, schema)

print(len(clks_b))

# find matches
Example #8
0
 def test_compare_to_legacy(self):
     """Build the v1 four-field schema used by the legacy comparison test.

     Four identical string fields 'ANY text 1' … 'ANY text 4', each with
     its own bigram FieldHashingProperties, under a version-1
     GlobalHashingProperties configuration.
     """
     # Identifier: 'ANY freetext'
     schema = Schema(
         version=1,
         hashing_globals=GlobalHashingProperties(
             k=10,
             kdf_hash='SHA256',
             kdf_info=base64.b64decode('c2NoZW1hX2V4YW1wbGU='),
             kdf_key_size=64,
             kdf_salt=base64.b64decode(
                 'SCbL2zHNnmsckfzchsNkZY9XoHk96P/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA=='
             ),
             kdf_type='HKDF',
             l=1024,
             hash_type='doubleHash',
             hash_prevent_singularity=False,
             xor_folds=0),
         fields=[
             # One spec per field; the comprehension creates a fresh
             # FieldHashingProperties instance for each, exactly as the
             # hand-unrolled original did.
             StringSpec(
                 identifier='ANY text {}'.format(field_num),
                 hashing_properties=FieldHashingProperties(
                     encoding=FieldHashingProperties._DEFAULT_ENCODING,
                     ngram=2,
                     positional=False,
                     weight=1),
                 description=None,
                 case=StringSpec._DEFAULT_CASE,
                 min_length=0,
                 max_length=None)
             for field_num in range(1, 5)
         ])
Example #9
0
class NameList:
    """ Randomly generated PII records.
    """

    # Linkage schema shared by all instances, loaded once at class
    # creation from the packaged JSON file.
    with open(
            os.path.join(os.path.dirname(__file__), 'data',
                         'randomnames-schema.json')) as f:
        SCHEMA = Schema.from_json_file(f)
    del f

    def __init__(self, n):
        # type: (int) -> None
        """Load the name databases and generate *n* random records.

        :param n: number of person records to generate
        """
        # Declare the name pools first, then populate them. The original
        # code assigned None to these attributes AFTER load_names() and
        # generate_random_person() had run, which clobbered the loaded
        # pools and made any later generate_random_person() call fail its
        # not-None assertions.
        self.all_male_first_names = None  # type: Optional[Sequence[str]]
        self.all_female_first_names = None  # type: Optional[Sequence[str]]
        self.all_last_names = None  # type: Optional[Sequence[str]]

        self.load_names()

        self.earliest_birthday = datetime(year=1916, month=1, day=1)
        self.latest_birthday = datetime(year=2016, month=1, day=1)

        self.names = list(self.generate_random_person(n))

    @property
    def schema_types(self):
        # type: () -> Sequence[FieldSpec]
        """Field specifications of the shared linkage schema."""
        return self.SCHEMA.fields

    def generate_random_person(self, n):
        # type: (int) -> Iterable[Tuple[str, str, str, str]]
        """
        Generator that yields details on a person with plausible name, sex and age.

        :yields: Generated data for one person
            tuple - (id: int, name: str('First Last'), birthdate: str('DD/MM/YYYY'), sex: str('M' | 'F') )
        """
        # Requires load_names() to have populated the name pools.
        assert self.all_male_first_names is not None
        assert self.all_female_first_names is not None
        assert self.all_last_names is not None
        for i in range(n):
            sex = 'M' if random.random() > 0.5 else 'F'
            dob = random_date(self.earliest_birthday,
                              self.latest_birthday).strftime("%Y/%m/%d")
            first_name = random.choice(
                self.all_male_first_names) if sex == 'M' else random.choice(
                    self.all_female_first_names)
            last_name = random.choice(self.all_last_names)

            yield (str(i), first_name + ' ' + last_name, dob, sex)

    def load_names(self):
        # type: () -> None
        """ Loads a name database from package data

        Uses data files sourced from
        http://www.quietaffiliate.com/free-first-name-and-last-name-databases-csv-and-sql/
        """

        self.all_male_first_names = load_csv_data('male-first-names.csv')
        self.all_female_first_names = load_csv_data('female-first-names.csv')
        self.all_last_names = load_csv_data('CSV_Database_of_Last_Names.csv')

    def generate_subsets(self, sz, overlap=0.8, subsets=2):
        # type: (int, float, int) -> Tuple[List, ...]
        """ Return random subsets with nonempty intersection.

            The random subsets are of specified size. If an element is
            common to two subsets, then it is common to all subsets.
            This overlap is controlled by a parameter.

            :param sz: size of subsets to generate
            :param overlap: size of the intersection, as fraction of the
                subset length
            :param subsets: number of subsets to generate

            :raises ValueError: if there aren't sufficiently many names
                in the list to satisfy the request; more precisely,
                raises if (1 - subsets) * floor(overlap * sz)
                              + subsets * sz > len(self.names).

            :return: tuple of subsets
        """
        overlap_sz = int(math.floor(overlap * sz))
        unique_sz = sz - overlap_sz  # Unique names per subset
        total_unique_sz = unique_sz * subsets  # Uniques in all subsets
        total_sz = overlap_sz + total_unique_sz

        if total_sz > len(self.names):
            msg = 'insufficient names for requested size and overlap'
            raise ValueError(msg)

        sset = random.sample(self.names, total_sz)

        # Overlapping subset, pool of unique names
        sset_overlap, sset_unique = sset[:overlap_sz], sset[overlap_sz:]
        assert len(sset_unique) == subsets * unique_sz

        # Split pool of unique names into `subsets` chunks
        uniques = (sset_unique[p * unique_sz:(p + 1) * unique_sz]
                   for p in range(subsets))

        return tuple(sset_overlap + u for u in uniques)