Ejemplo n.º 1
0
 def test_compare_v1_and_v2(self):
     """Check that the v1 schema and its v2 counterpart yield identical CLKs.

     The same records and keys are encoded once per schema and the results
     are compared pairwise.
     """
     pii = randomnames.NameList(100).names
     schema_v1 = randomnames.NameList.SCHEMA
     # this v2 schema should be equivalent to the above v1 schema
     schema_v2 = _test_schema('randomnames-schema-v2.json')
     keys = ('secret', 'sshh')
     clks_v1 = list(clk.generate_clks(pii, schema_v1, keys))
     clks_v2 = list(clk.generate_clks(pii, schema_v2, keys))
     # zip() stops at the shorter input, so without these checks an empty or
     # truncated result would let the pairwise comparison pass vacuously.
     self.assertGreater(len(clks_v1), 0)
     self.assertEqual(len(clks_v1), len(clks_v2))
     for clkv1, clkv2 in zip(clks_v1, clks_v2):
         self.assertEqual(clkv1, clkv2)
Ejemplo n.º 2
0
 def test_compare_v1_v2_and_v3(self):
     """Check that the v1, v2 and v3 schemas all yield identical CLKs.

     The same records and secret are encoded once per schema; the v2 and v3
     results are each compared pairwise against the v1 result.
     """
     pii = randomnames.NameList(100).names
     schema_v3 = randomnames.NameList.SCHEMA
     # these v2 and v1 schemas should be equivalent to the above v3 schema
     schema_v2 = _test_schema('randomnames-schema-v2.json')
     schema_v1 = _test_schema('randomnames-schema-v1.json')
     secret = 'secret'
     clks_v1 = list(clk.generate_clks(pii, schema_v1, secret))
     clks_v2 = list(clk.generate_clks(pii, schema_v2, secret))
     clks_v3 = list(clk.generate_clks(pii, schema_v3, secret))
     # zip() stops at the shortest input, so without these checks an empty or
     # truncated result would let the pairwise comparison pass vacuously.
     self.assertGreater(len(clks_v1), 0)
     self.assertEqual(len(clks_v1), len(clks_v2))
     self.assertEqual(len(clks_v1), len(clks_v3))
     for clkv1, clkv2, clkv3 in zip(clks_v1, clks_v2, clks_v3):
         self.assertEqual(clkv1, clkv2)
         self.assertEqual(clkv1, clkv3)
Ejemplo n.º 3
0
def _test_stats(pii, schema, keys):
    """Return the mean and standard deviation of the per-CLK counts.

    Each CLK produced by ``clk.generate_clks`` is passed through
    ``deserialize_bitarray`` and its ``count()`` is recorded (presumably the
    number of set bits -- confirm against the bitarray type in use). The
    counts are printed for debugging and then summarised with
    ``OnlineMeanVariance``.

    :param pii: records to encode, forwarded to ``clk.generate_clks``.
    :param schema: schema forwarded to ``clk.generate_clks``.
    :param keys: keys forwarded to ``clk.generate_clks``.
    :return: tuple ``(mean, std)`` as reported by ``OnlineMeanVariance``.
    """
    counts = []
    for serialized in clk.generate_clks(pii, schema, keys):
        counts.append(deserialize_bitarray(serialized).count())
    print('_test_stats: counts = ', counts)

    accumulator = OnlineMeanVariance()
    accumulator.update(counts)
    mean = accumulator.mean()
    std = accumulator.std()
    return mean, std
Ejemplo n.º 4
0
def test_missing_value_integration():
    """Sentinel values must encode exactly like their 'replaceWith' values.

    Two records are encoded with a version 1 schema: one already holds the
    'replaceWith' values, the other holds the sentinels. If missing-value
    handling works, the two resulting CLKs are identical.
    """
    name_feature = {
        'identifier': 'name',
        'format': {'type': 'string', 'encoding': 'utf-8'},
        'hashing': {
            'ngram': 2,
            'missingValue': {'sentinel': 'null', 'replaceWith': 'Bob'},
        },
    }
    age_feature = {
        'identifier': 'age',
        'format': {'type': 'integer'},
        'hashing': {
            'ngram': 1,
            'missingValue': {'sentinel': 'NA', 'replaceWith': '42'},
        },
    }
    schema_dict = {
        'version': 1,
        'clkConfig': {
            'l': 1024,
            'k': 20,
            'hash': {'type': 'doubleHash'},
            'kdf': {'type': 'HKDF'},
        },
        'features': [name_feature, age_feature],
    }
    parsed_schema = schema.from_json_dict(schema_dict)

    # First record: replacement values; second record: the sentinels.
    records = [['Bob', '42'], ['null', 'NA']]

    clks = generate_clks(records, schema=parsed_schema, keys=('sec1', 'sec2'))
    assert len(clks) == 2
    from_replacements, from_sentinels = clks[0], clks[1]
    assert from_replacements == from_sentinels
def test_missing_value_integration():
    """Sentinel values must encode exactly like their 'replaceWith' values.

    Two records are encoded with a version 2 schema given as JSON text: one
    already holds the 'replaceWith' values, the other holds the sentinels.
    If missing-value handling works, the two resulting CLKs are identical.
    """
    raw_schema = """
    {
      "version": 2,
      "clkConfig": {
        "l": 1024,
        "kdf": {
          "type": "HKDF"
        }
      },
      "features": [
        {
          "identifier": "name",
          "format": {
            "type": "string",
            "encoding": "utf-8"
          },
          "hashing": {
            "ngram": 2,
            "strategy": {
              "k": 20
            },
            "missingValue": {
              "sentinel": "null",
              "replaceWith": "Bob"
            }
          }
        },
        {
          "identifier": "age",
          "format": {
            "type": "integer"
          },
          "hashing": {
            "ngram": 1,
            "strategy": {
              "k": 20
            },
            "missingValue": {
              "sentinel": "NA",
              "replaceWith": "42"
            }
          }
        }
      ]
    }
    """
    parsed_schema = schema.from_json_dict(json.loads(raw_schema))

    # First record: replacement values; second record: the sentinels.
    records = [['Bob', '42'], ['null', 'NA']]

    clks = generate_clks(records, schema=parsed_schema, keys=('sec1', 'sec2'))
    assert len(clks) == 2
    from_replacements, from_sentinels = clks[0], clks[1]
    assert from_replacements == from_sentinels
Ejemplo n.º 6
0
    def test_describe(self):
        """The describe plot must report the number of CLKs it was given.

        CLKs for ``size`` random records are serialised as JSON, passed to
        ``plot`` through an in-memory stream, and the text captured in
        ``self.temp_std_out`` is checked for the observation count.
        """
        size = 1000
        expected_fragment = ' observations: {} '.format(size)

        name_list = randomnames.NameList(size)
        clks = generate_clks(
            name_list.names, name_list.SCHEMA, 'secret', validate=True)
        stream = StringIO(json.dumps({'clks': clks}))

        # Equivalent of running `clkutil describe` on the serialised CLKs.
        plot(stream)

        captured = self.temp_std_out.getvalue()
        assert expected_fragment in captured
Ejemplo n.º 7
0
    def test_namelist_hashable(self):
        """CLKs from two generated subsets can be put in sets and overlap.

        Two subsets of 100 records are drawn with ``generate_subsets(100,
        0.8)`` and encoded with the same schema and keys. At least 80 of the
        resulting CLKs must be exactly equal across the two subsets.
        """
        names = randomnames.NameList(1000)
        subset_a, subset_b = names.generate_subsets(100, 0.8)

        for subset in (subset_a, subset_b):
            self.assertEqual(len(subset), 100)

        name_schema = randomnames.NameList.SCHEMA
        secret_keys = ('secret', 'sshh')

        clks_a = clk.generate_clks(subset_a, name_schema, secret_keys)
        clks_b = clk.generate_clks(subset_b, name_schema, secret_keys)

        for encoded in (clks_a, clks_b):
            self.assertEqual(len(encoded), 100)

        # An "exact match" bloomfilter comparison: building the sets also
        # requires every CLK to be hashable.
        unique_a = set(clks_a)
        unique_b = set(clks_b)
        shared = unique_a.intersection(unique_b)

        self.assertGreaterEqual(
            len(shared), 80,
            "Expected at least 80 hashes to be exactly the same")