def _test_stats(pii, schema, keys):
    """Hash *pii* with *schema*/*keys* and summarise the encodings' popcounts.

    Returns a ``(mean, std)`` tuple of the bit-counts of the generated CLKs.
    """
    popcounts = [
        deserialize_bitarray(encoding).count()
        for encoding in clk.generate_clks(pii, schema, keys)
    ]
    print('_test_stats: counts = ', popcounts)
    stats = OnlineMeanVariance()
    stats.update(popcounts)
    return stats.mean(), stats.std()
def test_ser_deser_inverse(self):
    """Deserialising a serialised bitarray must reproduce the original."""
    num_bytes = 128
    original = randomBitarray(num_bytes)
    encoded = serialize_bitarray(original)
    # Base64 emits 4 output characters per 3 input bytes (with padding):
    # https://stackoverflow.com/questions/4715415/base64-what-is-the-worst-possible-increase-in-space-usage
    self.assertEqual(len(encoded), ceil(num_bytes / 3.0) * 4)
    decoded = deserialize_bitarray(encoded)
    self.assertEqual(original, decoded)
def plot(clk_json):
    """Plot an ascii histogram of the popcounts of the CLKs in *clk_json*.

    :param clk_json: an open file-like object containing JSON of the form
        ``{"clks": [...]}`` (as written by ``json.dump({'clks': clk_data}, output)``).
    :raises DescribeError: if the input is not valid JSON or contains no clks.
    """
    try:
        # data was written with: json.dump({'clks': clk_data}, output); so ...
        clks = json.load(clk_json)['clks']
    except ValueError as e:
        # json.JSONDecodeError subclasses ValueError, so this catches it too.
        # Native `raise ... from` replaces the old Python-2 raise_from shim;
        # this file already requires Python 3 (it uses f-strings elsewhere).
        msg = 'The input is not a valid JSON file.'
        raise DescribeError(msg) from e

    if len(clks) == 0:
        msg = 'No clks found'
        raise DescribeError(msg)

    popcounts = [deserialize_bitarray(clk).count() for clk in clks]
    plot_hist(popcounts, bincount=60, title='popcounts', xlab=True, showSummary=True)
def describe(clk_json):
    """Show the distribution of clk popcounts using an ascii plot.

    :param clk_json: an open file-like object containing JSON of the form
        ``{"clks": [...]}``.
    :raises DescribeError: if the input is not valid JSON or contains no clks.
    """
    # Guard invalid/empty input the same way `plot` does, instead of letting a
    # raw ValueError (or an empty histogram) escape to the caller.
    try:
        clks = json.load(clk_json)['clks']
    except ValueError as e:
        raise DescribeError('The input is not a valid JSON file.') from e

    if len(clks) == 0:
        raise DescribeError('No clks found')

    counts = get_encoding_popcounts([deserialize_bitarray(clk) for clk in clks])
    plot_hist(counts, bincount=60, title='popcounts', xlab=True, showSummary=True)
def upload(clk_json, project, apikey, output, blocks, server, retry_multiplier, retry_max_exp, retry_stop, profile, to_entityservice, verbose):
    """Upload CLK data to the Anonlink Entity server.

    Given a json file containing hashed clk data as CLK_JSON, upload to
    the entity resolution service.

    The following environment variables can be used to override default behaviour:

    * UPLOAD_OBJECT_STORE_SERVER

    :param clk_json: path to the JSON file of encodings ({"clks": [...]}).
    :param project: project identifier on the entity service.
    :param apikey: upload authorization token for the project.
    :param output: open file-like object the service response is dumped to.
    :param blocks: optional path to an encoding-to-blocks JSON file.
    :param server: entity service base URL.
    :param profile: optional AWS config profile for object-store credentials.
    :param to_entityservice: force upload via the entity service even when
        temporary object-store credentials are available.
    :param verbose: emit progress logging.
    :raises ValueError: if the block file's clk count disagrees with the
        number of encodings.
    """
    msg = 'CLK and Blocks' if blocks else 'CLK'
    if verbose:
        log("Uploading CLK data from {}".format(clk_json))
        log("Project ID: {}".format(project))
        log("Uploading {} data to the server".format(msg))

    rest_client = create_rest_client(server, retry_multiplier, retry_max_exp, retry_stop, verbose)

    if verbose:
        log("Fetching temporary credentials")
    try:
        res = rest_client.get_temporary_objectstore_credentials(project, apikey)
        credentials = res['credentials']
        upload_info = res['upload']
        upload_to_object_store = True
    except ServiceError:
        # The service may not support direct object-store upload; fall back
        # to streaming the data through the entity service itself.
        log("Failed to retrieve temporary credentials")
        upload_to_object_store = False

    # metadata for clks: how many encodings, and their size in whole bytes
    with open(clk_json, 'rb') as f:
        clks = json.load(f)['clks']
        hash_count = len(clks)
        # deserialized length is in bits; round up to bytes
        hash_size = (len(deserialize_bitarray(clks[0])) + 7) // 8
        encoding_metadata = {
            'hash-count': hash_count,
            'hash-size': hash_size
        }

    if upload_to_object_store and not to_entityservice:
        object_store_credential_providers = []
        if profile is not None:
            object_store_credential_providers.append(AWSConfigProvider(profile=profile))
        endpoint = os.getenv('UPLOAD_OBJECT_STORE_SERVER', upload_info['endpoint'])
        object_store_credential_providers.append(
            StaticProvider(access_key=credentials['AccessKeyId'],
                           secret_key=credentials['SecretAccessKey'],
                           session_token=credentials['SessionToken']))
        mc = Minio(
            endpoint,
            credentials=ChainedProvider(object_store_credential_providers),
            region='us-east-1',
            secure=upload_info['secure']
        )
        if verbose:
            log('Checking we have permission to upload')
        # Probe write access with a tiny object before the real (large) upload.
        mc.put_object(upload_info['bucket'], upload_info['path'] + "/upload-test",
                      io.BytesIO(b"something"), length=9)

    if blocks:
        # check size of blocks and clks consistent
        with open(blocks, 'rb') as f:
            block_counts = next(ijson.items(f, 'meta.source.clk_count.item'))
        if block_counts != hash_count:
            # raise instead of assert: asserts are stripped under `python -O`
            raise ValueError(
                'Size inconsistency: there are {} CLKs but {} encoding-to-blocks maps'.format(
                    hash_count, block_counts))

        if upload_to_object_store and not to_entityservice:
            print('Anonlink client: Uploading to the external object store - MINIO')
            # upload to Minio
            progress1 = Progress()
            progress1.display_name = f'Upload {clk_json.split("/")[-1]}'
            mc.fput_object(upload_info['bucket'],
                           upload_info['path'] + "/encodings.json",
                           clk_json,
                           progress=progress1,
                           metadata=encoding_metadata)
            progress2 = Progress()
            progress2.display_name = f'Upload {blocks.split("/")[-1]}'
            mc.fput_object(upload_info['bucket'],
                           upload_info['path'] + "/blocks.json",
                           blocks,
                           progress=progress2)

            clk_file = upload_info['path'] + '/encodings.json'
            block_file = upload_info['path'] + '/blocks.json'
            # upload metadata to entity service: object paths plus the
            # temporary credentials so the service can fetch the data itself
            to_entity_service = {
                'encodings': {'file': {'path': clk_file, 'bucket': upload_info['bucket']},
                              'credentials': credentials},
                'blocks': {'file': {'path': block_file, 'bucket': upload_info['bucket']},
                           'credentials': credentials}
            }
            to_entity_service_stream = io.StringIO()
            json.dump(to_entity_service, to_entity_service_stream)
            to_entity_service_stream.seek(0)
            response = rest_client.project_upload_clks(project, apikey, to_entity_service_stream)
        else:
            print('Anonlink client: Uploading to entity service')
            with open(clk_json, 'rb') as encodings:
                with open(blocks, 'rb') as blockings:
                    out = combine_clks_blocks(encodings, blockings)
                    response = rest_client.project_upload_clks(project, apikey, out)
    else:
        if upload_to_object_store and not to_entityservice:
            print('Anonlink client: Uploading to the external object store - MINIO')
            progress = Progress()
            progress.display_name = f'Upload {clk_json.split("/")[-1]}'
            mc.fput_object(upload_info['bucket'],
                           upload_info['path'] + "/encodings.json",
                           clk_json,
                           progress=progress,
                           metadata=encoding_metadata)

            # upload metadata to entity service
            clk_file = upload_info['path'] + '/encodings.json'
            to_entity_service = {
                'encodings': {'file': {'path': clk_file, 'bucket': upload_info['bucket']},
                              'credentials': credentials}
            }
            to_entity_service_stream = io.StringIO()
            json.dump(to_entity_service, to_entity_service_stream)
            to_entity_service_stream.seek(0)
            response = rest_client.project_upload_clks(project, apikey, to_entity_service_stream)
        else:
            print('Anonlink client: Uploading to entity service')
            with open(clk_json, 'rb') as encodings:
                response = rest_client.project_upload_clks(project, apikey, encodings)

    if verbose:
        msg = '\n'.join(['{}: {}'.format(key, value) for key, value in response.items()])
        log(msg)

    json.dump(response, output)