Example #1
def _test_stats(pii, schema, keys):
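    # Popcount (number of set bits) of each generated CLK.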
    counts = [deserialize_bitarray(c).count() for c in
              clk.generate_clks(pii, schema, keys)]
    print('_test_stats: counts = ', counts)
    ov = OnlineMeanVariance()
    ov.update(counts)
    return ov.mean(), ov.std()
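
For context, serialize_bitarray and deserialize_bitarray used throughout these examples are, in essence, a base64 round-trip over the raw bytes of a bitarray. A minimal sketch under that assumption (not necessarily the library's exact implementation):

import base64
from bitarray import bitarray

def serialize_bitarray(ba: bitarray) -> str:
    # Encode the bitarray's underlying bytes as a base64 string.
    return base64.b64encode(ba.tobytes()).decode('utf-8')

def deserialize_bitarray(ser: str) -> bitarray:
    # Decode the base64 string back into a bitarray.
    ba = bitarray()
    ba.frombytes(base64.b64decode(ser))
    return ba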
Example #2
    def test_ser_deser_inverse(self):
        numBytes = 128
        ba = randomBitarray(numBytes)

        ser = serialize_bitarray(ba)
        # https://stackoverflow.com/questions/4715415/base64-what-is-the-worst-possible-increase-in-space-usage
        self.assertEqual(len(ser), ceil(numBytes / 3.0) * 4)

        des = deserialize_bitarray(ser)
        self.assertEqual(ba, des)
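
The expected length in the assertion follows from base64 expanding every 3 input bytes into 4 output characters (with padding). A quick self-contained check, assuming the serialization is plain base64:

import base64
from math import ceil

numBytes = 128
encoded = base64.b64encode(bytes(numBytes))
assert len(encoded) == ceil(numBytes / 3.0) * 4  # 172 characters for 128 bytes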
Example #3
def plot(clk_json):
    try:
        # data was written with: json.dump({'clks': clk_data}, output); so ...
        clks = json.load(clk_json)['clks']
    except ValueError as e:  # In Python 3 we can be more specific
        # with json.decoder.JSONDecodeError,
        # but that doesn't exist in Python 2.
        msg = 'The input is not a valid JSON file.'
        raise_from(DescribeError(msg), e)
        
    if len(clks) == 0:
        msg = 'No clks found'
        raise DescribeError(msg)

    popcounts = [deserialize_bitarray(clk).count() for clk in clks]
    plot_hist(popcounts, bincount=60, title='popcounts', xlab=True, showSummary=True)
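
plot expects an open file-like object containing the {'clks': [...]} document written by the hashing step; a hypothetical invocation (the filename is illustrative):

with open('clks.json') as clk_json:
    plot(clk_json)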
Example #4
def describe(clk_json):
    """show distribution of clk's popcounts using a ascii plot.
    """
    clks = json.load(clk_json)['clks']
    counts = get_encoding_popcounts([deserialize_bitarray(clk) for clk in clks])
    plot_hist(counts, bincount=60, title='popcounts', xlab=True, showSummary=True)
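
get_encoding_popcounts is assumed here to return the popcount of each encoding; a minimal single-process stand-in could look like this:

from typing import List
from bitarray import bitarray

def get_encoding_popcounts(encodings: List[bitarray]) -> List[int]:
    # Count the set bits (popcount) of each encoding.
    return [encoding.count() for encoding in encodings]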
Example #5
def upload(clk_json, project, apikey, output, blocks, server, retry_multiplier, retry_max_exp, retry_stop, profile,
           to_entityservice, verbose):
    """Upload CLK data to the Anonlink Entity server.

    Given a JSON file containing hashed CLK data as CLK_JSON, upload it to
    the entity resolution service.

    The following environment variables can be used to override default behaviour:

    * UPLOAD_OBJECT_STORE_SERVER

    """
    msg = 'CLK and Blocks' if blocks else 'CLK'

    if verbose:
        log("Uploading CLK data from {}".format(clk_json))
        log("Project ID: {}".format(project))
        log("Uploading {} data to the server".format(msg))

    rest_client = create_rest_client(server, retry_multiplier, retry_max_exp, retry_stop, verbose)

    if verbose:
        log("Fetching temporary credentials")
    try:
        res = rest_client.get_temporary_objectstore_credentials(project, apikey)
        credentials = res['credentials']
        upload_info = res['upload']
        upload_to_object_store = True
    except ServiceError as e:
        log("Failed to retrieve temporary credentials")
        upload_to_object_store = False

    # metadata for clks
    with open(clk_json, 'rb') as f:
        clks = json.load(f)['clks']

    hash_count = len(clks)
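    # Size of one encoding in bytes: bit length rounded up to whole bytes.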
    hash_size = (len(deserialize_bitarray(clks[0])) + 7) // 8
    encoding_metadata = {
        'hash-count': hash_count,
        'hash-size': hash_size
    }

    if upload_to_object_store and not to_entityservice:
        object_store_credential_providers = []
        if profile is not None:
            object_store_credential_providers.append(AWSConfigProvider(profile=profile))

        endpoint = os.getenv('UPLOAD_OBJECT_STORE_SERVER', upload_info['endpoint'])

        object_store_credential_providers.append(
            StaticProvider(access_key=credentials['AccessKeyId'],
                           secret_key=credentials['SecretAccessKey'],
                           session_token=credentials['SessionToken']))

        mc = Minio(
            endpoint,
            credentials=ChainedProvider(object_store_credential_providers),
            region='us-east-1',
            secure=upload_info['secure']
        )

        if verbose:
            log('Checking we have permission to upload')

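        # Write a small test object to confirm write access to the upload location.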
        mc.put_object(upload_info['bucket'], upload_info['path'] + "/upload-test", io.BytesIO(b"something"), length=9)
        print(upload_info['bucket'])

    if blocks:
        # check size of blocks and clks consistent
        with open(blocks, 'rb') as f:
            block_counts = next(ijson.items(f, 'meta.source.clk_count.item'))
            msg = 'Size inconsistency: there are {} CLKs but {} encoding-to-blocks maps'.format(hash_count, block_counts)
            assert block_counts == hash_count, msg

        if upload_to_object_store and not to_entityservice:
            print('Anonlink client: Uploading to the external object store - MINIO')
            # upload to Minio
            progress1 = Progress()
            progress1.display_name = f'Upload {clk_json.split("/")[-1]}'
            mc.fput_object(upload_info['bucket'], upload_info['path'] + "/encodings.json", clk_json, progress=progress1,
                           metadata=encoding_metadata)

            progress2 = Progress()
            progress2.display_name = f'Upload {blocks.split("/")[-1]}'
            mc.fput_object(upload_info['bucket'], upload_info['path'] + "/blocks.json", blocks, progress=progress2)

            clk_file = upload_info['path'] + '/encodings.json'
            block_file = upload_info['path'] + '/blocks.json'

            # upload metadata to entity service
            to_entity_service = {
                'encodings': {'file': {'path': clk_file, 'bucket': upload_info['bucket']},
                              'credentials': credentials},
                'blocks': {'file': {'path': block_file, 'bucket': upload_info['bucket']},
                           'credentials': credentials}
            }
            to_entity_service_stream = io.StringIO()
            json.dump(to_entity_service, to_entity_service_stream)
            to_entity_service_stream.seek(0)
            response = rest_client.project_upload_clks(project, apikey, to_entity_service_stream)

        else:
            print('Anonlink client: Uploading to entity service')
            with open(clk_json, 'rb') as encodings:
                with open(blocks, 'rb') as blockings:
                    out = combine_clks_blocks(encodings, blockings)
                    response = rest_client.project_upload_clks(project, apikey, out)

    else:
        if upload_to_object_store and not to_entityservice:
            print('Anonlink client: Uploading to the external object store - MINIO')
            progress = Progress()
            progress.display_name = f'Upload {clk_json.split("/")[-1]}'
            mc.fput_object(upload_info['bucket'], upload_info['path'] + "/encodings.json", clk_json, progress=progress,
                           metadata=encoding_metadata)

            # upload metadata to entity service
            clk_file = upload_info['path'] + '/encodings.json'
            to_entity_service = {
                'encodings': {'file': {'path': clk_file, 'bucket': upload_info['bucket']},
                              'credentials': credentials}
            }
            to_entity_service_stream = io.StringIO()
            json.dump(to_entity_service, to_entity_service_stream)
            to_entity_service_stream.seek(0)
            response = rest_client.project_upload_clks(project, apikey, to_entity_service_stream)
        else:
            print('Anonlink client: Uploading to entity service')
            with open(clk_json, 'rb') as encodings:
                response = rest_client.project_upload_clks(project, apikey, encodings)

    if verbose:
        msg = '\n'.join(['{}: {}'.format(key, value) for key, value in response.items()])
        log(msg)

    json.dump(response, output)
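
The blocks-versus-CLKs consistency check above streams a single value out of the potentially large blocks file with ijson instead of loading it whole. A small self-contained illustration of that access pattern, using a made-up blocks document (only the meta.source.clk_count path is taken from the code above):

import io
import ijson

# Illustrative blocks document; real files also carry the block mapping itself.
blocks_doc = b'{"meta": {"source": {"clk_count": [1000]}}, "blocks": {}}'

with io.BytesIO(blocks_doc) as f:
    block_count = next(ijson.items(f, 'meta.source.clk_count.item'))

assert block_count == 1000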