Example #1
def test_project_binary_data_uploaded(requests, valid_project_params, binary_test_file_path):
    new_project_data = requests.post(url + '/projects',
                                     json={
                                         'schema': {},
                                         **valid_project_params
                                     }).json()
    update_tokens = new_project_data['update_tokens']
    expected_number_parties = get_expected_number_parties(valid_project_params)
    assert len(update_tokens) == expected_number_parties

    for token in update_tokens:
        upload_binary_data_from_file(
            requests,
            binary_test_file_path, new_project_data['project_id'], token, 1000)

    run_id = post_run(requests, new_project_data, 0.99)
    result = get_run_result(requests, new_project_data, run_id, wait=True)

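    # Every party uploaded the same file of 1000 encodings, so each of the
    # 1000 groups should hold one record per party, all sharing the same
    # record index.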
    if valid_project_params['result_type'] == 'groups':
        assert 'groups' in result
        groups = result['groups']
        assert len(groups) == 1000
        for group in groups:
            dataset_indices = {di for di, _ in group}
            record_indices = {ri for _, ri in group}
            assert len(record_indices) == 1
            assert dataset_indices == set(range(expected_number_parties))
        # Check every record is represented
        all_record_indices = {next(iter(group))[1] for group in groups}
        assert all_record_indices == set(range(1000))

def test_permutation(requests, the_truth):
    project_data, (r_a, r_b) = create_project_upload_data(
        requests, (the_truth['clks_a'], the_truth['clks_b']),
        result_type='permutations')
    run = post_run(requests, project_data, threshold=the_truth['threshold'])
    mask_result = get_run_result(requests, project_data, run, timeout=240)
    perm_a_result = get_run_result(requests,
                                   project_data,
                                   run,
                                   result_token=r_a['receipt_token'],
                                   wait=False)
    perm_b_result = get_run_result(requests,
                                   project_data,
                                   run,
                                   result_token=r_b['receipt_token'],
                                   wait=False)
    # compare permutations and mask against the ground-truth mapping
    permutation_a = inverse_of_permutation(perm_a_result['permutation'])
    permutation_b = inverse_of_permutation(perm_b_result['permutation'])
    groups = the_truth['groups']
    # Convert the true groups to a record-to-record mapping to simplify the checking.
    mapping = dict(anonlink.solving.pairs_from_groups(groups))

    # NB: Anonlink is more strict on enforcing the k parameter, so there
    # is a small chance the below won't hold. This should only be the
    # case for more noisy problems.
    for a, b, m in zip(permutation_a, permutation_b, mask_result['mask']):
        if m == 1:
            assert a in mapping, f"Unexpected link was included - run {run}"
            assert mapping[
                a] == b, f"Expected link from {a} was incorrect - run {run}"
        else:
            assert a not in mapping, f"Expected link was masked out - run {run}"

def test_groups_correctness(requests):
    # We assume that anonlink computes the right results.

    with open(DATA_PATH, 'rb') as f:
        # Here are some filters I prepared earlier.
        filters = pickle.load(f)

    candidate_pairs = anonlink.candidate_generation.find_candidate_pairs(
        filters, anonlink.similarities.dice_coefficient_accelerated, THRESHOLD)
    true_groups = anonlink.solving.greedy_solve(candidate_pairs)

    filter_size = len(filters[0][0])
    assert all(
        len(filter_) == filter_size for dataset in filters
        for filter_ in dataset)
    packed_filters = [
        b''.join(binary_pack_for_upload(f, filter_size)) for f in filters
    ]
    project_data, _ = create_project_upload_data(requests,
                                                 packed_filters,
                                                 result_type='groups',
                                                 binary=True,
                                                 hash_size=DATA_HASH_SIZE)
    try:
        run = post_run(requests, project_data, threshold=THRESHOLD)
        result_groups = get_run_result(requests, project_data, run)['groups']
    finally:
        delete_project(requests, project_data)

    # Compare the entity service result with anonlink.
    result_group_set = {frozenset(map(tuple, g)) for g in result_groups}
    true_group_set = set(map(frozenset, true_groups))
    assert result_group_set == true_group_set
Example #4
def test_run_status_without_clks(requests):
    project = create_project_no_data(requests)

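    # No CLKs have been uploaded yet, so the run cannot progress past 'created'.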
    run_id = post_run(requests, project, 0.9)
    status = get_run_status(requests, project, run_id)

    is_run_status(status)
    assert status['state'] == 'created'

def test_list_run_after_posting_runs(requests):
    with temporary_blank_project(requests, result_type='groups') as project:

        for i in range(1, 11):
            run_id = post_run(requests, project, 0.95)
            # Check run listing has changed
            runs = get_runs(requests, project)
            assert len(runs) == i

def _create_data_linkage_run(requests, result_type_number_parties):
    result_type, number_parties = result_type_number_parties
    project, _ = create_project_upload_fake_data(requests,
                                                 [100] * number_parties,
                                                 overlap=0.5,
                                                 result_type=result_type)
    run_id = post_run(requests, project, 1.0)
    return project, run_id

def test_posting_run_before_data_upload(requests, project):
    run_id = post_run(requests, project, 0.95)
    runs = get_runs(requests, project)

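    # Even without uploaded data, the run is accepted and listed in the
    # 'created' state.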
    assert len(runs) == 1
    run = runs[0]
    assert 'run_id' in run
    assert 'time_added' in run
    assert 'state' in run
    assert run['state'] == 'created'

def test_posting_run_after_data_upload(requests, project):
    run_id = post_run(requests, project, 0.95)
    runs = get_runs(requests, project)

    assert len(runs) == 1
    for run in runs:
        assert 'run_id' in run
        assert run['run_id'] == run_id
        assert 'time_added' in run
        assert 'state' in run

def test_run_similarity_score_results(requests, similarity_scores_project, threshold):
    run_id = post_run(requests, similarity_scores_project, threshold)
    result = get_run_result(requests, similarity_scores_project, run_id, timeout=240)
    assert 'similarity_scores' in result
    for (party_id_1, rec_id_1), (party_id_2, rec_id_2), score in result['similarity_scores']:
        assert 0.0 <= score <= 1.0
        assert 0 <= party_id_1
        assert 0 <= party_id_2
        assert party_id_1 != party_id_2
        assert 0 <= rec_id_1
        assert 0 <= rec_id_2
Example #10
def test_project_json_data_upload_with_mismatched_encoded_size(
        requests, result_type_number_parties):
    result_type, number_parties = result_type_number_parties

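    # The first party uploads encodings of a different size from the rest;
    # the service should reject the mismatch, so the run is expected to fail.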
    data = [generate_json_serialized_clks(500, 64 if i == 0 else 256)
            for i in range(number_parties)]

    new_project_data, _ = create_project_upload_data(
        requests, data, result_type=result_type)

    with pytest.raises(AssertionError):
        run_id = post_run(requests, new_project_data, 0.9)
        get_run_result(requests, new_project_data, run_id, wait=True)
Example #11
def test_project_json_data_upload_with_too_small_encoded_size(
        requests, result_type_number_parties):
    result_type, number_parties = result_type_number_parties
    new_project_data, _ = create_project_upload_fake_data(
        requests,
        [500] * number_parties,
        overlap=0.8,
        result_type=result_type,
        encoding_size=4
    )

    with pytest.raises(AssertionError):
        run_id = post_run(requests, new_project_data, 0.9)
        get_run_result(requests, new_project_data, run_id, wait=True)

def test_groups(requests, the_truth):
    project_data, _ = create_project_upload_data(
        requests, (the_truth['clks_a'], the_truth['clks_b']),
        result_type='groups')
    run = post_run(requests, project_data, threshold=the_truth['threshold'])
    result = get_run_result(requests, project_data, run, timeout=240)
    # compare the returned groups with the truth
    result_groups = result['groups']
    true_groups = the_truth['groups']

    result_groups = frozenset(
        frozenset(map(tuple, group)) for group in result_groups)
    true_groups = frozenset(map(frozenset, true_groups))

    assert result_groups == true_groups

def test_run_groups_results(requests, groups_project, threshold):
    run_id = post_run(requests, groups_project, threshold)
    result = get_run_result(requests, groups_project, run_id, timeout=240)
    
    assert 'groups' in result
    groups = result['groups']

    # All groups have at least two records
    assert all(len(g) >= 2 for g in groups)  
    
    # All records consist of a record index and dataset index
    assert all(all(len(i) == 2 for i in g) for g in groups)
    assert all(all(isinstance(i, int) and isinstance(j, int)
                   for i, j in g)
               for g in groups)

def test_project_json_data_upload_with_invalid_encoded_size(
        requests, result_type_number_parties):
    result_type, number_parties = result_type_number_parties
    new_project_data, _ = create_project_upload_fake_data(
        requests,
        [500] * number_parties,
        overlap=0.8,
        result_type=result_type,
        encoding_size=20  # not multiple of 8
    )

    with pytest.raises(AssertionError):
        run_id = post_run(requests, new_project_data, 0.9)
        get_run_result(requests,
                       new_project_data,
                       run_id,
                       wait=True,
                       timeout=240)

def test_project_binary_data_upload_with_different_encoded_size(
        requests, encoding_size, valid_project_params):
    expected_number_parties = get_expected_number_parties(valid_project_params)
    new_project_data = requests.post(url + '/projects',
                                     json={
                                         'schema': {},
                                         **valid_project_params
                                     }).json()

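    # One shared encoding is appended to party 0's data and prepended to every
    # other party's, so the linkage should find the group
    # {(0, 499), (1, 0), (2, 0), ...}.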
    common = next(
        binary_pack_for_upload(generate_clks(1, encoding_size), encoding_size))

    data = []
    for i in range(expected_number_parties):
        generated_clks = generate_clks(499, encoding_size)
        packed_clks = binary_pack_for_upload(generated_clks, encoding_size)
        packed_joined = b''.join(packed_clks)
        packed_with_common = (packed_joined + common if i == 0
                              else common + packed_joined)
        data.append(packed_with_common)

    project_id = new_project_data['project_id']
    for d, token in zip(data, new_project_data['update_tokens']):
        assert len(d) == 500 * encoding_size
        upload_binary_data(requests,
                           d,
                           project_id,
                           token,
                           500,
                           size=encoding_size)

    run_id = post_run(requests, new_project_data, 0.99)
    result = get_run_result(requests,
                            new_project_data,
                            run_id,
                            wait=True,
                            timeout=240)
    if valid_project_params['result_type'] == 'groups':
        assert 'groups' in result
        groups = result['groups']
        groups_set = {frozenset(map(tuple, group)) for group in groups}
        common_set = frozenset(
            (i, 499 if i == 0 else 0) for i in range(expected_number_parties))
        assert common_set in groups_set

def test_run_permutations_results(requests, permutations_project, threshold):
    run_id = post_run(requests, permutations_project, threshold)
    mask_result = get_run_result(requests, permutations_project, run_id, timeout=240)
    assert 'mask' in mask_result
    assert len(mask_result['mask']) == min(permutations_project['size'])

    # Get results using receipt_token A and B
    token1 = permutations_project['dp_responses'][0]['receipt_token']
    result1 = get_run_result(requests, permutations_project, run_id, token1, wait=False)
    assert 'permutation' in result1
    assert 'rows' in result1
    assert result1['rows'] == len(mask_result['mask'])

    token2 = permutations_project['dp_responses'][1]['receipt_token']
    result2 = get_run_result(requests, permutations_project, run_id, token2, wait=False)
    assert 'permutation' in result2
    assert 'rows' in result2
    assert result2['rows'] == result1['rows']
    assert result2['rows'] == len(mask_result['mask'])
Example #17
def test_project_json_data_upload_with_various_encoded_sizes(
        requests,
        encoding_size, result_type_number_parties):
    result_type, number_parties = result_type_number_parties
    new_project_data, _ = create_project_upload_fake_data(
        requests,
        [500] * number_parties,
        overlap=0.8,
        result_type=result_type,
        encoding_size=encoding_size
    )

    run_id = post_run(requests, new_project_data, 0.9)
    result = get_run_result(requests, new_project_data, run_id, wait=True)
    if result_type == 'groups':
        assert 'groups' in result
        # This is a pretty bad bound, but we're not testing the
        # accuracy.
        assert len(result['groups']) >= 400

def test_similarity_scores(requests, the_truth):
    project_data, _ = create_project_upload_data(
        requests,
        (the_truth['clks_a'], the_truth['clks_b']),
        result_type='similarity_scores')
    run = post_run(requests, project_data, threshold=the_truth['threshold'])
    result = get_run_result(requests, project_data, run, timeout=60)
    
    true_scores = the_truth['similarity_scores']
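    # Key each reported score by the record indices of the two endpoints
    # (sorted by dataset id, dropping the dataset ids) so it can be compared
    # against the two-party ground truth.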
    result_scores = {tuple(index for _, index in sorted([a, b])): score
                     for a, b, score in result['similarity_scores']}

    # Anonlink is more strict on enforcing the k parameter. Hence the
    # subset.
    assert true_scores.keys() <= result_scores.keys()

    for pair in true_scores:
        assert true_scores[pair] == result_scores[pair]

    delete_project(requests, project_data)

def test_run_mapping_results_no_data(requests):
    empty_project = create_project_no_data(requests)
    run_id = post_run(requests, empty_project, 0.95)
    get_run_result(requests, empty_project, run_id, expected_status=404, wait=False)
Example #20
def test_project_upload_external_data(requests, a_blocking_project, binary_test_file_path):
    project = a_blocking_project
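    # Each of the 1000 encodings gets up to two block ids
    # (its index mod 3 and mod 13).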
    blocking_data = json.dumps(
        {str(encoding_id): list({str(encoding_id % 3), str(encoding_id % 13)}) for encoding_id in range(1000)}).encode()

    mc, upload_info = get_temp_upload_client(project, requests, project['update_tokens'][0])

    _upload_encodings_and_blocks(mc, upload_info, blocking_data, binary_test_file_path)

    # Should be able to notify the service that we've uploaded data
    res = requests.post(url + f"projects/{project['project_id']}/clks",
                        headers={'Authorization': project['update_tokens'][0]},
                        json={
                            'encodings': {
                                'file': {
                                    'bucket': upload_info['bucket'],
                                    'path': upload_info['path'] + "/encodings",
                                }
                            },
                            'blocks': {
                                'file': {
                                    'bucket': upload_info['bucket'],
                                    'path': upload_info['path'] + "/blocks",
                                }
                            }

                        }
                       )
    assert res.status_code == 201

    # If the second data provider uses the same path to upload data, that shouldn't work
    res2 = requests.post(url + f"projects/{project['project_id']}/clks",
                        headers={'Authorization': project['update_tokens'][1]},
                        json={
                            'encodings': {
                                'file': {
                                    'bucket': upload_info['bucket'],
                                    'path': upload_info['path'] + "/encodings",
                                }
                            },
                            'blocks': {
                                'file': {
                                    'bucket': upload_info['bucket'],
                                    'path': upload_info['path'] + "/blocks",
                                }
                            }

                        }
                       )
    assert res2.status_code == 403

    mc2, upload_info2 = get_temp_upload_client(project, requests, project['update_tokens'][1])
    _upload_encodings_and_blocks(mc2, upload_info2, blocking_data, binary_test_file_path)

    # If the second data provider uses the correct path to upload data, that should work
    res3 = requests.post(url + f"projects/{project['project_id']}/clks",
                        headers={'Authorization': project['update_tokens'][1]},
                        json={
                            'encodings': {
                                'file': {
                                    'bucket': upload_info2['bucket'],
                                    'path': upload_info2['path'] + "/encodings",
                                }
                            },
                            'blocks': {
                                'file': {
                                    'bucket': upload_info2['bucket'],
                                    'path': upload_info2['path'] + "/blocks",
                                }
                            }
                        }
                       )
    assert res3.status_code == 201
    run_id = post_run(requests, project, threshold=0.95)
    result = get_run_result(requests, project, run_id, timeout=120)
    assert 'groups' in result