def test_delete_project_after_creating_run_with_clks(
        requests, result_type_number_parties):
    """Deleting a project must succeed even after CLKs and a run exist."""
    result_type, number_parties = result_type_number_parties
    dataset_sizes = [100] * number_parties
    project, _ = create_project_upload_fake_data(
        requests, dataset_sizes, overlap=0.5, result_type=result_type)
    post_run(requests, project, 0.9)
    delete_project(requests, project)
def test_project_binary_data_upload_with_different_encoded_size(
        requests, encoding_size):
    """Two parties sharing exactly one encoding must be linked 499 -> 0."""
    new_project_data = requests.post(url + '/projects', json={
        'schema': {},
        'result_type': 'mapping',
    }).json()

    party_1_packed = b''.join(
        binary_pack_filters(generate_clks(499, encoding_size), encoding_size))
    party_2_packed = b''.join(
        binary_pack_filters(generate_clks(499, encoding_size), encoding_size))
    # The single shared encoding: appended to party 1's upload (index 499)
    # and prepended to party 2's upload (index 0).
    shared_entity = next(
        binary_pack_filters(generate_clks(1, encoding_size), encoding_size))

    upload_binary_data(
        requests, party_1_packed + shared_entity,
        new_project_data['project_id'],
        new_project_data['update_tokens'][0], 500, encoding_size)
    upload_binary_data(
        requests, shared_entity + party_2_packed,
        new_project_data['project_id'],
        new_project_data['update_tokens'][1], 500, encoding_size)

    run_id = post_run(requests, new_project_data, 0.99)
    result = get_run_result(requests, new_project_data, run_id, wait=True)
    assert 'mapping' in result
    # The only common record sits at index 499 of party 1 and 0 of party 2.
    assert result['mapping']['499'] == '0'
def test_project_binary_data_uploaded(requests):
    """Uploading the identical binary file for both parties maps row i to i."""
    new_project_data = requests.post(url + '/projects', json={
        'schema': {},
        'result_type': 'mapping',
    }).json()

    small_file_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        'testdata/clks_128B_1k.bin')
    # Same file for both data providers.
    for party in (0, 1):
        upload_binary_data_from_file(
            requests, small_file_path, new_project_data['project_id'],
            new_project_data['update_tokens'][party], 1000)

    run_id = post_run(requests, new_project_data, 0.99)
    result = get_run_result(requests, new_project_data, run_id, wait=True)

    assert 'mapping' in result
    # Since we uploaded the same file it should have identified the same
    # rows as matches
    for row in range(1, 1000):
        key = str(row)
        assert key in result['mapping']
        assert result['mapping'][key] == key
def test_groups_correctness(requests):
    """The service's 'groups' result must match anonlink run locally.

    We assume that anonlink computes the right results.
    """
    with open(DATA_PATH, 'rb') as f:
        # Here's some filters I prepared earlier.
        filters = pickle.load(f)

    candidate_pairs = anonlink.candidate_generation.find_candidate_pairs(
        filters, anonlink.similarities.dice_coefficient_accelerated,
        THRESHOLD)
    true_groups = anonlink.solving.greedy_solve(candidate_pairs)

    # Sanity check: all filters must have a uniform size before packing.
    filter_size = len(filters[0][0])
    for dataset in filters:
        for filter_ in dataset:
            assert len(filter_) == filter_size

    packed_filters = [b''.join(binary_pack_filters(f, filter_size))
                      for f in filters]
    project_data, _ = create_project_upload_data(
        requests, packed_filters,
        result_type='groups', binary=True, hash_size=DATA_HASH_SIZE)
    try:
        run = post_run(requests, project_data, threshold=THRESHOLD)
        result_groups = get_run_result(requests, project_data, run)['groups']
    finally:
        delete_project(requests, project_data)

    # Compare ES result with anonlink. Order is irrelevant, so compare as
    # sets of frozensets.
    result_group_set = {frozenset(map(tuple, g)) for g in result_groups}
    true_group_set = set(map(frozenset, true_groups))
    assert result_group_set == true_group_set
def test_permutation(requests, the_truth):
    """Permutations plus mask must reproduce the ground-truth mapping."""
    project_data, (r_a, r_b) = create_project_upload_data(
        requests, (the_truth['clks_a'], the_truth['clks_b']),
        result_type='permutations')
    run = post_run(requests, project_data, threshold=the_truth['threshold'])
    mask_result = get_run_result(requests, project_data, run, timeout=60)
    perm_a_result = get_run_result(
        requests, project_data, run,
        result_token=r_a['receipt_token'], wait=False)
    perm_b_result = get_run_result(
        requests, project_data, run,
        result_token=r_b['receipt_token'], wait=False)

    # compare permutations and mask against mapping of the truth
    permutation_a = inverse_of_permutation(perm_a_result['permutation'])
    permutation_b = inverse_of_permutation(perm_b_result['permutation'])
    mapping = the_truth['mapping']
    # NB: Anonlink is more strict on enforcing the k parameter, so there
    # is a small chance the below won't hold. This should only be the
    # case for more noisy problems.
    for a, b, m in zip(permutation_a, permutation_b, mask_result['mask']):
        if m != 1:
            assert a not in mapping, \
                f"Expected link was masked out - run {run}"
            continue
        assert a in mapping, f"Unexpected link was included - run {run}"
        assert mapping[a] == b, \
            f"Expected link from {a} was incorrect - run {run}"
def test_run_description_no_data(requests, project):
    """A run posted to a data-free project still exposes its basic fields."""
    run_id = post_run(requests, project, 0.95)
    run = get_run(requests, project, run_id)
    for field in ('run_id', 'notes', 'threshold'):
        assert field in run
def test_run_permutations_results(requests, permutations_project, threshold):
    """Both parties' permutations must agree in length with the mask."""
    run_id = post_run(requests, permutations_project, threshold)
    mask_result = get_run_result(requests, permutations_project, run_id)

    assert 'mask' in mask_result
    mask_length = len(mask_result['mask'])
    # The mask is as long as the smaller dataset.
    assert mask_length == min(permutations_project['size'])

    # Get results using receipt_token A and B
    row_counts = []
    for dp in (0, 1):
        token = permutations_project['dp_responses'][dp]['receipt_token']
        result = get_run_result(
            requests, permutations_project, run_id, token, wait=False)
        assert 'permutation' in result
        assert 'rows' in result
        assert result['rows'] == mask_length
        row_counts.append(result['rows'])
    assert row_counts[0] == row_counts[1]
def test_run_mapping_results(requests, mapping_project):
    """After roughly the expected run time, results contain a dict mapping."""
    run_id = post_run(requests, mapping_project, 0.95)
    # Give the service approximately long enough to finish the run.
    wait_approx_run_time(mapping_project['size'])
    result = get_run_result(requests, mapping_project, run_id)
    assert 'mapping' in result
    mapping = result['mapping']
    assert isinstance(mapping, dict)
def test_run_status_without_clks(requests):
    """A run posted before any CLK upload must stay in the 'created' state."""
    empty_project = create_project_no_data(requests)
    new_run_id = post_run(requests, empty_project, 0.9)
    status = get_run_status(requests, empty_project, new_run_id)
    is_run_status(status)
    assert status['state'] == 'created'
def test_list_run_after_posting_runs(requests):
    """The run listing must grow by exactly one after each posted run."""
    with temporary_blank_project(requests, result_type='mapping') as project:
        for expected_count in range(1, 11):
            post_run(requests, project, 0.95)
            # Check run listing has changed
            assert len(get_runs(requests, project)) == expected_count
def test_run_mapping_results_no_data(requests):
    """Requesting results for a run on an empty project must yield 404."""
    project_without_data = create_project_no_data(requests)
    run_id = post_run(requests, project_without_data, 0.95)
    get_run_result(requests, project_without_data, run_id,
                   expected_status=404, wait=False)
def test_run_description(requests, result_type):
    """A freshly posted run exposes its id, notes and threshold."""
    project, dp1, dp2 = create_project_upload_fake_data(
        requests, [100, 100], overlap=0.5, result_type=result_type)
    run_id = post_run(requests, project, 0.98)
    run = get_run(requests, project, run_id)
    for expected_key in ('run_id', 'notes', 'threshold'):
        assert expected_key in run
def test_project_json_data_upload_with_too_small_encoded_size(requests):
    """Encodings far below the minimum size must make the run fail."""
    new_project_data, r1, r2 = create_project_upload_fake_data(
        requests, [500, 500], overlap=0.95, result_type='mapping',
        encoding_size=4)

    # The run is expected to error out; waiting on its result surfaces
    # that as an AssertionError from the helper.
    with pytest.raises(AssertionError):
        failing_run = post_run(requests, new_project_data, 0.9)
        get_run_result(requests, new_project_data, failing_run, wait=True)
def test_posting_run_before_data_upload(requests, project):
    """A run posted before data upload is listed and sits in 'created'."""
    run_id = post_run(requests, project, 0.95)
    runs = get_runs(requests, project)
    assert len(runs) == 1
    (listed_run,) = runs
    for key in ('run_id', 'time_added', 'state'):
        assert key in listed_run
    assert listed_run['state'] == 'created'
def test_posting_run_after_data_upload(requests, project):
    """The posted run appears in the listing with a matching id."""
    run_id = post_run(requests, project, 0.95)
    runs = get_runs(requests, project)
    assert len(runs) == 1
    for listed_run in runs:
        for key in ('run_id', 'time_added', 'state'):
            assert key in listed_run
        assert listed_run['run_id'] == run_id
def test_project_json_data_upload_with_mismatched_encoded_size(requests):
    """Parties uploading different encoding sizes must make the run fail."""
    d1 = generate_json_serialized_clks(500, 64)
    d2 = generate_json_serialized_clks(500, 256)
    new_project_data, r1, r2 = create_project_upload_data(
        requests, d1, d2, result_type='mapping')

    # Mismatched sizes error the run; waiting on the result surfaces that
    # as an AssertionError from the helper.
    with pytest.raises(AssertionError):
        bad_run = post_run(requests, new_project_data, 0.9)
        get_run_result(requests, new_project_data, bad_run, wait=True)
def test_mapping(requests, the_truth):
    """The service's mapping must equal the precomputed ground truth."""
    project_data, _, _ = create_project_upload_data(
        requests, the_truth['clks_a'], the_truth['clks_b'],
        result_type='mapping')
    run = post_run(requests, project_data, threshold=the_truth['threshold'])
    result = get_run_result(requests, project_data, run)

    # compare mapping with the truth
    mapping = {int(k): int(v) for k, v in result['mapping'].items()}
    assert mapping.keys() == the_truth['mapping'].keys()
    for key, value in mapping.items():
        assert value == the_truth['mapping'][key]
        assert (the_truth['entity_ids_a'][key]
                == the_truth['entity_ids_b'][value])
def test_project_json_data_upload_with_various_encoded_sizes(
        requests, encoding_size):
    """Every valid encoding size should yield a near-complete mapping."""
    new_project_data, r1, r2 = create_project_upload_fake_data(
        requests, [500, 500], overlap=0.95, result_type='mapping',
        encoding_size=encoding_size)

    run_id = post_run(requests, new_project_data, 0.9)
    result = get_run_result(requests, new_project_data, run_id, wait=True)
    assert 'mapping' in result
    # 95% overlap of 500 records -> expect at least 475 matches.
    assert len(result['mapping']) >= 475
def test_project_json_data_upload_with_mismatched_encoded_size(
        requests, result_type_number_parties):
    """Mixed encoding sizes across parties must make the run fail."""
    result_type, number_parties = result_type_number_parties
    # First party uses 64-bit encodings, every other party 256-bit.
    sizes = [64] + [256] * (number_parties - 1)
    data = [generate_json_serialized_clks(500, size) for size in sizes]
    new_project_data, _ = create_project_upload_data(
        requests, data, result_type=result_type)

    # The run errors out; waiting on its result surfaces that as an
    # AssertionError from the helper.
    with pytest.raises(AssertionError):
        run_id = post_run(requests, new_project_data, 0.9)
        get_run_result(requests, new_project_data, run_id, wait=True)
def test_project_json_data_upload_with_too_small_encoded_size(
        requests, result_type_number_parties):
    """Tiny (4-byte) encodings must make the run fail for all result types."""
    result_type, number_parties = result_type_number_parties
    new_project_data, _ = create_project_upload_fake_data(
        requests, [500] * number_parties, overlap=0.8,
        result_type=result_type, encoding_size=4)

    # The run errors out; waiting on its result surfaces that as an
    # AssertionError from the helper.
    with pytest.raises(AssertionError):
        run_id = post_run(requests, new_project_data, 0.9)
        get_run_result(requests, new_project_data, run_id, wait=True)
def test_similarity_scores(requests, the_truth):
    """Similarity scores from the service must match the ground truth."""
    project_data, _, _ = create_project_upload_data(
        requests, the_truth['clks_a'], the_truth['clks_b'],
        result_type='similarity_scores')
    run = post_run(requests, project_data, threshold=the_truth['threshold'])
    result = get_run_result(requests, project_data, run, timeout=60)

    # compare the result with the truth
    service_scores = result['similarity_scores']
    truth_scores = the_truth['similarity_scores']
    assert len(service_scores) == len(truth_scores)
    for es_score, true_score in zip(service_scores, truth_scores):
        # NOTE(review): truth rows appear to be (a, score, b) while
        # service rows are (a, b, score) — confirm against the fixture.
        assert es_score[0] == true_score[0]
        assert es_score[1] == true_score[2]
        assert es_score[2] == pytest.approx(true_score[1], 1e-10), \
            'similarity scores are different'
    delete_project(requests, project_data)
def test_run_description(requests, result_type_number_parties):
    """The run description must echo the threshold it was posted with."""
    THRESHOLD = .98
    result_type, number_parties = result_type_number_parties
    project, _ = create_project_upload_fake_data(
        requests, [100] * number_parties, overlap=0.5,
        result_type=result_type)

    run_id = post_run(requests, project, THRESHOLD)
    run = get_run(requests, project, run_id)
    for key in ('run_id', 'notes'):
        assert key in run
    assert run['threshold'] == THRESHOLD
def test_run_groups_results(requests, groups_project, threshold):
    """Each group has >= 2 members, each being a (dataset, record) int pair."""
    run_id = post_run(requests, groups_project, threshold)
    result = get_run_result(requests, groups_project, run_id)
    assert 'groups' in result
    for group in result['groups']:
        # All groups have at least two records
        assert len(group) >= 2
        # All records consist of a record index and dataset index
        for member in group:
            assert len(member) == 2
            dataset_index, record_index = member
            assert isinstance(dataset_index, int)
            assert isinstance(record_index, int)
def test_groups(requests, the_truth):
    """Service groups must exactly equal the ground-truth groups."""
    project_data, _ = create_project_upload_data(
        requests, (the_truth['clks_a'], the_truth['clks_b']),
        result_type='groups')
    run = post_run(requests, project_data, threshold=the_truth['threshold'])
    result = get_run_result(requests, project_data, run)

    # compare groups with the truth; ordering within and between groups is
    # irrelevant, so compare as sets of frozensets.
    found = frozenset(
        frozenset(tuple(record) for record in group)
        for group in result['groups'])
    expected = frozenset(frozenset(group) for group in the_truth['groups'])
    assert found == expected
def test_mapping(requests, the_truth):
    """Service mapping must equal the ground truth."""
    project_data, _ = create_project_upload_data(
        requests, (the_truth['clks_a'], the_truth['clks_b']),
        result_type='mapping')
    run = post_run(requests, project_data, threshold=the_truth['threshold'])
    result = get_run_result(requests, project_data, run)

    # compare mapping with the truth
    mapping = {int(k): int(v) for k, v in result['mapping'].items()}
    # NB: Anonlink is more strict on enforcing the k parameter, so there
    # is a small chance the below won't hold. This should only be the
    # case for more noisy problems.
    assert mapping.keys() == the_truth['mapping'].keys()
    for key, value in mapping.items():
        assert value == the_truth['mapping'][key]
        assert (the_truth['entity_ids_a'][key]
                == the_truth['entity_ids_b'][value])
def test_permutation(requests, the_truth):
    """Permutations plus mask must reproduce the ground-truth mapping."""
    project_data, r_a, r_b = create_project_upload_data(
        requests, the_truth['clks_a'], the_truth['clks_b'],
        result_type='permutations')
    run = post_run(requests, project_data, threshold=the_truth['threshold'])
    mask_result = get_run_result(requests, project_data, run, timeout=60)
    perm_a_result = get_run_result(
        requests, project_data, run,
        result_token=r_a['receipt_token'], wait=False)
    perm_b_result = get_run_result(
        requests, project_data, run,
        result_token=r_b['receipt_token'], wait=False)

    # compare permutations and mask against mapping of the truth
    permutation_a = inverse_of_permutation(perm_a_result['permutation'])
    permutation_b = inverse_of_permutation(perm_b_result['permutation'])
    mapping = the_truth['mapping']
    for a, b, m in zip(permutation_a, permutation_b, mask_result['mask']):
        if m != 1:
            assert a not in mapping, \
                f"Expected link was masked out - run {run}"
            continue
        assert a in mapping, f"Unexpected link was included - run {run}"
        assert mapping[a] == b, \
            f"Expected link from {a} was incorrect - run {run}"
def test_similarity_scores(requests, the_truth):
    """Ground-truth scores must be a subset of the service's scores."""
    project_data, _ = create_project_upload_data(
        requests, (the_truth['clks_a'], the_truth['clks_b']),
        result_type='similarity_scores')
    run = post_run(requests, project_data, threshold=the_truth['threshold'])
    result = get_run_result(requests, project_data, run, timeout=60)

    true_scores = the_truth['similarity_scores']
    result_scores = {(a, b): sim
                     for a, b, sim in result['similarity_scores']}
    # Anonlink is more strict on enforcing the k parameter. Hence the
    # subset.
    assert true_scores.keys() <= result_scores.keys()
    for pair, expected_sim in true_scores.items():
        assert expected_sim == result_scores[pair]
    delete_project(requests, project_data)
def test_project_binary_data_upload_with_different_encoded_size(
        requests, encoding_size, valid_project_params):
    """All parties sharing one encoding must be linked across all of them."""
    expected_number_parties = get_expected_number_parties(valid_project_params)
    new_project_data = requests.post(url + '/projects', json={
        'schema': {},
        **valid_project_params
    }).json()

    # The single shared encoding: last record of party 0, first record of
    # every other party.
    common = next(binary_pack_filters(generate_clks(1, encoding_size),
                                      encoding_size))

    data = []
    for i in range(expected_number_parties):
        own_records = b''.join(binary_pack_filters(
            generate_clks(499, encoding_size), encoding_size))
        if i == 0:
            data.append(own_records + common)
        else:
            data.append(common + own_records)

    project_id = new_project_data['project_id']
    for d, token in zip(data, new_project_data['update_tokens']):
        assert len(d) == 500 * encoding_size
        upload_binary_data(
            requests, d, project_id, token, 500, size=encoding_size)

    run_id = post_run(requests, new_project_data, 0.99)
    result = get_run_result(requests, new_project_data, run_id, wait=True)

    if valid_project_params['result_type'] == 'mapping':
        assert 'mapping' in result
        assert result['mapping']['499'] == '0'
    elif valid_project_params['result_type'] == 'groups':
        assert 'groups' in result
        groups_set = {frozenset(map(tuple, group))
                      for group in result['groups']}
        # The common record sits at index 499 for party 0, index 0 for the
        # rest.
        common_set = frozenset(
            (i, 499 if i == 0 else 0)
            for i in range(expected_number_parties))
        assert common_set in groups_set
def test_project_binary_data_uploaded(requests, valid_project_params):
    """Identical uploads for every party must match row i with row i."""
    new_project_data = requests.post(url + '/projects', json={
        'schema': {},
        **valid_project_params
    }).json()
    update_tokens = new_project_data['update_tokens']
    expected_number_parties = get_expected_number_parties(valid_project_params)
    assert len(update_tokens) == expected_number_parties

    small_file_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        'testdata/clks_128B_1k.bin')
    # Same file for every data provider.
    for token in update_tokens:
        upload_binary_data_from_file(
            requests, small_file_path,
            new_project_data['project_id'], token, 1000)

    run_id = post_run(requests, new_project_data, 0.99)
    result = get_run_result(requests, new_project_data, run_id, wait=True)

    result_type = valid_project_params['result_type']
    if result_type == 'mapping':
        assert 'mapping' in result
        # Since we uploaded the same file it should have identified the
        # same rows as matches
        for i in range(1, 1000):
            assert str(i) in result['mapping']
            assert result['mapping'][str(i)] == str(i)
    elif result_type == 'groups':
        assert 'groups' in result
        groups = result['groups']
        assert len(groups) == 1000
        for group in groups:
            # Each group: one record index, present once per dataset.
            dataset_indices = {di for di, _ in group}
            record_indices = {ri for _, ri in group}
            assert len(record_indices) == 1
            assert dataset_indices == set(range(expected_number_parties))
        # Check every record is represented
        all_record_indices = {next(iter(group))[1] for group in groups}
        assert all_record_indices == set(range(1000))
def test_project_json_data_upload_with_various_encoded_sizes(
        requests, encoding_size, result_type_number_parties):
    """Every valid encoding size should still find most of the overlap."""
    result_type, number_parties = result_type_number_parties
    new_project_data, _ = create_project_upload_fake_data(
        requests, [500] * number_parties, overlap=0.8,
        result_type=result_type, encoding_size=encoding_size)

    run_id = post_run(requests, new_project_data, 0.9)
    result = get_run_result(requests, new_project_data, run_id, wait=True)

    if result_type == 'mapping':
        assert 'mapping' in result
        assert len(result['mapping']) >= 400
    elif result_type == 'groups':
        assert 'groups' in result
        # This is a pretty bad bound, but we're not testing the
        # accuracy.
        assert len(result['groups']) >= 400