def get_similarity_scores(filename):
    """
    Read a CSV file from the object store containing the similarity scores and return
    a response that will stream the similarity scores.

    :param filename: name of the CSV file, obtained from the `similarity_scores` table
    :return: the similarity scores in a streaming JSON response.
    """
    mc = connect_to_object_store()
    details = mc.stat_object(config.MINIO_BUCKET, filename)
    logger.info("Starting download stream of similarity scores.",
                filename=filename, filesize=details.size)
    try:
        csv_data_stream = iterable_to_stream(
            mc.get_object(config.MINIO_BUCKET, filename).stream())
        # Process the CSV into JSON
        csv_text_stream = io.TextIOWrapper(csv_data_stream, encoding="utf-8")
        return Response(generate_scores(csv_text_stream),
                        mimetype='application/json')
    except urllib3.exceptions.ResponseError:
        logger.warning(
            "Attempt to read the similarity scores file failed with an error response.",
            filename=filename)
        safe_fail_request(500, "Failed to retrieve similarity scores")
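
# `generate_scores` is defined elsewhere in this codebase. The generator below
# is a hypothetical sketch only, assuming each CSV row is an
# "index_a,index_b,score" triple; the real field layout and JSON shape may
# differ. It illustrates how a generator lets Flask stream the JSON response
# instead of materialising it in memory.
def _example_generate_scores(csv_text_stream):
    """Yield a JSON document piece by piece for a streaming Response."""
    import csv
    import json

    reader = csv.reader(csv_text_stream)
    yield '{"similarity_scores": ['
    first = True
    for row in reader:
        if not first:
            yield ', '
        first = False
        # Each CSV row becomes one element of the JSON array.
        yield json.dumps(row)
    yield ']}'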
def _put_placeholder_empty_file(mc, log):
    sims = array.array('d')
    dset_is0 = array.array('I')
    dset_is1 = array.array('I')
    rec_is0 = array.array('I')
    rec_is1 = array.array('I')
    candidate_pairs = sims, (dset_is0, dset_is1), (rec_is0, rec_is1)
    empty_file_iter, empty_file_size \
        = anonlink.serialization.dump_candidate_pairs_iter(candidate_pairs)
    empty_file_name = Config.SIMILARITY_SCORES_FILENAME_FMT.format(
        generate_code(12))
    empty_file_stream = iterable_to_stream(empty_file_iter)
    try:
        mc.put_object(Config.MINIO_BUCKET, empty_file_name,
                      empty_file_stream, empty_file_size)
    except minio.ResponseError:
        log.warning("Failed to store empty result in minio.")
        raise
    return 0, empty_file_size, empty_file_name
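
# `iterable_to_stream` is a shared helper used throughout these functions to
# turn a bytes iterator into a file-like object that MinIO's put_object can
# read from. The adapter below is an illustrative sketch of one common way to
# implement such a helper, not necessarily the implementation used here.
def _example_iterable_to_stream(iterable, buffer_size=io.DEFAULT_BUFFER_SIZE):
    iterator = iter(iterable)

    class _IterStream(io.RawIOBase):
        def __init__(self):
            self.leftover = b''

        def readable(self):
            return True

        def readinto(self, b):
            try:
                chunk = self.leftover or next(iterator)
            except StopIteration:
                return 0  # signal EOF to the buffered reader
            # Copy as much as fits into the caller's buffer, keep the rest.
            output, self.leftover = chunk[:len(b)], chunk[len(b):]
            b[:len(output)] = output
            return len(output)

    return io.BufferedReader(_IterStream(), buffer_size=buffer_size)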
def _merge_files(mc, log, file0, file1):
    num0, filesize0, filename0 = file0
    num1, filesize1, filename1 = file1
    total_num = num0 + num1
    file0_stream = mc.get_object(Config.MINIO_BUCKET, filename0)
    file1_stream = mc.get_object(Config.MINIO_BUCKET, filename1)
    merged_file_iter, merged_file_size \
        = anonlink.serialization.merge_streams_iter(
            (file0_stream, file1_stream), sizes=(filesize0, filesize1))
    merged_file_name = Config.SIMILARITY_SCORES_FILENAME_FMT.format(
        generate_code(12))
    merged_file_stream = iterable_to_stream(merged_file_iter)
    try:
        mc.put_object(Config.MINIO_BUCKET, merged_file_name,
                      merged_file_stream, merged_file_size)
    except minio.ResponseError:
        log.warning("Failed to store merged result in minio.")
        raise
    for del_err in mc.remove_objects(Config.MINIO_BUCKET,
                                     (filename0, filename1)):
        log.warning(f"Failed to delete result file "
                    f"{del_err.object_name}. {del_err}")
    return total_num, merged_file_size, merged_file_name
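
# Illustrative only: the two helpers above are naturally applied pairwise to
# fold a list of per-chunk result files down to a single similarity scores
# file. The driver below is a hypothetical sketch of that reduction, not the
# actual aggregation task in this codebase.
def _example_merge_all_results(mc, log, files):
    """files: list of (num_results, file_size, filename) triples."""
    if not files:
        # No chunk produced any pairs: store an empty placeholder file.
        return _put_placeholder_empty_file(mc, log)
    merged = files[0]
    for next_file in files[1:]:
        # Each step writes a new merged object and deletes the two inputs.
        merged = _merge_files(mc, log, merged, next_file)
    return merged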
def handle_raw_upload(project_id, dp_id, receipt_token, parent_span=None):
    # User has uploaded base64 encodings as JSON
    log = logger.bind(pid=project_id, dp_id=dp_id)
    log.info("Handling user provided base64 encodings")

    with DBConn() as db:
        if not check_project_exists(db, project_id):
            log.info("Project deleted, stopping immediately")
            return
        expected_count = get_number_of_hashes(db, dp_id)

    log.info(f"Expecting to handle {expected_count} encodings")
    mc = connect_to_object_store()

    # Input file is line separated base64 record encodings.
    raw_file = Config.RAW_FILENAME_FMT.format(receipt_token)
    raw_data_response = mc.get_object(Config.MINIO_BUCKET, raw_file)

    # Set up streaming processing pipeline
    buffered_stream = iterable_to_stream(raw_data_response.stream())
    text_stream = io.TextIOWrapper(buffered_stream, newline='\n')

    clkcounts = []

    def filter_generator():
        log.debug("Deserializing json filters")
        first_encoding_size = None
        for i, line in enumerate(text_stream):
            ba = deserialize_bitarray(line)
            yield (ba, i, ba.count())
            clkcounts.append(ba.count())
            encsize = len(ba)
            if i == 0:
                first_encoding_size = encsize
            if encsize != first_encoding_size:
                raise ValueError("Encodings were not all the same size")
        log.info(f"Processed {len(clkcounts)} hashes")

    # We peek at the first element as we need the encoding size
    # for the rest of our processing pipeline
    python_filters = more_itertools.peekable(filter_generator())

    # Note the len of a bitarray is returned in bits but we require
    # this to be a multiple of 8 so we use bytes.
    uploaded_encoding_size = len(python_filters.peek()[0]) // 8

    # This is the first time we've seen the encoding size from this data provider
    try:
        check_dataproviders_encoding(project_id, uploaded_encoding_size)
    except InvalidEncodingError as e:
        log.warning(e.args[0])
        handle_invalid_encoding_data(project_id, dp_id)

    with DBConn() as db:
        # Save the encoding size as metadata
        update_encoding_metadata_set_encoding_size(db, dp_id, uploaded_encoding_size)

    # Output file is our custom binary packed file
    filename = Config.BIN_FILENAME_FMT.format(receipt_token)
    bit_packed_element_size = binary_format(uploaded_encoding_size).size
    num_bytes = expected_count * bit_packed_element_size

    # If small enough preload the data into our redis cache
    if expected_count < Config.ENTITY_CACHE_THRESHOLD:
        log.info("Caching pickled clk data")
        python_filters = list(python_filters)
        cache.set_deserialized_filter(dp_id, python_filters)
    else:
        log.info("Not caching clk data as it is too large")

    packed_filters = binary_pack_filters(python_filters, uploaded_encoding_size)
    packed_filter_stream = iterable_to_stream(packed_filters)

    # Upload to object store
    log.info(f"Uploading {expected_count} encodings of size {uploaded_encoding_size} "
             f"to object store. Total Size: {fmt_bytes(num_bytes)}")
    mc.put_object(Config.MINIO_BUCKET, filename,
                  data=packed_filter_stream, length=num_bytes)

    with DBConn() as conn:
        update_encoding_metadata(conn, filename, dp_id, 'ready')

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id, check_data_ready=True):
        log.info("All parties' data present. Scheduling any queued runs")
        check_for_executable_runs.delay(
            project_id, handle_raw_upload.get_serialized_span())
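
# `binary_format` and `binary_pack_filters` are defined elsewhere in this
# codebase. The sketch below uses a hypothetical record layout (just the raw
# encoding bytes, fixed size per record) to make the fixed-width packing idea
# concrete; the real format may carry additional fields.
import struct

def _example_binary_format(encoding_size):
    # One fixed-size field of `encoding_size` raw bytes per record.
    return struct.Struct(f'{encoding_size}s')

def _example_binary_pack_filters(filters, encoding_size):
    packer = _example_binary_format(encoding_size)
    for ba, _index, _popcount in filters:
        # bitarray -> bytes; shorter payloads are null-padded by struct.
        yield packer.pack(ba.tobytes())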
def compute_filter_similarity(package, project_id, run_id, threshold,
                              encoding_size, parent_span=None):
    """Compute filter similarity between a chunk of filters in dataprovider 1,
    and a chunk of filters in dataprovider 2.

    :param package: A list of (chunk_info_dp1, chunk_info_dp2) pairs; each
        chunk info dict comes from ``anonlink.concurrency.split_to_chunks``.
    :param project_id:
    :param run_id:
    :param threshold:
    :param encoding_size: The size in bytes of each encoded entry
    :param parent_span: A serialized opentracing span context.
    :returns: A 3-tuple: (num_results, result size in bytes,
        results_filename_in_object_store)
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    task_span = compute_filter_similarity.span

    def new_child_span(name, parent_scope=None):
        if parent_scope is None:
            parent_scope = compute_filter_similarity
        return compute_filter_similarity.tracer.start_active_span(
            name, child_of=parent_scope.span)

    log.debug(f"Computing similarities for {len(package)} chunks of filters")
    log.debug("Checking that the resource exists (in case of the run being canceled/deleted)")
    assert_valid_run(project_id, run_id, log)

    def reindex_using_encoding_ids(recordarray, encoding_id_list):
        # Map results from "index in chunk" to encoding id.
        return array.array('I', [encoding_id_list[i] for i in recordarray])

    num_results = 0
    num_comparisons = 0
    sim_results = []

    with DBConn() as conn:
        if len(package) > 1:  # multiple full blocks in one package
            with new_child_span(f'fetching-encodings of package of size {len(package)}'):
                package = get_encoding_chunks(conn, package,
                                              encoding_size=encoding_size)
        else:  # this chunk is all part of one block
            with new_child_span('fetching-encodings of package with 1 chunk'):
                chunk_info_dp1, chunk_info_dp2 = package[0]
                chunk_with_ids_dp1, chunk_dp1_size = get_encoding_chunk(
                    conn, chunk_info_dp1, encoding_size)
                entity_ids_dp1, chunk_dp1 = zip(*chunk_with_ids_dp1)
                chunk_info_dp1['encodings'] = chunk_dp1
                chunk_info_dp1['entity_ids'] = entity_ids_dp1
                chunk_with_ids_dp2, chunk_dp2_size = get_encoding_chunk(
                    conn, chunk_info_dp2, encoding_size)
                entity_ids_dp2, chunk_dp2 = zip(*chunk_with_ids_dp2)
                chunk_info_dp2['encodings'] = chunk_dp2
                chunk_info_dp2['entity_ids'] = entity_ids_dp2
    log.debug('All encodings for package are fetched and deserialized')

    log.debug("Calculating filter similarities for work package")
    with new_child_span('comparing-encodings') as parent_scope:
        for chunk_dp1, chunk_dp2 in package:
            enc_dp1 = chunk_dp1['encodings']
            enc_dp1_size = len(enc_dp1)
            enc_dp2 = chunk_dp2['encodings']
            enc_dp2_size = len(enc_dp2)
            assert enc_dp1_size > 0, "Zero sized chunk in dp1"
            assert enc_dp2_size > 0, "Zero sized chunk in dp2"
            try:
                sims, (rec_is0, rec_is1) \
                    = anonlink.similarities.dice_coefficient_accelerated(
                        datasets=(enc_dp1, enc_dp2),
                        threshold=threshold,
                        k=min(enc_dp1_size, enc_dp2_size))
            except NotImplementedError as e:
                log.warning(f"Encodings couldn't be compared using anonlink. {e}")
                return
            rec_is0 = reindex_using_encoding_ids(rec_is0, chunk_dp1['entity_ids'])
            rec_is1 = reindex_using_encoding_ids(rec_is1, chunk_dp2['entity_ids'])
            num_results += len(sims)
            num_comparisons += enc_dp1_size * enc_dp2_size
            sim_results.append((sims, (rec_is0, rec_is1),
                                chunk_dp1['datasetIndex'],
                                chunk_dp2['datasetIndex']))

    log.debug(f'Comparison is done. {num_comparisons} comparisons got '
              f'{num_results} pairs above the threshold')

    # Progress reporting
    log.debug('Encoding similarities calculated')

    with new_child_span('update-comparison-progress') as scope:
        # Update the number of comparisons completed
        save_current_progress(num_comparisons, run_id)
        scope.span.log_kv({'comparisons': num_comparisons,
                           'num_similar': num_results})
        log.debug("Comparisons: {}, Links above threshold: {}".format(
            num_comparisons, num_results))

    # Serialize results and store them in minio
    with new_child_span('save-comparison-results-to-minio'):
        file_iters = []
        file_sizes = []
        for sims, (rec_is0, rec_is1), dp1_ds_idx, dp2_ds_idx in sim_results:
            num_sims = len(sims)
            if num_sims:
                # Make index arrays for serialization
                index_1 = array.array('I', (dp1_ds_idx,)) * num_sims
                index_2 = array.array('I', (dp2_ds_idx,)) * num_sims
                chunk_results = sims, (index_1, index_2), (rec_is0, rec_is1)
                bytes_iter, file_size \
                    = anonlink.serialization.dump_candidate_pairs_iter(chunk_results)
                file_iters.append(iterable_to_stream(bytes_iter))
                file_sizes.append(file_size)

        if len(file_iters) > 1:
            # Merge the per-chunk streams into one ordered stream first.
            merged_file_iter, merged_file_size \
                = anonlink.serialization.merge_streams_iter(file_iters,
                                                            sizes=file_sizes)
            merged_file_iter = iterable_to_stream(merged_file_iter)
        elif len(file_iters) == 1:
            merged_file_iter = file_iters[0]
            merged_file_size = file_sizes[0]
        else:
            return 0, None, None

        result_filename = Config.SIMILARITY_SCORES_FILENAME_FMT.format(
            generate_code(12))
        task_span.log_kv({"edges": num_results})
        log.info("Writing {} intermediate results to file: {}".format(
            num_results, result_filename))

        mc = connect_to_object_store()
        try:
            mc.put_object(Config.MINIO_BUCKET, result_filename,
                          merged_file_iter, merged_file_size)
        except minio.ResponseError as err:
            log.warning("Failed to store result in minio: {}".format(err))
            raise

    return num_results, merged_file_size, result_filename
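
# For reference, the candidate pairs structure serialized by these tasks is a
# 3-tuple of parallel sequences: similarity scores, dataset index pairs, and
# record/entity index pairs. The values below are made up, purely to show the
# shape that dump_candidate_pairs_iter and merge_streams_iter operate on.
_example_candidate_pairs = (
    array.array('d', [0.91, 0.87]),                            # similarity scores
    (array.array('I', [0, 0]), array.array('I', [1, 1])),      # dataset indices
    (array.array('I', [12, 30]), array.array('I', [45, 7])),   # record/entity ids
)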
def compute_filter_similarity(chunk_info, project_id, run_id, threshold,
                              encoding_size, parent_span=None):
    """Compute filter similarity between a chunk of filters in dataprovider 1,
    and a chunk of filters in dataprovider 2.

    :param chunk_info: Chunk info returned by ``anonlink.concurrency.split_to_chunks``.
        Additionally, "storeFilename" is added to each dataset chunk.
    :param project_id:
    :param run_id:
    :param threshold:
    :param encoding_size: The size in bytes of each encoded entry
    :param parent_span: A serialized opentracing span context.
    :returns: A 3-tuple: (num_results, result size in bytes,
        results_filename_in_object_store)
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Computing similarity for a chunk of filters")
    span = compute_filter_similarity.span

    log.debug("Checking that the resource exists (in case of the job being canceled)")
    with DBConn() as db:
        if not check_project_exists(db, project_id) or not check_run_exists(
                db, project_id, run_id):
            log.info("Failing task as project or run not found in database.")
            raise DBResourceMissing("project or run not found in database")

    chunk_info_dp1, chunk_info_dp2 = chunk_info

    t0 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 1")
    chunk_dp1, chunk_dp1_size = get_chunk_from_object_store(
        chunk_info_dp1, encoding_size)

    t1 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 2")
    chunk_dp2, chunk_dp2_size = get_chunk_from_object_store(
        chunk_info_dp2, encoding_size)
    t2 = time.time()
    span.log_kv({'event': 'chunks are fetched and deserialized'})

    log.debug("Calculating filter similarity")
    span.log_kv({'size1': chunk_dp1_size, 'size2': chunk_dp2_size})
    chunk_results = anonlink.concurrency.process_chunk(
        chunk_info,
        (chunk_dp1, chunk_dp2),
        anonlink.similarities.dice_coefficient_accelerated,
        threshold,
        k=min(chunk_dp1_size, chunk_dp2_size))
    t3 = time.time()
    span.log_kv({'event': 'similarities calculated'})

    # Update the number of comparisons completed
    comparisons_computed = chunk_dp1_size * chunk_dp2_size
    save_current_progress(comparisons_computed, run_id)
    t4 = time.time()

    sims, _, _ = chunk_results
    num_results = len(sims)

    if num_results:
        result_filename = Config.SIMILARITY_SCORES_FILENAME_FMT.format(
            generate_code(12))
        log.info("Writing {} intermediate results to file: {}".format(
            num_results, result_filename))

        bytes_iter, file_size \
            = anonlink.serialization.dump_candidate_pairs_iter(chunk_results)
        iter_stream = iterable_to_stream(bytes_iter)

        mc = connect_to_object_store()
        try:
            mc.put_object(Config.MINIO_BUCKET, result_filename,
                          iter_stream, file_size)
        except minio.ResponseError:
            log.warning("Failed to store result in minio")
            raise
    else:
        result_filename = None
        file_size = None
    t5 = time.time()

    log.info("run={} Comparisons: {}, Links above threshold: {}".format(
        run_id, comparisons_computed, num_results))
    log.info("Prep: {:.3f} + {:.3f}, Solve: {:.3f}, Progress: {:.3f}, Save: {:.3f}, Total: {:.3f}"
             .format(t1 - t0, t2 - t1, t3 - t2, t4 - t3, t5 - t4, t5 - t0))
    return num_results, file_size, result_filename
def compute_filter_similarity(chunk_info, project_id, run_id, threshold,
                              encoding_size, parent_span=None):
    """Compute filter similarity between a chunk of filters in dataprovider 1,
    and a chunk of filters in dataprovider 2.

    :param dict chunk_info:
        A chunk returned by ``anonlink.concurrency.split_to_chunks``.
    :param project_id:
    :param run_id:
    :param threshold:
    :param encoding_size: The size in bytes of each encoded entry
    :param parent_span: A serialized opentracing span context.
    :returns: A 3-tuple: (num_results, result size in bytes,
        results_filename_in_object_store)
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    task_span = compute_filter_similarity.span

    def new_child_span(name, parent_scope=None):
        if parent_scope is None:
            parent_scope = compute_filter_similarity
        return compute_filter_similarity.tracer.start_active_span(
            name, child_of=parent_scope.span)

    log.debug("Computing similarity for a chunk of filters")
    log.debug("Checking that the resource exists (in case of the run being canceled/deleted)")
    assert_valid_run(project_id, run_id, log)

    chunk_info_dp1, chunk_info_dp2 = chunk_info
    with DBConn() as conn:
        with new_child_span('fetching-encodings') as parent_scope:
            with new_child_span('fetching-left-encodings', parent_scope):
                log.debug("Fetching and deserializing chunk of filters for dataprovider 1")
                chunk_with_ids_dp1, chunk_dp1_size = get_encoding_chunk(
                    conn, chunk_info_dp1, encoding_size)
                entity_ids_dp1, chunk_dp1 = zip(*chunk_with_ids_dp1)
            with new_child_span('fetching-right-encodings', parent_scope):
                log.debug("Fetching and deserializing chunk of filters for dataprovider 2")
                chunk_with_ids_dp2, chunk_dp2_size = get_encoding_chunk(
                    conn, chunk_info_dp2, encoding_size)
                entity_ids_dp2, chunk_dp2 = zip(*chunk_with_ids_dp2)
    log.debug('Both chunks are fetched and deserialized')
    task_span.log_kv({'size1': chunk_dp1_size,
                      'size2': chunk_dp2_size,
                      'chunk_info': chunk_info})

    assert chunk_dp1_size > 0, "Zero sized chunk in dp1"
    assert chunk_dp2_size > 0, "Zero sized chunk in dp2"

    with new_child_span('comparing-encodings') as parent_scope:
        log.debug("Calculating filter similarity")
        with new_child_span('dice-call', parent_scope):
            try:
                sims, (rec_is0, rec_is1) \
                    = anonlink.similarities.dice_coefficient_accelerated(
                        datasets=(chunk_dp1, chunk_dp2),
                        threshold=threshold,
                        k=min(chunk_dp1_size, chunk_dp2_size))
            except NotImplementedError as e:
                log.warning(f"Encodings couldn't be compared using anonlink. {e}")
                return

        with new_child_span('reindex-call', parent_scope):
            def reindex_using_encoding_ids(recordarray, encoding_id_list):
                # Map results from "index in chunk" to encoding id.
                return array.array('I', [encoding_id_list[i] for i in recordarray])

            rec_is0 = reindex_using_encoding_ids(rec_is0, entity_ids_dp1)
            rec_is1 = reindex_using_encoding_ids(rec_is1, entity_ids_dp2)

    log.debug('Encoding similarities calculated')

    with new_child_span('update-comparison-progress') as scope:
        # Update the number of comparisons completed
        comparisons_computed = chunk_dp1_size * chunk_dp2_size
        save_current_progress(comparisons_computed, run_id)
        scope.span.log_kv({'comparisons': comparisons_computed,
                           'num_similar': len(sims)})
        log.debug("Comparisons: {}, Links above threshold: {}".format(
            comparisons_computed, len(sims)))

    with new_child_span('save-comparison-results-to-minio'):
        num_results = len(sims)
        if num_results:
            result_filename = Config.SIMILARITY_SCORES_FILENAME_FMT.format(
                generate_code(12))
            task_span.log_kv({"edges": num_results})
            log.info("Writing {} intermediate results to file: {}".format(
                num_results, result_filename))

            # Make index arrays for serialization
            index_1 = array.array('I', (chunk_info_dp1["datasetIndex"],)) * num_results
            index_2 = array.array('I', (chunk_info_dp2["datasetIndex"],)) * num_results

            chunk_results = sims, (index_1, index_2), (rec_is0, rec_is1)

            bytes_iter, file_size \
                = anonlink.serialization.dump_candidate_pairs_iter(chunk_results)
            iter_stream = iterable_to_stream(bytes_iter)

            mc = connect_to_object_store()
            try:
                mc.put_object(Config.MINIO_BUCKET, result_filename,
                              iter_stream, file_size)
            except minio.ResponseError:
                log.warning("Failed to store result in minio")
                raise
        else:
            result_filename = None
            file_size = None

    return num_results, file_size, result_filename
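
# Small worked example of the reindexing step above (values are made up):
# anonlink returns record indices relative to the chunk, and
# reindex_using_encoding_ids maps them back to the entity/encoding ids that
# were fetched alongside the chunk.
#
#   entity_ids = (103, 57, 220)            # ids of the chunk's encodings, in order
#   rec_is     = array.array('I', [2, 0])  # anonlink's "index in chunk" output
#   result     = array.array('I', [220, 103])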