def get_similarity_scores(filename):
    """
    Read a file from the object store containing the candidate pair similarity scores
    and return a response that will stream the similarity scores.

    :param filename: name of the scores file, obtained from the `similarity_scores` table
    :return: the similarity scores in a streaming JSON response.
    """
    mc = connect_to_object_store()
    details = mc.stat_object(config.MINIO_BUCKET, filename)
    logger.info("Starting download stream of similarity scores.",
                filename=filename, filesize=details.size)

    try:
        sims_data_stream = mc.get_object(config.MINIO_BUCKET, filename)
        # TODO: Below is an Anonlink 'private' API. It should be made public.
        sims_iter, *_ = anonlink.serialization._load_to_iterable(sims_data_stream)

        return Response(generate_scores(sims_iter), mimetype='application/json')

    except urllib3.exceptions.ResponseError:
        logger.warning("Attempt to read the similarity scores file failed with an error response.",
                       filename=filename)
        safe_fail_request(500, "Failed to retrieve similarity scores")
def get_similarity_scores(filename):
    """
    Read a file from the object store containing the candidate pair similarity scores
    and return a response that will stream the similarity scores.

    :param filename: name of the scores file, obtained from the `similarity_scores` table
    :return: the similarity scores in a streaming JSON response.
    """
    mc = connect_to_object_store()
    details = mc.stat_object(config.MINIO_BUCKET, filename)
    logger.info("Starting download stream of similarity scores.",
                filename=filename, filesize=details.size)

    try:
        candidate_pair_binary_stream = mc.get_object(config.MINIO_BUCKET, filename)

        return Response(generate_scores(candidate_pair_binary_stream),
                        mimetype='application/json')

    except urllib3.exceptions.ResponseError:
        logger.warning("Attempt to read the similarity scores file failed with an error response.",
                       filename=filename)
        safe_fail_request(500, "Failed to retrieve similarity scores")
def get_deserialized_filter(dp_id):
    """Cached, deserialized version.
    """
    logger.debug("Getting filters")
    key = 'clk-pkl-{}'.format(dp_id)
    r = connect_to_redis(read_only=True)

    # Check if this dp_id is already saved in redis?
    if r.exists(key):
        logger.debug("returning filters from cache")
        return pickle.loads(r.get(key))
    else:
        logger.debug("Looking up popcounts and filename from database")
        with DBConn() as db:
            serialized_filters_file, encoding_size = get_filter_metadata(db, dp_id)
        mc = connect_to_object_store()
        logger.debug("Getting filters from object store")

        # Note this uses already calculated popcounts unlike
        # serialization.deserialize_filters()
        raw_data_response = mc.get_object(config.MINIO_BUCKET, serialized_filters_file)
        python_filters = binary_unpack_filters(raw_data_response.stream(encoding_size))

        set_deserialized_filter(dp_id, python_filters)
        return python_filters
def delete_minio_objects(filenames, project_id):
    log = logger.bind(pid=project_id)
    mc = connect_to_object_store()
    log.info(f"Deleting {len(filenames)} files from object store")
    try:
        # remove_objects returns a lazy iterator of deletion errors; it must be
        # consumed for the deletions to actually be carried out.
        for del_err in mc.remove_objects(Config.MINIO_BUCKET, filenames):
            log.debug("Deletion error: {}".format(del_err))
    except MinioError as e:
        log.warning(f"Error occurred while removing object {filenames}. Ignoring.")
def upload_clk_data_binary(project_id, dp_id, raw_stream, count, size=128):
    """
    Save the user provided raw CLK data.
    """
    receipt_token = generate_code()
    filename = Config.BIN_FILENAME_FMT.format(receipt_token)
    # Set the state to 'pending' in the bloomingdata table
    with DBConn() as conn:
        db.insert_encoding_metadata(conn, filename, dp_id, receipt_token, count)
        db.update_encoding_metadata_set_encoding_size(conn, dp_id, size)

    logger.info(f"Storing supplied binary clks of individual size {size} in file: {filename}")

    num_bytes = count * (size + 6)

    logger.debug("Directly storing binary file with index, base64 encoded CLK, popcount")

    # Upload to object store
    logger.info(f"Uploading {count} binary encodings to object store. "
                f"Total size: {fmt_bytes(num_bytes)}")
    parent_span = g.flask_tracer.get_span()

    with opentracing.tracer.start_span('save-to-minio', child_of=parent_span) as span:
        mc = connect_to_object_store()
        try:
            mc.put_object(Config.MINIO_BUCKET, filename, data=raw_stream, length=num_bytes)
        except (minio.error.InvalidSizeError,
                minio.error.InvalidArgumentError,
                minio.error.ResponseError):
            logger.info("Mismatch between expected stream length and header info")
            raise ValueError("Mismatch between expected stream length and header info")

    with opentracing.tracer.start_span('update-database', child_of=parent_span) as span:
        with DBConn() as conn:
            db.update_encoding_metadata(conn, filename, dp_id, 'ready')
            db.set_dataprovider_upload_state(conn, dp_id, True)

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id):
        logger.info("All parties data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id, serialize_span(parent_span))

    return receipt_token
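# Illustration (not from the service code): the per-record overhead of 6 bytes implied by
# ``num_bytes = count * (size + 6)`` above is consistent with a packed record of a 4-byte
# unsigned index, the raw encoding bytes, and a 2-byte popcount. A sketch of a
# ``binary_format``-style helper under that assumed layout:
import struct

def assumed_binary_format(encoding_size: int) -> struct.Struct:
    # 4-byte index + encoding bytes + 2-byte popcount => element size = encoding_size + 6
    return struct.Struct(f'!I{encoding_size}sH')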
def get_chunk_from_object_store(chunk_info, encoding_size=128):
    mc = connect_to_object_store()
    bit_packed_element_size = binary_format(encoding_size).size
    chunk_length = chunk_info[2] - chunk_info[1]
    chunk_bytes = bit_packed_element_size * chunk_length
    chunk_stream = mc.get_partial_object(
        config.MINIO_BUCKET,
        chunk_info[0],
        bit_packed_element_size * chunk_info[1],
        chunk_bytes)

    chunk_data = binary_unpack_filters(chunk_stream, chunk_bytes, encoding_size)

    return chunk_data, chunk_length
def handle_raw_upload(project_id, dp_id, receipt_token, parent_span=None):
    """
    User has uploaded base64 encodings as JSON, this task needs to copy the data into
    our internal binary format.
    """
    log = logger.bind(pid=project_id, dp_id=dp_id)
    log.info("Handling user provided base64 encodings")
    new_child_span = lambda name: handle_raw_upload.tracer.start_active_span(
        name, child_of=handle_raw_upload.span)

    with DBConn() as db:
        if not check_project_exists(db, project_id):
            log.info("Project deleted, stopping immediately")
            return
        # Get number of blocks + total number of encodings from database
        expected_count, block_count = get_encoding_metadata(db, dp_id)

    log.info(f"Expecting to handle {expected_count} encodings in {block_count} blocks")
    mc = connect_to_object_store()
    input_filename = Config.RAW_FILENAME_FMT.format(receipt_token)
    raw_data = mc.get_object(Config.MINIO_BUCKET, input_filename)

    with new_child_span('upload-encodings-to-db'):
        # stream encodings with block ids from uploaded file
        # convert each encoding to our internal binary format
        # output into database for each block (temp or direct to minio?)
        encoding_size, pipeline = convert_encodings_from_base64_to_binary(
            stream_json_clksnblocks(raw_data))
        log.info(f"Starting pipeline to store {encoding_size}B sized encodings in database")
        with DBConn() as db:
            store_encodings_in_db(db, dp_id, pipeline, encoding_size)

    log.info(f"Converted uploaded encodings of size {fmt_bytes(encoding_size)} into internal binary format. "
             f"Number of blocks: {block_count}")

    # As this is the first time we've seen the encoding size actually uploaded from this
    # data provider, we check it complies with the project encoding size.
    try:
        check_dataproviders_encoding(project_id, encoding_size)
    except InvalidEncodingError as e:
        log.warning(e.args[0])
        handle_invalid_encoding_data(project_id, dp_id)

    with DBConn() as conn:
        with new_child_span('save-encoding-metadata'):
            # Save the encoding size as metadata for this data provider
            update_encoding_metadata_set_encoding_size(conn, dp_id, encoding_size)
            update_encoding_metadata(conn, None, dp_id, 'ready')

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id, check_data_ready=True):
        log.info("All parties' data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id, handle_raw_upload.get_serialized_span())
def get_chunk_from_object_store(chunk_info, encoding_size=128):
    mc = connect_to_object_store()
    bit_packed_element_size = binary_format(encoding_size).size
    chunk_range_start, chunk_range_stop = chunk_info['range']
    chunk_length = chunk_range_stop - chunk_range_start
    chunk_bytes = bit_packed_element_size * chunk_length
    chunk_stream = mc.get_partial_object(
        config.MINIO_BUCKET,
        chunk_info['storeFilename'],
        bit_packed_element_size * chunk_range_start,
        chunk_bytes)

    chunk_data = binary_unpack_filters(
        chunk_stream.stream(bit_packed_element_size), chunk_bytes, encoding_size)

    return chunk_data, chunk_length
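# Illustration (not from the service code): the ranged read above maps record indices to
# byte offsets. With hypothetical 128-byte encodings packed as 134-byte records
# (encoding size + 6 bytes of overhead), a chunk covering records [1000, 1500) becomes a
# single partial object read of 500 * 134 bytes starting at byte 1000 * 134.
chunk_info_example = {'storeFilename': 'example-binary-object', 'range': (1000, 1500)}
element_size = 128 + 6
offset = element_size * chunk_info_example['range'][0]                                    # 134000
length = element_size * (chunk_info_example['range'][1] - chunk_info_example['range'][0])  # 67000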
def delete_minio_objects(filenames, project_id, parent_span=None):
    log = logger.bind(pid=project_id)
    mc = connect_to_object_store()
    log.info(f"Deleting {len(filenames)} files from object store")
    try:
        for del_err in mc.remove_objects(Config.MINIO_BUCKET, filenames):
            log.debug("Deletion error: {}".format(del_err))
    except MinioError as e:
        log.warning(f"Error occurred while removing object {filenames}. Ignoring.")

    if Config.UPLOAD_OBJECT_STORE_ENABLED:
        log.debug("Deleting everything uploaded to object store for project")
        delete_object_store_folder(mc, Config.UPLOAD_OBJECT_STORE_BUCKET, f"{project_id}/")
def aggregate_comparisons(similarity_result_files, project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    if similarity_result_files is None:
        return
    mc = connect_to_object_store()
    files = []
    data_size = 0

    for num, filename in similarity_result_files:
        if num > 0:
            files.append(filename)
            data_size += mc.stat_object(Config.MINIO_BUCKET, filename).size

    log.debug("Aggregating result chunks from {} files, total size: {}".format(
        len(files), fmt_bytes(data_size)))

    result_file_stream_generator = (mc.get_object(Config.MINIO_BUCKET, result_filename)
                                    for result_filename in files)

    log.info("Similarity score results are {}".format(fmt_bytes(data_size)))
    result_stream = chain_streams(result_file_stream_generator)

    with DBConn() as db:
        result_type = get_project_column(db, project_id, 'result_type')

        # Note: Storing the similarity scores for all result types
        result_filename = store_similarity_scores(result_stream, run_id, data_size, db)

        if result_type == "similarity_scores":
            # Post similarity computation cleanup
            dp_ids = get_dataprovider_ids(db, project_id)
        else:
            # we promote the run to the next stage
            progress_stage(db, run_id)
            lenf1, lenf2 = get_project_dataset_sizes(db, project_id)

    # DB now committed, we can fire off tasks that depend on the new db state
    if result_type == "similarity_scores":
        log.info("Deleting intermediate similarity score files from object store")
        mc.remove_objects(Config.MINIO_BUCKET, files)
        log.debug("Removing clk filters from redis cache")
        remove_from_cache(dp_ids[0])
        remove_from_cache(dp_ids[1])

        # Complete the run
        log.info("Marking run as complete")
        mark_run_complete.delay(run_id, aggregate_comparisons.get_serialized_span())
    else:
        solver_task.delay(result_filename, project_id, run_id, lenf1, lenf2,
                          aggregate_comparisons.get_serialized_span())
def upload_json_clk_data(dp_id, clk_json, parent_span):
    """
    Convert user provided encodings from json array of base64 data into a newline
    separated file of base64 data.

    Note this implementation is non-streaming.
    """
    if 'clks' not in clk_json or len(clk_json['clks']) < 1:
        safe_fail_request(400, message="Missing CLKs information")

    receipt_token = generate_code()

    filename = Config.RAW_FILENAME_FMT.format(receipt_token)
    logger.info("Storing user {} supplied clks from json".format(dp_id))

    with opentracing.tracer.start_span('clk-splitting', child_of=parent_span) as span:
        count = len(clk_json['clks'])
        span.set_tag("clks", count)
        data = b''.join(''.join(clk.split('\n')).encode() + b'\n' for clk in clk_json['clks'])

        num_bytes = len(data)
        span.set_tag("num_bytes", num_bytes)
        buffer = BytesIO(data)

    logger.info(f"Received {count} encodings. Uploading {fmt_bytes(num_bytes)} to object store")
    with opentracing.tracer.start_span('save-to-quarantine', child_of=parent_span) as span:
        span.set_tag('filename', filename)
        mc = connect_to_object_store()
        mc.put_object(Config.MINIO_BUCKET, filename, data=buffer, length=num_bytes)

    with opentracing.tracer.start_span('update-db', child_of=parent_span) as span:
        with DBConn() as conn:
            db.insert_encoding_metadata(conn, filename, dp_id, receipt_token, count)

    return receipt_token, filename
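# Illustration (not from the service code): input and resulting object body for the
# conversion above, with made-up base64 values. Each CLK is written on its own line.
clk_json_example = {'clks': ['UG9vcA==', 'QmFyIQ==']}
# stored object body: b'UG9vcA==\nQmFyIQ==\n'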
def test_temp_credentials_minio(self):
    upload_endpoint = Config.UPLOAD_OBJECT_STORE_SERVER
    bucket_name = "uploads"

    root_mc_client = connect_to_object_store()
    upload_restricted_minio_client = connect_to_upload_object_store()
    if not root_mc_client.bucket_exists(bucket_name):
        root_mc_client.make_bucket(bucket_name)

    with pytest.raises(minio.error.AccessDenied):
        upload_restricted_minio_client.list_buckets()

    # Should be able to put an object though
    upload_restricted_minio_client.put_object(bucket_name, 'testobject',
                                              io.BytesIO(b'data'), length=4)

    credentials_provider = AssumeRoleProvider(upload_restricted_minio_client,
                                              Policy=restricted_upload_policy)
    temp_creds = Credentials(provider=credentials_provider)

    newly_restricted_mc_client = Minio(upload_endpoint,
                                       credentials=temp_creds,
                                       region='us-east-1',
                                       secure=False)

    with pytest.raises(minio.error.AccessDenied):
        newly_restricted_mc_client.list_buckets()

    # Note this put object worked with the earlier credentials
    # but should fail if we have applied the more restrictive policy
    with pytest.raises(minio.error.AccessDenied):
        newly_restricted_mc_client.put_object(bucket_name, 'testobject2',
                                              io.BytesIO(b'data'), length=4)

    # this path is allowed in the policy however
    newly_restricted_mc_client.put_object(bucket_name, '2020/testobject',
                                          io.BytesIO(b'data'), length=4)
def solver_task(similarity_scores_filename, project_id, run_id, dataset_sizes, parent_span):
    log = logger.bind(pid=project_id, run_id=run_id)
    mc = connect_to_object_store()
    solver_task.span.log_kv({
        'datasetSizes': dataset_sizes,
        'filename': similarity_scores_filename
    })
    score_file = mc.get_object(config.MINIO_BUCKET, similarity_scores_filename)
    log.debug("Creating python sparse matrix from bytes data")
    candidate_pairs = anonlink.serialization.load_candidate_pairs(score_file)
    log.info("Calculating the optimal mapping from similarity matrix")

    groups = anonlink.solving.greedy_solve(candidate_pairs)

    log.info("Entity groups have been computed")

    res = {
        "groups": groups,
        "datasetSizes": dataset_sizes
    }
    save_and_permute.delay(res, project_id, run_id, solver_task.get_serialized_span())
def solver_task(similarity_scores_filename, project_id, run_id, dataset_sizes, parent_span):
    log = logger.bind(pid=project_id, run_id=run_id)
    mc = connect_to_object_store()
    solver_task.span.log_kv({
        'datasetSizes': dataset_sizes,
        'filename': similarity_scores_filename
    })
    score_file = mc.get_object(config.MINIO_BUCKET, similarity_scores_filename)
    log.debug("Creating python sparse matrix from bytes data")
    candidate_pairs_with_duplicates = anonlink.serialization.load_candidate_pairs(score_file)
    similarity_scores, (dset_is0, dset_is1), (rec_is0, rec_is1) = candidate_pairs_with_duplicates

    log.info(f"Number of candidate pairs before deduplication: {len(candidate_pairs_with_duplicates[0])}")

    if len(candidate_pairs_with_duplicates[0]) > 0:
        # TODO use public interface when available
        # https://github.com/data61/anonlink/issues/271
        candidate_pairs = _merge_similarities(
            [zip(similarity_scores, dset_is0, dset_is1, rec_is0, rec_is1)], k=None)

        log.info(f"Number of candidate pairs after deduplication: {len(candidate_pairs[0])}")

        log.info("Calculating the optimal mapping from similarity matrix")
        groups = anonlink.solving.greedy_solve(candidate_pairs)
    else:
        groups = []

    log.info("Entity groups have been computed")

    res = {
        "groups": groups,
        "datasetSizes": dataset_sizes
    }
    save_and_permute.delay(res, project_id, run_id, solver_task.get_serialized_span())
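# Illustration (not from the service code): the candidate pairs object destructured above
# is a 3-tuple of parallel sequences - similarity scores, a pair of dataset index arrays,
# and a pair of record index arrays. Values below are made up.
sims = [0.93, 0.87]
dataset_indices = ([0, 0], [1, 1])
record_indices = ([12, 40], [7, 3])   # e.g. record 12 of dataset 0 pairs with record 7 of dataset 1
candidate_pairs_example = (sims, dataset_indices, record_indices)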
def solver_task(similarity_scores_filename, project_id, run_id, lenf1, lenf2, parent_span):
    log = logger.bind(pid=project_id, run_id=run_id)
    mc = connect_to_object_store()
    solver_task.span.log_kv({
        'lenf1': lenf1,
        'lenf2': lenf2,
        'filename': similarity_scores_filename
    })
    score_file = mc.get_object(config.MINIO_BUCKET, similarity_scores_filename)
    log.debug("Creating python sparse matrix from bytes data")
    sparse_matrix = similarity_matrix_from_csv_bytes(score_file.data)
    log.info("Calculating the optimal mapping from similarity matrix")
    mapping = anonlink.entitymatch.greedy_solver(sparse_matrix)

    log.debug("Converting all indices to strings")
    for key in mapping:
        mapping[key] = str(mapping[key])

    log.info("Entity mapping has been computed")

    res = {
        "mapping": mapping,
        "lenf1": lenf1,
        "lenf2": lenf2
    }
    save_and_permute.delay(res, project_id, run_id, solver_task.get_serialized_span())
def handle_encoding_upload_json(project_id, dp_id, clk_json, receipt_token, uses_blocking, parent_span):
    """
    Take user provided upload information - accepting multiple formats - and eventually
    ingest into the database.

    Encodings uploaded directly in the JSON are first quarantined in the object store,
    and a background task deserializes them.

    Encodings that are in an object store are streamed directly into the database by a
    background task.
    """
    log = logger.bind(pid=project_id)
    log.info("Checking json is consistent")
    try:
        abort_if_inconsistent_upload(uses_blocking, clk_json)
    except ValueError as e:
        safe_fail_request(403, e.args[0])

    if "encodings" in clk_json and 'file' in clk_json['encodings']:
        # external encodings
        log.info("External encodings uploaded")
        encoding_object_info = clk_json['encodings']['file']
        object_name = encoding_object_info['path']
        _check_object_path_allowed(project_id, dp_id, object_name, log)

        encoding_credentials = clk_json['encodings'].get('credentials')
        # Schedule a background task to pull the encodings from the object store.
        # This background task updates the database with encoding metadata assuming
        # that there are no blocks.
        if 'blocks' not in clk_json:
            log.info("scheduling task to pull encodings from object store")
            pull_external_data_encodings_only.delay(
                project_id,
                dp_id,
                encoding_object_info,
                encoding_credentials,
                receipt_token,
                parent_span=serialize_span(parent_span))
        else:
            # Need to deal with both encodings and blocks
            if 'file' in clk_json['blocks']:
                object_name = clk_json['blocks']['file']['path']
                _check_object_path_allowed(project_id, dp_id, object_name, log)
                # Blocks are in an external file
                blocks_object_info = clk_json['blocks']['file']
                blocks_credentials = clk_json['blocks'].get('credentials')
                log.info("scheduling task to pull both encodings and blocking data from object store")
                pull_external_data.delay(
                    project_id,
                    dp_id,
                    encoding_object_info, encoding_credentials,
                    blocks_object_info, blocks_credentials,
                    receipt_token,
                    parent_span=serialize_span(parent_span))
            else:
                raise NotImplementedError(
                    "Don't currently handle combination of external encodings and blocks")

        return

    # Convert uploaded JSON to common schema.
    #
    # The original JSON API simply accepted "clks", then came a combined encoding and
    # blocking API expecting the top level element "clknblocks". Finally an API that
    # specifies both "encodings" and "blocks" independently at the top level.
    #
    # We rewrite all into the "clknblocks" format.
    if "encodings" in clk_json:
        logger.debug("converting from 'encodings' & 'blocks' format to 'clknblocks'")
        clk_json = convert_encoding_upload_to_clknblock(clk_json)

    is_valid_clks = not uses_blocking and 'clks' in clk_json
    element = 'clks' if is_valid_clks else 'clknblocks'

    if len(clk_json[element]) < 1:
        safe_fail_request(400, message="Missing CLKs information")

    filename = Config.RAW_FILENAME_FMT.format(receipt_token)
    logger.info("Storing user {} supplied {} from json".format(dp_id, element))

    with opentracing.tracer.start_span('splitting-json-clks', child_of=parent_span) as span:
        encoding_count = len(clk_json[element])
        span.set_tag(element, encoding_count)
        logger.debug(f"Received {encoding_count} {element}")

    if element == 'clks':
        logger.info("Rewriting provided json into clknblocks format")
        clk_json = convert_clks_to_clknblocks(clk_json)
        element = 'clknblocks'

    logger.info("Counting block sizes and number of blocks")
    # e.g. {'clknblocks': [['UG9vcA==', '001', '211'], [...], ...]}
    block_sizes = {}
    for _, *elements_blocks in clk_json[element]:
        for el_block in elements_blocks:
            block_sizes[el_block] = block_sizes.setdefault(el_block, 0) + 1
    block_count = len(block_sizes)

    logger.info(f"Received {encoding_count} encodings in {block_count} blocks")
    for block in block_sizes:
        logger.info(f"Block {block} has {block_sizes[block]} elements")

    # write clk_json into a temp file
    tmp = tempfile.NamedTemporaryFile(mode='w')
    json.dump(clk_json, tmp)
    tmp.flush()
    with opentracing.tracer.start_span('save-clk-file-to-quarantine', child_of=parent_span) as span:
        span.set_tag('filename', filename)
        mc = connect_to_object_store()
        mc.fput_object(Config.MINIO_BUCKET, filename, tmp.name, content_type='application/json')
    logger.info('Saved uploaded {} JSON to file {} in object store.'.format(element.upper(), filename))

    with opentracing.tracer.start_span('update-encoding-metadata', child_of=parent_span):
        with DBConn() as conn:
            db.insert_encoding_metadata(conn, filename, dp_id, receipt_token, encoding_count, block_count)
            db.insert_blocking_metadata(conn, dp_id, block_sizes)

    # Schedule a task to deserialize the encodings
    handle_raw_upload.delay(project_id, dp_id, receipt_token,
                            parent_span=serialize_span(parent_span))
def handle_raw_upload(project_id, dp_id, receipt_token, parent_span=None):
    # User has uploaded base64 encodings as JSON
    log = logger.bind(pid=project_id, dp_id=dp_id)
    log.info("Handling user provided base64 encodings")

    with DBConn() as db:
        if not check_project_exists(db, project_id):
            log.info("Project deleted, stopping immediately")
            return
        expected_count = get_number_of_hashes(db, dp_id)

    log.info(f"Expecting to handle {expected_count} encodings")
    mc = connect_to_object_store()

    # Input file is line separated base64 record encodings.
    raw_file = Config.RAW_FILENAME_FMT.format(receipt_token)
    raw_data_response = mc.get_object(Config.MINIO_BUCKET, raw_file)

    # Set up streaming processing pipeline
    buffered_stream = iterable_to_stream(raw_data_response.stream())
    text_stream = io.TextIOWrapper(buffered_stream, newline='\n')

    clkcounts = []

    def filter_generator():
        log.debug("Deserializing json filters")
        first_encoding_size = None
        for i, line in enumerate(text_stream):
            ba = deserialize_bitarray(line)
            yield (ba, i, ba.count())
            clkcounts.append(ba.count())
            encsize = len(ba)
            if i == 0:
                first_encoding_size = encsize
            if encsize != first_encoding_size:
                raise ValueError("Encodings were not all the same size")
        log.info(f"Processed {len(clkcounts)} hashes")

    # We peek at the first element as we need the encoding size
    # for the rest of our processing pipeline
    python_filters = more_itertools.peekable(filter_generator())
    # Note the len of a bitarray is returned in bits but we require
    # this to be a multiple of 8 so we use bytes.
    uploaded_encoding_size = len(python_filters.peek()[0]) // 8

    # This is the first time we've seen the encoding size from this data provider
    try:
        check_dataproviders_encoding(project_id, uploaded_encoding_size)
    except InvalidEncodingError as e:
        log.warning(e.args[0])
        handle_invalid_encoding_data(project_id, dp_id)

    with DBConn() as db:
        # Save the encoding size as metadata
        update_encoding_metadata_set_encoding_size(db, dp_id, uploaded_encoding_size)

    # Output file is our custom binary packed file
    filename = Config.BIN_FILENAME_FMT.format(receipt_token)
    bit_packed_element_size = binary_format(uploaded_encoding_size).size
    num_bytes = expected_count * bit_packed_element_size

    # If small enough preload the data into our redis cache
    if expected_count < Config.ENTITY_CACHE_THRESHOLD:
        log.info("Caching pickled clk data")
        python_filters = list(python_filters)
        cache.set_deserialized_filter(dp_id, python_filters)
    else:
        log.info("Not caching clk data as it is too large")

    packed_filters = binary_pack_filters(python_filters, uploaded_encoding_size)
    packed_filter_stream = iterable_to_stream(packed_filters)

    # Upload to object store
    log.info(f"Uploading {expected_count} encodings of size {uploaded_encoding_size} " +
             f"to object store. Total Size: {fmt_bytes(num_bytes)}")
    mc.put_object(Config.MINIO_BUCKET, filename, data=packed_filter_stream, length=num_bytes)

    with DBConn() as conn:
        update_encoding_metadata(conn, filename, dp_id, 'ready')

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id, check_data_ready=True):
        log.info("All parties' data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id, handle_raw_upload.get_serialized_span())
def aggregate_comparisons(similarity_result_files, project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    if similarity_result_files is None:
        raise TypeError("Inappropriate argument type - missing results files.")

    files = []
    for res in similarity_result_files:
        if res is None:
            log.warning("Missing results during aggregation. Stopping processing.")
            raise TypeError("Inappropriate argument type - results missing at aggregation step.")
        num, filesize, filename = res
        if num:
            assert filesize is not None
            assert filename is not None
            files.append((num, filesize, filename))
        else:
            assert filesize is None
            assert filename is None
    heapq.heapify(files)

    log.debug(f"Aggregating result chunks from {len(files)} files, "
              f"total size: {sum(map(operator.itemgetter(1), files))}")

    mc = connect_to_object_store()

    while len(files) > 1:
        file0 = heapq.heappop(files)
        file1 = heapq.heappop(files)
        merged_file = _merge_files(mc, log, file0, file1)
        heapq.heappush(files, merged_file)

    if not files:
        # No results. Let's chuck in an empty file.
        empty_file = _put_placeholder_empty_file(mc, log)
        files.append(empty_file)

    (merged_num, merged_filesize, merged_filename), = files
    log.info(f"Similarity score results in {merged_filename} in bucket "
             f"{Config.MINIO_BUCKET} take up {merged_filesize} bytes.")

    with DBConn() as db:
        result_type = get_project_column(db, project_id, 'result_type')
        result_id = insert_similarity_score_file(db, run_id, merged_filename)
        log.debug(f"Saved path to similarity scores file to db with id {result_id}")

        if result_type == "similarity_scores":
            # Post similarity computation cleanup
            dp_ids = get_dataprovider_ids(db, project_id)
        else:
            # we promote the run to the next stage
            progress_stage(db, run_id)
            dataset_sizes = get_project_dataset_sizes(db, project_id)

    # DB now committed, we can fire off tasks that depend on the new db state
    if result_type == "similarity_scores":
        log.debug("Removing clk filters from redis cache")
        for dp_id in dp_ids:
            remove_from_cache(dp_id)

        # Complete the run
        log.info("Marking run as complete")
        mark_run_complete.delay(run_id, aggregate_comparisons.get_serialized_span())
    else:
        solver_task.delay(merged_filename, project_id, run_id, dataset_sizes,
                          aggregate_comparisons.get_serialized_span())
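# Illustration (not from the service code): the heap-based loop above always merges the
# two files with the fewest candidate pairs first (the (num, filesize, filename) tuples
# compare by their first element), which keeps merge work roughly balanced. A toy run
# with plain counts:
import heapq

counts = [40, 10, 25, 5]
heapq.heapify(counts)
while len(counts) > 1:
    a, b = heapq.heappop(counts), heapq.heappop(counts)
    heapq.heappush(counts, a + b)   # merges 5+10, then 15+25, then 40+40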
def compute_filter_similarity(package, project_id, run_id, threshold, encoding_size, parent_span=None):
    """Compute filter similarity between a chunk of filters in dataprovider 1,
    and a chunk of filters in dataprovider 2.

    :param package: A list of chunk pairs as produced by ``anonlink.concurrency.split_to_chunks``.
    :param project_id:
    :param run_id:
    :param threshold:
    :param encoding_size: The size in bytes of each encoded entry
    :param parent_span: A serialized opentracing span context.
    :returns A 3-tuple: (num_results, result size in bytes, results_filename_in_object_store)
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    task_span = compute_filter_similarity.span

    def new_child_span(name, parent_scope=None):
        if parent_scope is None:
            parent_scope = compute_filter_similarity
        return compute_filter_similarity.tracer.start_active_span(name, child_of=parent_scope.span)

    log.debug(f"Computing similarities for {len(package)} chunks of filters")
    log.debug("Checking that the resource exists (in case of run being canceled/deleted)")
    assert_valid_run(project_id, run_id, log)

    def reindex_using_encoding_ids(recordarray, encoding_id_list):
        # Map results from "index in chunk" to encoding id.
        return array.array('I', [encoding_id_list[i] for i in recordarray])

    num_results = 0
    num_comparisons = 0
    sim_results = []

    with DBConn() as conn:
        if len(package) > 1:  # multiple full blocks in one package
            with new_child_span(f'fetching-encodings of package of size {len(package)}'):
                package = get_encoding_chunks(conn, package, encoding_size=encoding_size)
        else:  # this chunk is all part of one block
            with new_child_span(f'fetching-encodings of package with 1 chunk'):
                chunk_info_dp1, chunk_info_dp2 = package[0]
                chunk_with_ids_dp1, chunk_dp1_size = get_encoding_chunk(conn, chunk_info_dp1, encoding_size)
                entity_ids_dp1, chunk_dp1 = zip(*chunk_with_ids_dp1)
                chunk_info_dp1['encodings'] = chunk_dp1
                chunk_info_dp1['entity_ids'] = entity_ids_dp1
                chunk_with_ids_dp2, chunk_dp2_size = get_encoding_chunk(conn, chunk_info_dp2, encoding_size)
                entity_ids_dp2, chunk_dp2 = zip(*chunk_with_ids_dp2)
                chunk_info_dp2['encodings'] = chunk_dp2
                chunk_info_dp2['entity_ids'] = entity_ids_dp2
    log.debug('All encodings for package are fetched and deserialized')

    log.debug("Calculating filter similarities for work package")
    with new_child_span('comparing-encodings') as parent_scope:
        for chunk_dp1, chunk_dp2 in package:
            enc_dp1 = chunk_dp1['encodings']
            enc_dp1_size = len(enc_dp1)
            enc_dp2 = chunk_dp2['encodings']
            enc_dp2_size = len(enc_dp2)
            assert enc_dp1_size > 0, "Zero sized chunk in dp1"
            assert enc_dp2_size > 0, "Zero sized chunk in dp2"
            try:
                sims, (rec_is0, rec_is1) = anonlink.similarities.dice_coefficient_accelerated(
                    datasets=(enc_dp1, enc_dp2),
                    threshold=threshold,
                    k=min(enc_dp1_size, enc_dp2_size))
            except NotImplementedError as e:
                log.warning(f"Encodings couldn't be compared using anonlink. {e}")
                return
            rec_is0 = reindex_using_encoding_ids(rec_is0, chunk_dp1['entity_ids'])
            rec_is1 = reindex_using_encoding_ids(rec_is1, chunk_dp2['entity_ids'])
            num_results += len(sims)
            num_comparisons += enc_dp1_size * enc_dp2_size
            sim_results.append((sims, (rec_is0, rec_is1),
                                chunk_dp1['datasetIndex'], chunk_dp2['datasetIndex']))
        log.debug(f'comparison is done. {num_comparisons} comparisons got {num_results} pairs above the threshold')

    # progress reporting
    log.debug('Encoding similarities calculated')

    with new_child_span('update-comparison-progress') as scope:
        # Update the number of comparisons completed
        save_current_progress(num_comparisons, run_id)
        scope.span.log_kv({'comparisons': num_comparisons, 'num_similar': num_results})
        log.debug("Comparisons: {}, Links above threshold: {}".format(num_comparisons, num_results))

    # save the results into a file in minio
    with new_child_span('save-comparison-results-to-minio'):
        file_iters = []
        file_sizes = []
        for sims, (rec_is0, rec_is1), dp1_ds_idx, dp2_ds_idx in sim_results:
            num_sims = len(sims)

            if num_sims:
                # Make index arrays for serialization
                index_1 = array.array('I', (dp1_ds_idx,)) * num_sims
                index_2 = array.array('I', (dp2_ds_idx,)) * num_sims
                chunk_results = sims, (index_1, index_2), (rec_is0, rec_is1)
                bytes_iter, file_size = anonlink.serialization.dump_candidate_pairs_iter(chunk_results)
                file_iters.append(iterable_to_stream(bytes_iter))
                file_sizes.append(file_size)

        if len(file_iters) > 1:
            # we need to merge them first into one ordered stream
            merged_file_iter, merged_file_size = anonlink.serialization.merge_streams_iter(
                file_iters, sizes=file_sizes)
            merged_file_iter = iterable_to_stream(merged_file_iter)
        elif len(file_iters) == 1:
            merged_file_iter = file_iters[0]
            merged_file_size = file_sizes[0]
        else:
            return 0, None, None

        result_filename = Config.SIMILARITY_SCORES_FILENAME_FMT.format(generate_code(12))
        task_span.log_kv({"edges": num_results})
        log.info("Writing {} intermediate results to file: {}".format(num_results, result_filename))

        mc = connect_to_object_store()
        try:
            mc.put_object(Config.MINIO_BUCKET, result_filename, merged_file_iter, merged_file_size)
        except minio.ResponseError as err:
            log.warning("Failed to store result in minio: {}".format(err))
            raise

    return num_results, merged_file_size, result_filename
def pull_external_data(project_id, dp_id,
                       encoding_object_info,
                       blocks_object_info,
                       receipt_token, parent_span=None):
    """
    Load encoding and blocking data from object store.

    - pull blocking map into memory, create blocks in db
    - stream encodings into DB and add encoding + blocks from in memory dict.

    :param project_id: identifier for the project
    :param dp_id:
    :param encoding_object_info: a dictionary containing the bucket and path of the uploaded encodings
    :param blocks_object_info: a dictionary containing the bucket and path of the uploaded blocks
    :param receipt_token: token used to insert into database
    """
    env_credentials = parse_minio_credentials({
        'AccessKeyId': config.MINIO_ACCESS_KEY,
        'SecretAccessKey': config.MINIO_SECRET_KEY
    })
    log = logger.bind(pid=project_id, dp_id=dp_id)
    with DBConn() as conn:
        if not check_project_exists(conn, project_id):
            log.info("Project deleted, stopping immediately")
            return

    mc = connect_to_object_store(env_credentials)
    log.debug("Pulling blocking information from object store")
    response = mc.get_object(bucket_name=blocks_object_info['bucket'],
                             object_name=blocks_object_info['path'])
    encoding_to_block_map = json.load(response)['blocks']

    log.debug("Counting the blocks")
    block_sizes = {}
    for encoding_id in encoding_to_block_map:
        _blocks = encoding_to_block_map[encoding_id]
        for block_id in _blocks:
            block_id = str(block_id)
            block_sizes[block_id] = block_sizes.setdefault(block_id, 0) + 1
    block_count = len(block_sizes)
    log.debug(f"Processing {block_count} blocks")

    # stream the encodings
    bucket_name = encoding_object_info['bucket']
    object_name = encoding_object_info['path']

    stat, encodings_stream = stat_and_stream_object(bucket_name, object_name, env_credentials)
    count = int(stat.metadata['X-Amz-Meta-Hash-Count'])
    size = int(stat.metadata['X-Amz-Meta-Hash-Size'])
    log.debug(f"Processing {count} encodings of size {size}")
    assert count == len(encoding_to_block_map), f"Expected {count} encodings in blocks got {len(encoding_to_block_map)}"

    with DBConn() as conn:
        with opentracing.tracer.start_span('update-metadata-db', child_of=parent_span):
            insert_encoding_metadata(conn, None, dp_id, receipt_token,
                                     encoding_count=count, block_count=block_count)
            update_encoding_metadata_set_encoding_size(conn, dp_id, size)

        with opentracing.tracer.start_span('create-block-entries-in-db', child_of=parent_span):
            log.debug("Adding blocks to db")
            insert_blocking_metadata(conn, dp_id, block_sizes)

        def ijson_encoding_iterator(encoding_stream):
            binary_formatter = binary_format(size)
            for encoding_id, encoding in zip(range(count), encoding_stream):
                yield (
                    str(encoding_id),
                    binary_formatter.pack(encoding_id, deserialize_bytes(encoding)),
                    encoding_to_block_map[str(encoding_id)]
                )

        def encoding_iterator(encoding_stream):
            binary_formatter = binary_format(size)
            for encoding_id in range(count):
                yield (
                    str(encoding_id),
                    binary_formatter.pack(encoding_id, encoding_stream.read(size)),
                    encoding_to_block_map[str(encoding_id)]
                )

        if object_name.endswith('.json'):
            encodings_stream = ijson.items(io.BytesIO(encodings_stream.data), 'clks.item')
            encoding_generator = ijson_encoding_iterator(encodings_stream)
        else:
            encoding_generator = encoding_iterator(encodings_stream)

        with opentracing.tracer.start_span('upload-encodings-to-db', child_of=parent_span):
            log.debug("Adding encodings and associated blocks to db")
            try:
                store_encodings_in_db(conn, dp_id, encoding_generator, size)
            except Exception as e:
                update_dataprovider_uploaded_state(conn, project_id, dp_id, 'error')
                log.warning(e)

        with opentracing.tracer.start_span('update-encoding-metadata', child_of=parent_span):
            update_encoding_metadata(conn, None, dp_id, 'ready')
            update_blocks_state(conn, dp_id, block_sizes.keys(), 'ready')

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id):
        logger.info("All parties data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id, serialize_span(parent_span))
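# Illustration (not from the service code): a hypothetical minimal blocks upload as
# consumed above - a JSON object keyed by encoding id, each value listing the blocks
# that encoding belongs to (this version reads it from under the 'blocks' key).
blocks_upload_example = {
    "blocks": {
        "0": ["001", "211"],   # encoding 0 is a member of blocks 001 and 211
        "1": ["001"],
        "2": ["211"],
    }
}
# The derived block_sizes would be {'001': 2, '211': 2} and block_count would be 2.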
def compute_filter_similarity(chunk_info, project_id, run_id, threshold, encoding_size, parent_span=None):
    """Compute filter similarity between a chunk of filters in dataprovider 1,
    and a chunk of filters in dataprovider 2.

    :param dict chunk_info: A chunk returned by ``anonlink.concurrency.split_to_chunks``.
    :param project_id:
    :param run_id:
    :param threshold:
    :param encoding_size: The size in bytes of each encoded entry
    :param parent_span: A serialized opentracing span context.
    :returns A 3-tuple: (num_results, result size in bytes, results_filename_in_object_store)
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    task_span = compute_filter_similarity.span

    def new_child_span(name, parent_scope=None):
        if parent_scope is None:
            parent_scope = compute_filter_similarity
        return compute_filter_similarity.tracer.start_active_span(name, child_of=parent_scope.span)

    log.debug("Computing similarity for a chunk of filters")
    log.debug("Checking that the resource exists (in case of run being canceled/deleted)")
    assert_valid_run(project_id, run_id, log)

    chunk_info_dp1, chunk_info_dp2 = chunk_info

    with DBConn() as conn:
        with new_child_span('fetching-encodings') as parent_scope:
            with new_child_span('fetching-left-encodings', parent_scope):
                log.debug("Fetching and deserializing chunk of filters for dataprovider 1")
                chunk_with_ids_dp1, chunk_dp1_size = get_encoding_chunk(conn, chunk_info_dp1, encoding_size)
                entity_ids_dp1, chunk_dp1 = zip(*chunk_with_ids_dp1)

            with new_child_span('fetching-right-encodings', parent_scope):
                log.debug("Fetching and deserializing chunk of filters for dataprovider 2")
                chunk_with_ids_dp2, chunk_dp2_size = get_encoding_chunk(conn, chunk_info_dp2, encoding_size)
                entity_ids_dp2, chunk_dp2 = zip(*chunk_with_ids_dp2)

    log.debug('Both chunks are fetched and deserialized')
    task_span.log_kv({'size1': chunk_dp1_size, 'size2': chunk_dp2_size, 'chunk_info': chunk_info})

    assert chunk_dp1_size > 0, "Zero sized chunk in dp1"
    assert chunk_dp2_size > 0, "Zero sized chunk in dp2"

    with new_child_span('comparing-encodings') as parent_scope:
        log.debug("Calculating filter similarity")
        with new_child_span('dice-call', parent_scope):
            try:
                sims, (rec_is0, rec_is1) = anonlink.similarities.dice_coefficient_accelerated(
                    datasets=(chunk_dp1, chunk_dp2),
                    threshold=threshold,
                    k=min(chunk_dp1_size, chunk_dp2_size))
            except NotImplementedError as e:
                log.warning("Encodings couldn't be compared using anonlink.")
                return

        with new_child_span('reindex-call', parent_scope):
            def reindex_using_encoding_ids(recordarray, encoding_id_list):
                # Map results from "index in chunk" to encoding id.
                return array.array('I', [encoding_id_list[i] for i in recordarray])

            rec_is0 = reindex_using_encoding_ids(rec_is0, entity_ids_dp1)
            rec_is1 = reindex_using_encoding_ids(rec_is1, entity_ids_dp2)

    log.debug('Encoding similarities calculated')

    with new_child_span('update-comparison-progress') as scope:
        # Update the number of comparisons completed
        comparisons_computed = chunk_dp1_size * chunk_dp2_size
        save_current_progress(comparisons_computed, run_id)
        scope.span.log_kv({'comparisons': comparisons_computed, 'num_similar': len(sims)})
        log.debug("Comparisons: {}, Links above threshold: {}".format(comparisons_computed, len(sims)))

    with new_child_span('save-comparison-results-to-minio'):
        num_results = len(sims)

        if num_results:
            result_filename = Config.SIMILARITY_SCORES_FILENAME_FMT.format(generate_code(12))
            task_span.log_kv({"edges": num_results})
            log.info("Writing {} intermediate results to file: {}".format(num_results, result_filename))

            # Make index arrays for serialization
            index_1 = array.array('I', (chunk_info_dp1["datasetIndex"],)) * num_results
            index_2 = array.array('I', (chunk_info_dp2["datasetIndex"],)) * num_results

            chunk_results = sims, (index_1, index_2), (rec_is0, rec_is1)
            bytes_iter, file_size = anonlink.serialization.dump_candidate_pairs_iter(chunk_results)
            iter_stream = iterable_to_stream(bytes_iter)

            mc = connect_to_object_store()
            try:
                mc.put_object(Config.MINIO_BUCKET, result_filename, iter_stream, file_size)
            except minio.ResponseError as err:
                log.warning("Failed to store result in minio")
                raise
        else:
            result_filename = None
            file_size = None

    return num_results, file_size, result_filename
def pull_external_data(project_id, dp_id,
                       encoding_object_info, encoding_credentials,
                       blocks_object_info, blocks_credentials,
                       receipt_token, parent_span=None):
    """
    Load encoding and blocking data from object store.

    - pull blocking map into memory, create blocks in db
    - stream encodings into DB and add encoding + blocks from in memory dict.
    """
    log = logger.bind(pid=project_id, dp_id=dp_id)
    with DBConn() as conn:
        if not check_project_exists(conn, project_id):
            log.info("Project deleted, stopping immediately")
            return

    mc = connect_to_object_store(parse_minio_credentials(blocks_credentials))
    log.debug("Pulling blocking information from object store")
    response = mc.get_object(bucket_name=blocks_object_info['bucket'],
                             object_name=blocks_object_info['path'])
    encoding_to_block_map = json.load(response)

    log.debug("Counting the blocks")
    block_sizes = {}
    for encoding_id in encoding_to_block_map:
        _blocks = encoding_to_block_map[encoding_id]
        for block_id in _blocks:
            block_id = str(block_id)
            block_sizes[block_id] = block_sizes.setdefault(block_id, 0) + 1
    block_count = len(block_sizes)
    log.debug(f"Processing {block_count} blocks")

    # stream the encodings
    bucket_name = encoding_object_info['bucket']
    object_name = encoding_object_info['path']

    stat, encodings_stream = stat_and_stream_object(bucket_name, object_name,
                                                    parse_minio_credentials(encoding_credentials))
    count = int(stat.metadata['X-Amz-Meta-Hash-Count'])
    size = int(stat.metadata['X-Amz-Meta-Hash-Size'])
    log.debug(f"Processing {count} encodings of size {size}")
    assert count == len(encoding_to_block_map), f"Expected {count} encodings in blocks got {len(encoding_to_block_map)}"

    with DBConn() as conn:
        with opentracing.tracer.start_span('update-metadata-db', child_of=parent_span):
            insert_encoding_metadata(conn, None, dp_id, receipt_token,
                                     encoding_count=count, block_count=block_count)
            update_encoding_metadata_set_encoding_size(conn, dp_id, size)

        with opentracing.tracer.start_span('create-block-entries-in-db', child_of=parent_span):
            log.debug("Adding blocks to db")
            insert_blocking_metadata(conn, dp_id, block_sizes)

        def encoding_iterator(encoding_stream):
            binary_formatter = binary_format(size)
            for encoding_id in range(count):
                yield (
                    str(encoding_id),
                    binary_formatter.pack(encoding_id, encoding_stream.read(size)),
                    encoding_to_block_map[str(encoding_id)]
                )

        with opentracing.tracer.start_span('upload-encodings-to-db', child_of=parent_span):
            log.debug("Adding encodings and associated blocks to db")
            try:
                store_encodings_in_db(conn, dp_id, encoding_iterator(encodings_stream), size)
            except Exception as e:
                update_dataprovider_uploaded_state(conn, project_id, dp_id, 'error')
                log.warning(e)

        with opentracing.tracer.start_span('update-encoding-metadata', child_of=parent_span):
            update_encoding_metadata(conn, None, dp_id, 'ready')
            update_blocks_state(conn, dp_id, block_sizes.keys(), 'ready')
def compute_filter_similarity(chunk_info, project_id, run_id, threshold, encoding_size, parent_span=None):
    """Compute filter similarity between a chunk of filters in dataprovider 1,
    and a chunk of filters in dataprovider 2.

    :param chunk_info: Chunk info returned by ``anonlink.concurrency.split_to_chunks``.
        Additionally, "storeFilename" is added to each dataset chunk.
    :param project_id:
    :param threshold:
    :param encoding_size: The size in bytes of each encoded entry
    :param parent_span: A serialized opentracing span context.
    @returns A 3-tuple: (num_results, result size in bytes, results_filename_in_object_store)
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Computing similarity for a chunk of filters")
    span = compute_filter_similarity.span
    log.debug("Checking that the resource exists (in case of job being canceled)")
    with DBConn() as db:
        if not check_project_exists(db, project_id) or not check_run_exists(db, project_id, run_id):
            log.info("Failing task as project or run not found in database.")
            raise DBResourceMissing("project or run not found in database")

    chunk_info_dp1, chunk_info_dp2 = chunk_info

    t0 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 1")
    chunk_dp1, chunk_dp1_size = get_chunk_from_object_store(chunk_info_dp1, encoding_size)

    t1 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 2")
    chunk_dp2, chunk_dp2_size = get_chunk_from_object_store(chunk_info_dp2, encoding_size)
    t2 = time.time()
    span.log_kv({'event': 'chunks are fetched and deserialized'})
    log.debug("Calculating filter similarity")
    span.log_kv({'size1': chunk_dp1_size, 'size2': chunk_dp2_size})
    chunk_results = anonlink.concurrency.process_chunk(
        chunk_info,
        (chunk_dp1, chunk_dp2),
        anonlink.similarities.dice_coefficient_accelerated,
        threshold,
        k=min(chunk_dp1_size, chunk_dp2_size))
    t3 = time.time()
    span.log_kv({'event': 'similarities calculated'})

    # Update the number of comparisons completed
    comparisons_computed = chunk_dp1_size * chunk_dp2_size
    save_current_progress(comparisons_computed, run_id)
    t4 = time.time()

    sims, _, _ = chunk_results
    num_results = len(sims)

    if num_results:
        result_filename = Config.SIMILARITY_SCORES_FILENAME_FMT.format(generate_code(12))
        log.info("Writing {} intermediate results to file: {}".format(num_results, result_filename))

        bytes_iter, file_size = anonlink.serialization.dump_candidate_pairs_iter(chunk_results)
        iter_stream = iterable_to_stream(bytes_iter)

        mc = connect_to_object_store()
        try:
            mc.put_object(Config.MINIO_BUCKET, result_filename, iter_stream, file_size)
        except minio.ResponseError as err:
            log.warning("Failed to store result in minio")
            raise
    else:
        result_filename = None
        file_size = None
    t5 = time.time()

    log.info("run={} Comparisons: {}, Links above threshold: {}".format(
        run_id, comparisons_computed, num_results))
    log.info("Prep: {:.3f} + {:.3f}, Solve: {:.3f}, Progress: {:.3f}, Save: {:.3f}, Total: {:.3f}".format(
        t1 - t0, t2 - t1, t3 - t2, t4 - t3, t5 - t4, t5 - t0))
    return num_results, file_size, result_filename
def compute_filter_similarity(chunk_info_dp1, chunk_info_dp2, project_id, run_id, threshold,
                              encoding_size, parent_span=None):
    """Compute filter similarity between a chunk of filters in dataprovider 1,
    and a chunk of filters in dataprovider 2.

    :param chunk_info_dp1: A tuple containing:
        - object store filename
        - Chunk start index
        - Chunk stop index
    :param chunk_info_dp2:
    :param project_id:
    :param threshold:
    :param encoding_size: The size in bytes of each encoded entry
    :param parent_span: A serialized opentracing span context.
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Computing similarity for a chunk of filters")
    span = compute_filter_similarity.span
    log.debug("Checking that the resource exists (in case of job being canceled)")
    with DBConn() as db:
        if not check_project_exists(db, project_id) or not check_run_exists(db, project_id, run_id):
            log.info("Stopping as project or run not found in database.")
            return None

    t0 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 1")
    chunk_dp1, chunk_dp1_size = get_chunk_from_object_store(chunk_info_dp1, encoding_size)

    t1 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 2")
    chunk_dp2, chunk_dp2_size = get_chunk_from_object_store(chunk_info_dp2, encoding_size)
    t2 = time.time()
    span.log_kv({'event': 'chunks are fetched and deserialized'})
    log.debug("Calculating filter similarity")
    span.log_kv({'size1': chunk_dp1_size, 'size2': chunk_dp2_size})
    chunk_results = anonlink.entitymatch.calculate_filter_similarity(
        chunk_dp1, chunk_dp2,
        threshold=threshold,
        k=min(chunk_dp1_size, chunk_dp2_size),
        use_python=False)
    t3 = time.time()
    span.log_kv({'event': 'similarities calculated'})

    # Update the number of comparisons completed
    comparisons_computed = chunk_dp1_size * chunk_dp2_size
    save_current_progress(comparisons_computed, run_id)
    t4 = time.time()

    partial_sparse_result = []
    # offset chunk's index
    offset_dp1 = chunk_info_dp1[1]
    offset_dp2 = chunk_info_dp2[1]
    log.debug("Offset DP1 by: {}, DP2 by: {}".format(offset_dp1, offset_dp2))
    for (ia, score, ib) in chunk_results:
        partial_sparse_result.append((ia + offset_dp1, ib + offset_dp2, score))
    t5 = time.time()

    num_results = len(partial_sparse_result)
    if num_results > 0:
        result_filename = 'chunk-res-{}.csv'.format(generate_code(12))
        log.info("Writing {} intermediate results to file: {}".format(num_results, result_filename))

        with open(result_filename, 'wt') as f:
            csvwriter = csv.writer(f)
            csvwriter.writerows(partial_sparse_result)

        # Now write these to the object store, and return the filename and summary.
        # Will write a csv file for now
        mc = connect_to_object_store()
        try:
            mc.fput_object(Config.MINIO_BUCKET, result_filename, result_filename)
        except minio.ResponseError as err:
            log.warning("Failed to store result in minio")
            raise

        # If we don't delete the file we *do* run out of space
        os.remove(result_filename)
    else:
        result_filename = None
    t6 = time.time()

    log.info("run={} Comparisons: {}, Links above threshold: {}".format(
        run_id, comparisons_computed, len(chunk_results)))
    log.info("Prep: {:.3f} + {:.3f}, Solve: {:.3f}, Progress: {:.3f}, Offset: {:.3f}, Save: {:.3f}, Total: {:.3f}".format(
        t1 - t0, t2 - t1, t3 - t2, t4 - t3, t5 - t4, t6 - t5, t6 - t0))
    return num_results, result_filename