def run_failed_handler(*args, **kwargs):
    """ Record that a task has encountered an error, and mark the run as failed.

    :param args: A 1-tuple starting with the result id.
    :param kwargs: Keyword arguments to the task e.g. {'run_id': '...', }
    """
    task_id = args[0]
    log = logger
    if 'run_id' in kwargs:
        log = logger.bind(run_id=kwargs['run_id'])
    log.info("An error occurred while processing task", task_id=task_id)
    with DBConn() as db:
        update_run_mark_failure(db, kwargs['run_id'])
    log.warning("Marked run as failure")
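# Hedged usage note (not part of the original module): elsewhere in this file the handler is
# attached as a Celery error callback, either via ``link_error`` on ``apply_async`` or via
# ``on_error`` on a chord callback, so it fires whenever the linked task raises. For example
# (names as used later in this module):
#
#     create_comparison_jobs.apply_async(
#         kwargs={'project_id': project_id, 'run_id': run_id},
#         link_error=run_failed_handler.s(run_id=run_id))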
def check_for_executable_runs(project_id, parent_span=None):
    """ This is called when a run is posted (if the project is ready for runs), and
    also after all data providers have uploaded CLKs and the CLKs are ready.
    """
    log = logger.bind(pid=project_id)
    log.debug("Checking for runs that need to be executed")
    if not clks_uploaded_to_project(project_id, check_data_ready=True):
        return

    with DBConn() as conn:
        try:
            check_and_set_project_encoding_size(project_id, conn)
        except ValueError as e:
            log.warning(e.args[0])
            # Make sure this error is exposed to the user by marking the run/s as failed
            update_project_mark_all_runs_failed(conn, project_id)
            return
        new_runs = get_created_runs_and_queue(conn, project_id)

        log.debug("Progressing run stages")
        for qr in new_runs:
            # Record that the run has reached a new stage
            run_id = qr[0]
            progress_stage(conn, run_id)

    # Commit db changes before scheduling the follow-up tasks
    log.debug("Creating tasks for {} created runs for project {}".format(len(new_runs), project_id))
    for qr in new_runs:
        run_id = qr[0]
        log.info('Queueing run for computation', run_id=run_id)
        prerun_check.delay(project_id, run_id, check_for_executable_runs.get_serialized_span())
def check_and_set_project_encoding_size(project_id, conn):
    # Check for consistency between uploaded encodings and commit to a
    # project encoding size if one wasn't provided in the linkage schema.
    log = logger.bind(pid=project_id)
    uploaded_encoding_sizes = get_uploaded_encoding_sizes(conn, project_id)
    first_uploaded_size = uploaded_encoding_sizes[0][1]
    schema_encoding_size = get_project_schema_encoding_size(conn, project_id)
    project_encoding_size = get_project_encoding_size(conn, project_id)
    # In order of preference:
    encoding_size = project_encoding_size or schema_encoding_size or first_uploaded_size
    log.debug(f"Uploaded encoding sizes: {uploaded_encoding_sizes}")
    log.debug(f"Encoding size set in schema: {schema_encoding_size}")
    log.debug(f"Project encoding size: {project_encoding_size}")

    log.info(f"Verifying that all uploads have an encoding size of {encoding_size} bytes.")
    for dp_id, enc_size in uploaded_encoding_sizes:
        if enc_size != encoding_size:
            log.warning(f"Setting the encodings' upload state to error for dp={dp_id} and aborting processing")
            handle_invalid_encoding_data(project_id, dp_id)
            raise ValueError("Mismatch in encoding sizes. Stopping")
    if project_encoding_size is None:
        set_project_encoding_size(conn, project_id, encoding_size)

    if not config.MIN_ENCODING_SIZE <= encoding_size <= config.MAX_ENCODING_SIZE:
        # Set all uploads to error state
        for dp_id, _ in uploaded_encoding_sizes:
            handle_invalid_encoding_data(project_id, dp_id)
        raise ValueError("Encoding size out of configured bounds")
    if encoding_size % 8:
        raise ValueError("Encoding size must be a multiple of 8 bytes (64 bits)")
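# A minimal, self-contained sketch (not part of the original module) illustrating the
# precedence rule used above: an explicit project encoding size wins, then the value from
# the linkage schema, then the size of the first upload. Numbers below are made up.
def _choose_encoding_size(project_encoding_size, schema_encoding_size, first_uploaded_size):
    # In order of preference:
    return project_encoding_size or schema_encoding_size or first_uploaded_size

assert _choose_encoding_size(None, None, 128) == 128   # fall back to the first upload
assert _choose_encoding_size(None, 64, 128) == 64      # schema value beats uploads
assert _choose_encoding_size(112, 64, 128) == 112      # explicit project setting beats both
assert _choose_encoding_size(112, 64, 128) % 8 == 0    # and it must be a whole number of 8-byte words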
def save_and_permute(similarity_result, project_id, run_id, parent_span):
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Saving and possibly permuting data")
    mapping = similarity_result['mapping']

    # Note Postgres requires JSON object keys to be strings
    # Celery actually converts the json arguments in the same way

    with DBConn() as db:
        result_type = get_project_column(db, project_id, 'result_type')
        # Just save the raw "mapping"
        log.debug("Saving the resulting map data to the db")
        result_id = insert_mapping_result(db, run_id, mapping)
        dp_ids = get_dataprovider_ids(db, project_id)

    log.info("Mapping result saved to db with result id {}".format(result_id))

    if result_type == "permutations":
        log.debug("Submitting job to permute mapping")
        permute_mapping_data.apply_async(
            (project_id, run_id,
             similarity_result['lenf1'], similarity_result['lenf2'],
             save_and_permute.get_serialized_span()))
    else:
        log.debug("Mark mapping job as complete")
        mark_run_complete.delay(run_id, save_and_permute.get_serialized_span())

    # Post similarity computation cleanup
    log.debug("Removing clk filters from redis cache")
    for dp_id in dp_ids:
        cache.remove_from_cache(dp_id)
    calculate_comparison_rate.delay()
def create_comparison_jobs(project_id, run_id, parent_span=None):
    """Schedule all the entity comparisons as sub tasks for a run.

    At a high level this task:
    - checks if the project and run have been deleted and if so aborts.
    - retrieves metadata: the number and size of the datasets, the encoding size,
      and the number and size of blocks.
    - splits the work into independent "chunks" and schedules them to run in celery
    - schedules the follow up task to run after all the comparisons have been computed.
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    current_span = create_comparison_jobs.span
    with DBConn() as conn:
        check_run_active(conn, project_id, run_id)

        dp_ids = get_dataprovider_ids(conn, project_id)
        number_of_datasets = len(dp_ids)
        assert number_of_datasets >= 2, "Expected at least 2 data providers"
        log.info(f"Scheduling comparison of CLKs from data provider ids: "
                 f"{', '.join(map(str, dp_ids))}")

        # Retrieve required metadata
        dataset_sizes, dp_block_sizes = _retrieve_blocked_dataset_sizes(conn, project_id, dp_ids)

        log.info("Finding blocks in common between dataproviders")
        common_blocks = _get_common_blocks(dp_block_sizes, dp_ids)

        # We pass the encoding_size and threshold to the comparison tasks to minimize their db lookups
        encoding_size = get_project_encoding_size(conn, project_id)
        threshold = get_run(conn, run_id)['threshold']

    log.debug("Chunking computation task")
    # Create "chunks" of comparisons
    chunks = _create_work_chunks(common_blocks, dp_block_sizes, dp_ids, log)

    log.info(f"Chunking into {len(chunks)} computation tasks")
    current_span.log_kv({"event": "chunking", 'num_chunks': len(chunks), 'dataset-sizes': dataset_sizes})
    span_serialized = create_comparison_jobs.get_serialized_span()

    # Prepare the Celery Chord that will compute all the similarity scores:
    scoring_tasks = [
        compute_filter_similarity.si(chunk_info, project_id, run_id, threshold,
                                     encoding_size, span_serialized)
        for chunk_info in chunks
    ]

    if len(scoring_tasks) == 1:
        scoring_tasks.append(celery_bug_fix.si())

    callback_task = aggregate_comparisons.s(
        project_id=project_id, run_id=run_id, parent_span=span_serialized
    ).on_error(run_failed_handler.s(run_id=run_id))
    log.info("Scheduling comparison tasks")
    future = chord(scoring_tasks)(callback_task)
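# Hypothetical sketch (not the project's actual helper) of what ``_get_common_blocks`` could
# look like, assuming ``dp_block_sizes`` maps each data provider id to a dict of
# ``{block_id: number_of_encodings}``: a block is only worth comparing when every data
# provider has at least one encoding in it.
def _get_common_blocks_sketch(dp_block_sizes, dp_ids):
    common = set(dp_block_sizes[dp_ids[0]])
    for dp_id in dp_ids[1:]:
        common &= set(dp_block_sizes[dp_id])
    return common

# Example with made-up ids: only block "b2" is present for both providers.
assert _get_common_blocks_sketch({1: {"b1": 10, "b2": 5}, 2: {"b2": 7, "b3": 2}}, [1, 2]) == {"b2"}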
def mark_run_complete(run_id, parent_span=None):
    log = logger.bind(run_id=run_id)
    log.debug("Marking run complete")
    with DBConn() as db:
        update_run_mark_complete(db, run_id)
    calculate_comparison_rate.delay()
    log.info("Run marked as complete")
def mark_run_complete(run_id, parent_span=None):
    log = logger.bind(run_id=run_id)
    log.debug("Marking run complete")
    with DBConn() as db:
        update_run_mark_complete(db, run_id)
    set_run_state_complete(run_id)
    log.info("Run marked as complete")
def prerun_check(project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Sanity check that we need to compute run")

    with DBConn() as conn:
        if not check_project_exists(conn, project_id):
            log.debug("Project not found. Skipping")
            raise ProjectDeleted(project_id)

        res = get_run(conn, run_id)
        if res is None:
            log.debug("Run not found. Skipping")
            raise RunDeleted(run_id)

        try:
            state = get_run_state_for_update(conn, run_id)
        except psycopg2.OperationalError:
            log.warning("Run started in another task. Skipping this race.")
            return

        if state in {'running', 'completed', 'error'}:
            log.warning("Run already started. Skipping")
            return

        log.debug("Setting run as in progress")
        update_run_set_started(conn, run_id)

        log.debug("Getting dp ids for compute similarity task")
        dp_ids = get_dataprovider_ids(conn, project_id)
        log.debug("Data providers: {}".format(dp_ids))

    create_comparison_jobs.delay(project_id, run_id, prerun_check.get_serialized_span())
    log.info("CLK similarity computation scheduled")
def delete_minio_objects(filenames, project_id):
    log = logger.bind(pid=project_id)
    mc = connect_to_object_store()
    log.info(f"Deleting {len(filenames)} files from object store")
    try:
        mc.remove_objects(Config.MINIO_BUCKET, filenames)
    except MinioError as e:
        log.warning(f"Error occurred while removing objects {filenames}. Ignoring.")
def delete_minio_objects(filenames, project_id, parent_span=None):
    log = logger.bind(pid=project_id)
    mc = connect_to_object_store()
    log.info(f"Deleting {len(filenames)} files from object store")
    try:
        # remove_objects returns a lazy iterator of per-object errors;
        # it must be consumed for the deletions to actually be carried out.
        for del_err in mc.remove_objects(Config.MINIO_BUCKET, filenames):
            log.debug("Deletion error: {}".format(del_err))
    except MinioError as e:
        log.warning(f"Error occurred while removing objects {filenames}. Ignoring.")

    if Config.UPLOAD_OBJECT_STORE_ENABLED:
        log.debug("Deleting everything uploaded to object store for project")
        delete_object_store_folder(mc, Config.UPLOAD_OBJECT_STORE_BUCKET, f"{project_id}/")
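# Hypothetical sketch (not the project's actual helper) of ``delete_object_store_folder``,
# assuming the standard Minio client API: list every object under the prefix, then remove
# them, consuming the lazy error iterator as above.
def delete_object_store_folder_sketch(mc, bucket_name, path):
    object_names = [obj.object_name
                    for obj in mc.list_objects(bucket_name, prefix=path, recursive=True)]
    for del_err in mc.remove_objects(bucket_name, object_names):
        # Surface any per-object deletion errors rather than silently dropping them.
        print(f"Deletion error: {del_err}")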
def aggregate_comparisons(similarity_result_files, project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    if similarity_result_files is None:
        return
    mc = connect_to_object_store()
    files = []
    data_size = 0

    for num, filename in similarity_result_files:
        if num > 0:
            files.append(filename)
            data_size += mc.stat_object(Config.MINIO_BUCKET, filename).size

    log.debug("Aggregating result chunks from {} files, total size: {}".format(
        len(files), fmt_bytes(data_size)))

    result_file_stream_generator = (mc.get_object(Config.MINIO_BUCKET, result_filename)
                                    for result_filename in files)

    log.info("Similarity score results are {}".format(fmt_bytes(data_size)))
    result_stream = chain_streams(result_file_stream_generator)

    with DBConn() as db:
        result_type = get_project_column(db, project_id, 'result_type')

        # Note: Storing the similarity scores for all result types
        result_filename = store_similarity_scores(result_stream, run_id, data_size, db)

        if result_type == "similarity_scores":
            # Post similarity computation cleanup
            dp_ids = get_dataprovider_ids(db, project_id)
        else:
            # we promote the run to the next stage
            progress_stage(db, run_id)
            lenf1, lenf2 = get_project_dataset_sizes(db, project_id)

    # DB now committed, we can fire off tasks that depend on the new db state
    if result_type == "similarity_scores":
        log.info("Deleting intermediate similarity score files from object store")
        mc.remove_objects(Config.MINIO_BUCKET, files)
        log.debug("Removing clk filters from redis cache")
        remove_from_cache(dp_ids[0])
        remove_from_cache(dp_ids[1])

        # Complete the run
        log.info("Marking run as complete")
        mark_run_complete.delay(run_id, aggregate_comparisons.get_serialized_span())
    else:
        solver_task.delay(result_filename, project_id, run_id, lenf1, lenf2,
                          aggregate_comparisons.get_serialized_span())
def prerun_check(project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Sanity check that we need to compute run")

    # Being very defensive here: check whether the run state is already in the redis cache
    if not is_run_missing(run_id):
        log.warning("Unexpectedly, the run state is present in redis before starting")
        return

    with DBConn() as conn:
        if not check_project_exists(conn, project_id):
            log.debug("Project not found. Skipping")
            raise ProjectDeleted(project_id)

        res = get_run(conn, run_id)
        if res is None:
            log.debug("Run not found. Skipping")
            raise RunDeleted(run_id)

        try:
            db_state = get_run_state_for_update(conn, run_id)
        except psycopg2.OperationalError:
            log.warning("Run started in another task. Skipping this race.")
            return

        if db_state in {'running', 'completed', 'error'}:
            log.warning("Run already started. Skipping")
            return

        log.debug("Setting run state in db as 'running'")
        update_run_set_started(conn, run_id)

        log.debug("Updating redis cache for run")
        set_run_state_active(run_id)

    create_comparison_jobs.apply_async(
        kwargs={
            'project_id': project_id,
            'run_id': run_id,
            'parent_span': prerun_check.get_serialized_span()
        },
        link_error=run_failed_handler.s())
    log.info("CLK similarity computation scheduled")
def remove_project(project_id):
    """Remove all resources for a project: its database rows and its object store files."""
    log = logger.bind(pid=project_id)
    log.debug("Removing all project resources")
    conn = db.connect_db()

    log.debug("Deleting project resources from database")
    db.delete_project_data(conn, project_id)

    log.debug("Getting object store files associated with project from database")
    object_store_files = db.get_all_objects_for_project(conn, project_id)
    log.debug(f"Removing {len(object_store_files)} object store files associated with project.")
    delete_minio_objects.delay(object_store_files, project_id)

    log.info("Project resources removed")
def save_and_permute(similarity_result, project_id, run_id, parent_span):
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Saving and possibly permuting data")
    groups = similarity_result['groups']

    # Note Postgres requires JSON object keys to be strings
    # Celery actually converts the json arguments in the same way

    with DBConn() as db:
        result_type = get_project_column(db, project_id, 'result_type')

        if result_type == "groups":
            # Save the raw groups
            log.debug("Saving the groups in the DB")
            result_id = insert_mapping_result(db, run_id, groups)
        else:
            # Turn groups into a mapping and save that
            log.debug("Turning groups into mapping")
            mapping = groups_to_mapping(groups)
            log.debug("Saving mapping in the DB")
            result_id = insert_mapping_result(db, run_id, mapping)

        dp_ids = get_dataprovider_ids(db, project_id)

    log.info("Result saved to db with result id {}".format(result_id))

    if result_type == "permutations":
        log.debug("Submitting job to permute mapping")
        dataset0_size, dataset1_size = similarity_result['datasetSizes']
        permute_mapping_data.apply_async(
            (project_id, run_id, dataset0_size, dataset1_size,
             save_and_permute.get_serialized_span()))
    else:
        log.debug("Mark job as complete")
        mark_run_complete.delay(run_id, save_and_permute.get_serialized_span())

    # Post similarity computation cleanup
    log.debug("Removing clk filters from redis cache")
    for dp_id in dp_ids:
        cache.remove_from_cache(dp_id)
    calculate_comparison_rate.delay()
def solver_task(similarity_scores_filename, project_id, run_id, dataset_sizes, parent_span):
    log = logger.bind(pid=project_id, run_id=run_id)
    mc = connect_to_object_store()
    solver_task.span.log_kv({
        'datasetSizes': dataset_sizes,
        'filename': similarity_scores_filename
    })
    score_file = mc.get_object(config.MINIO_BUCKET, similarity_scores_filename)
    log.debug("Creating python sparse matrix from bytes data")
    candidate_pairs = anonlink.serialization.load_candidate_pairs(score_file)
    log.info("Calculating the optimal mapping from similarity matrix")
    groups = anonlink.solving.greedy_solve(candidate_pairs)
    log.info("Entity groups have been computed")
    res = {"groups": groups, "datasetSizes": dataset_sizes}
    save_and_permute.delay(res, project_id, run_id, solver_task.get_serialized_span())
def remove_project(project_id, parent_span=None):
    """Remove all resources for a project: its runs, database rows and object store files."""
    log = logger.bind(pid=project_id)
    log.debug("Removing all project resources")

    with DBConn() as conn:
        run_objects = db.get_runs(conn, project_id)
        log.debug("Setting run status as 'deleted'")
        for run in run_objects:
            set_run_state_deleted(run_id=run['run_id'])

        log.debug("Deleting project resources from database")
        db.delete_project_data(conn, project_id)

        log.debug("Getting object store files associated with project from database")
        object_store_files = db.get_all_objects_for_project(conn, project_id)

    delete_minio_objects.delay(object_store_files, project_id, parent_span)
    log.info("Project resources removed")
def solver_task(similarity_scores_filename, project_id, run_id, dataset_sizes, parent_span):
    log = logger.bind(pid=project_id, run_id=run_id)
    mc = connect_to_object_store()
    solver_task.span.log_kv({
        'datasetSizes': dataset_sizes,
        'filename': similarity_scores_filename
    })
    score_file = mc.get_object(config.MINIO_BUCKET, similarity_scores_filename)
    log.debug("Creating python sparse matrix from bytes data")
    candidate_pairs_with_duplicates = anonlink.serialization.load_candidate_pairs(score_file)
    similarity_scores, (dset_is0, dset_is1), (rec_is0, rec_is1) = candidate_pairs_with_duplicates
    log.info(f"Number of candidate pairs before deduplication: {len(candidate_pairs_with_duplicates[0])}")

    if len(candidate_pairs_with_duplicates[0]) > 0:
        # TODO use public interface when available
        # https://github.com/data61/anonlink/issues/271
        candidate_pairs = _merge_similarities(
            [zip(similarity_scores, dset_is0, dset_is1, rec_is0, rec_is1)], k=None)
        log.info(f"Number of candidate pairs after deduplication: {len(candidate_pairs[0])}")

        log.info("Calculating the optimal mapping from similarity matrix")
        groups = anonlink.solving.greedy_solve(candidate_pairs)
    else:
        groups = []

    log.info("Entity groups have been computed")
    res = {"groups": groups, "datasetSizes": dataset_sizes}
    save_and_permute.delay(res, project_id, run_id, solver_task.get_serialized_span())
def solver_task(similarity_scores_filename, project_id, run_id, lenf1, lenf2, parent_span):
    log = logger.bind(pid=project_id, run_id=run_id)
    mc = connect_to_object_store()
    solver_task.span.log_kv({
        'lenf1': lenf1,
        'lenf2': lenf2,
        'filename': similarity_scores_filename
    })
    score_file = mc.get_object(config.MINIO_BUCKET, similarity_scores_filename)
    log.debug("Creating python sparse matrix from bytes data")
    sparse_matrix = similarity_matrix_from_csv_bytes(score_file.data)
    log.info("Calculating the optimal mapping from similarity matrix")
    mapping = anonlink.entitymatch.greedy_solver(sparse_matrix)

    log.debug("Converting all indices to strings")
    for key in mapping:
        mapping[key] = str(mapping[key])

    log.info("Entity mapping has been computed")
    res = {"mapping": mapping, "lenf1": lenf1, "lenf2": lenf2}
    save_and_permute.delay(res, project_id, run_id, solver_task.get_serialized_span())
def compute_filter_similarity(package, project_id, run_id, threshold, encoding_size, parent_span=None):
    """Compute filter similarity between chunks of filters from dataprovider 1
    and chunks of filters from dataprovider 2.

    :param package: A list of chunks; each chunk is a pair of chunk infos as
        returned by ``anonlink.concurrency.split_to_chunks``.
    :param project_id:
    :param run_id:
    :param threshold:
    :param encoding_size: The size in bytes of each encoded entry
    :param parent_span: A serialized opentracing span context.
    :returns A 3-tuple: (num_results, result size in bytes, results_filename_in_object_store, )
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    task_span = compute_filter_similarity.span

    def new_child_span(name, parent_scope=None):
        if parent_scope is None:
            parent_scope = compute_filter_similarity
        return compute_filter_similarity.tracer.start_active_span(name, child_of=parent_scope.span)

    log.debug(f"Computing similarities for {len(package)} chunks of filters")
    log.debug("Checking that the resource exists (in case of run being canceled/deleted)")
    assert_valid_run(project_id, run_id, log)

    def reindex_using_encoding_ids(recordarray, encoding_id_list):
        # Map results from "index in chunk" to encoding id.
        return array.array('I', [encoding_id_list[i] for i in recordarray])

    num_results = 0
    num_comparisons = 0
    sim_results = []

    with DBConn() as conn:
        if len(package) > 1:  # multiple full blocks in one package
            with new_child_span(f'fetching-encodings of package of size {len(package)}'):
                package = get_encoding_chunks(conn, package, encoding_size=encoding_size)
        else:  # this chunk is all part of one block
            with new_child_span('fetching-encodings of package with 1 chunk'):
                chunk_info_dp1, chunk_info_dp2 = package[0]

                chunk_with_ids_dp1, chunk_dp1_size = get_encoding_chunk(conn, chunk_info_dp1, encoding_size)
                entity_ids_dp1, chunk_dp1 = zip(*chunk_with_ids_dp1)
                chunk_info_dp1['encodings'] = chunk_dp1
                chunk_info_dp1['entity_ids'] = entity_ids_dp1

                chunk_with_ids_dp2, chunk_dp2_size = get_encoding_chunk(conn, chunk_info_dp2, encoding_size)
                entity_ids_dp2, chunk_dp2 = zip(*chunk_with_ids_dp2)
                chunk_info_dp2['encodings'] = chunk_dp2
                chunk_info_dp2['entity_ids'] = entity_ids_dp2
    log.debug('All encodings for package are fetched and deserialized')

    log.debug("Calculating filter similarities for work package")
    with new_child_span('comparing-encodings') as parent_scope:
        for chunk_dp1, chunk_dp2 in package:
            enc_dp1 = chunk_dp1['encodings']
            enc_dp1_size = len(enc_dp1)
            enc_dp2 = chunk_dp2['encodings']
            enc_dp2_size = len(enc_dp2)
            assert enc_dp1_size > 0, "Zero sized chunk in dp1"
            assert enc_dp2_size > 0, "Zero sized chunk in dp2"
            try:
                sims, (rec_is0, rec_is1) = anonlink.similarities.dice_coefficient_accelerated(
                    datasets=(enc_dp1, enc_dp2),
                    threshold=threshold,
                    k=min(enc_dp1_size, enc_dp2_size))
            except NotImplementedError as e:
                log.warning(f"Encodings couldn't be compared using anonlink. {e}")
                return
            rec_is0 = reindex_using_encoding_ids(rec_is0, chunk_dp1['entity_ids'])
            rec_is1 = reindex_using_encoding_ids(rec_is1, chunk_dp2['entity_ids'])
            num_results += len(sims)
            num_comparisons += enc_dp1_size * enc_dp2_size
            sim_results.append((sims, (rec_is0, rec_is1),
                                chunk_dp1['datasetIndex'], chunk_dp2['datasetIndex']))
        log.debug(f'Comparison is done. {num_comparisons} comparisons '
                  f'got {num_results} pairs above the threshold')

    # Progress reporting
    log.debug('Encoding similarities calculated')

    with new_child_span('update-comparison-progress') as scope:
        # Update the number of comparisons completed
        save_current_progress(num_comparisons, run_id)
        scope.span.log_kv({'comparisons': num_comparisons, 'num_similar': num_results})
        log.debug("Comparisons: {}, Links above threshold: {}".format(num_comparisons, num_results))

    # Write results to a file in minio
    with new_child_span('save-comparison-results-to-minio'):
        file_iters = []
        file_sizes = []
        for sims, (rec_is0, rec_is1), dp1_ds_idx, dp2_ds_idx in sim_results:
            num_sims = len(sims)
            if num_sims:
                # Make index arrays for serialization
                index_1 = array.array('I', (dp1_ds_idx,)) * num_sims
                index_2 = array.array('I', (dp2_ds_idx,)) * num_sims
                chunk_results = sims, (index_1, index_2), (rec_is0, rec_is1),

                bytes_iter, file_size = anonlink.serialization.dump_candidate_pairs_iter(chunk_results)
                file_iters.append(iterable_to_stream(bytes_iter))
                file_sizes.append(file_size)

        if len(file_iters) > 1:
            # Merge the per-chunk files into one ordered stream first
            merged_file_iter, merged_file_size = anonlink.serialization.merge_streams_iter(
                file_iters, sizes=file_sizes)
            merged_file_iter = iterable_to_stream(merged_file_iter)
        elif len(file_iters) == 1:
            merged_file_iter = file_iters[0]
            merged_file_size = file_sizes[0]
        else:
            return 0, None, None

        result_filename = Config.SIMILARITY_SCORES_FILENAME_FMT.format(generate_code(12))
        task_span.log_kv({"edges": num_results})
        log.info("Writing {} intermediate results to file: {}".format(num_results, result_filename))

        mc = connect_to_object_store()
        try:
            mc.put_object(Config.MINIO_BUCKET, result_filename, merged_file_iter, merged_file_size)
        except minio.ResponseError as err:
            log.warning("Failed to store result in minio: {}".format(err))
            raise

    return num_results, merged_file_size, result_filename
def create_comparison_jobs(project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    with DBConn() as conn:
        dp_ids = get_dataprovider_ids(conn, project_id)
        assert len(dp_ids) >= 2, "Expected at least 2 data providers"

        log.info("Starting comparison of CLKs from data provider ids: {}, {}".format(dp_ids[0], dp_ids[1]))
        current_span = create_comparison_jobs.span

        if not check_project_exists(conn, project_id) or not check_run_exists(conn, project_id, run_id):
            log.info("Skipping as project or run not found in database.")
            return

        run_info = get_run(conn, run_id)
        threshold = run_info['threshold']

        dataset_sizes = get_project_dataset_sizes(conn, project_id)

        if len(dataset_sizes) < 2:
            log.warning("Unexpected number of dataset sizes in db. Stopping")
            update_run_mark_failure(conn, run_id)
            return
        else:
            lenf1, lenf2 = dataset_sizes

        encoding_size = get_project_encoding_size(conn, project_id)

        size = lenf1 * lenf2

        log.info("Computing similarity for {} x {} entities".format(lenf1, lenf2))
        current_span.log_kv({"event": 'get-dataset-sizes'})

        filters1_object_filename = get_filter_metadata(conn, dp_ids[0])
        filters2_object_filename = get_filter_metadata(conn, dp_ids[1])
        current_span.log_kv({"event": 'get-metadata'})

        log.debug("Chunking computation task")
        chunk_size = Config.get_task_chunk_size(size, threshold)
        if chunk_size is None:
            chunk_size = max(lenf1, lenf2)
        log.info("Chunks will contain {} entities per task".format(chunk_size))
        update_run_chunk(conn, project_id, chunk_size)

    job_chunks = []
    dp1_chunks = []
    dp2_chunks = []

    for chunk_start_index_dp1 in range(0, lenf1, chunk_size):
        dp1_chunks.append(
            (filters1_object_filename, chunk_start_index_dp1, min(chunk_start_index_dp1 + chunk_size, lenf1))
        )
    for chunk_start_index_dp2 in range(0, lenf2, chunk_size):
        dp2_chunks.append(
            (filters2_object_filename, chunk_start_index_dp2, min(chunk_start_index_dp2 + chunk_size, lenf2))
        )

    # Every chunk in dp1 has to be run against every chunk in dp2
    for dp1_chunk in dp1_chunks:
        for dp2_chunk in dp2_chunks:
            job_chunks.append((dp1_chunk, dp2_chunk, ))

    log.info("Chunking into {} computation tasks each with (at most) {} entities.".format(
        len(job_chunks), chunk_size))
    current_span.log_kv({"event": "chunking", "chunksize": chunk_size, 'num_chunks': len(job_chunks)})
    span_serialized = create_comparison_jobs.get_serialized_span()

    # Prepare the Celery Chord that will compute all the similarity scores:
    scoring_tasks = [compute_filter_similarity.si(
        chunk_dp1,
        chunk_dp2,
        project_id,
        run_id,
        threshold,
        encoding_size,
        span_serialized
    ) for chunk_dp1, chunk_dp2 in job_chunks]

    if len(scoring_tasks) == 1:
        scoring_tasks.append(celery_bug_fix.si())

    callback_task = aggregate_comparisons.s(project_id, run_id, parent_span=span_serialized).on_error(
        on_chord_error.s(run_id=run_id))
    future = chord(scoring_tasks)(callback_task)
def compute_filter_similarity(chunk_info, project_id, run_id, threshold, encoding_size, parent_span=None):
    """Compute filter similarity between a chunk of filters in dataprovider 1,
    and a chunk of filters in dataprovider 2.

    :param dict chunk_info: A chunk returned by ``anonlink.concurrency.split_to_chunks``.
    :param project_id:
    :param run_id:
    :param threshold:
    :param encoding_size: The size in bytes of each encoded entry
    :param parent_span: A serialized opentracing span context.
    :returns A 3-tuple: (num_results, result size in bytes, results_filename_in_object_store, )
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    task_span = compute_filter_similarity.span

    def new_child_span(name, parent_scope=None):
        if parent_scope is None:
            parent_scope = compute_filter_similarity
        return compute_filter_similarity.tracer.start_active_span(name, child_of=parent_scope.span)

    log.debug("Computing similarity for a chunk of filters")
    log.debug("Checking that the resource exists (in case of run being canceled/deleted)")
    assert_valid_run(project_id, run_id, log)

    chunk_info_dp1, chunk_info_dp2 = chunk_info

    with DBConn() as conn:
        with new_child_span('fetching-encodings') as parent_scope:
            with new_child_span('fetching-left-encodings', parent_scope):
                log.debug("Fetching and deserializing chunk of filters for dataprovider 1")
                chunk_with_ids_dp1, chunk_dp1_size = get_encoding_chunk(conn, chunk_info_dp1, encoding_size)
                entity_ids_dp1, chunk_dp1 = zip(*chunk_with_ids_dp1)

            with new_child_span('fetching-right-encodings', parent_scope):
                log.debug("Fetching and deserializing chunk of filters for dataprovider 2")
                chunk_with_ids_dp2, chunk_dp2_size = get_encoding_chunk(conn, chunk_info_dp2, encoding_size)
                entity_ids_dp2, chunk_dp2 = zip(*chunk_with_ids_dp2)

    log.debug('Both chunks are fetched and deserialized')
    task_span.log_kv({'size1': chunk_dp1_size, 'size2': chunk_dp2_size, 'chunk_info': chunk_info})

    assert chunk_dp1_size > 0, "Zero sized chunk in dp1"
    assert chunk_dp2_size > 0, "Zero sized chunk in dp2"

    with new_child_span('comparing-encodings') as parent_scope:
        log.debug("Calculating filter similarity")
        with new_child_span('dice-call', parent_scope):
            try:
                sims, (rec_is0, rec_is1) = anonlink.similarities.dice_coefficient_accelerated(
                    datasets=(chunk_dp1, chunk_dp2),
                    threshold=threshold,
                    k=min(chunk_dp1_size, chunk_dp2_size))
            except NotImplementedError as e:
                log.warning("Encodings couldn't be compared using anonlink.")
                return

        with new_child_span('reindex-call', parent_scope):
            def reindex_using_encoding_ids(recordarray, encoding_id_list):
                # Map results from "index in chunk" to encoding id.
                return array.array('I', [encoding_id_list[i] for i in recordarray])

            rec_is0 = reindex_using_encoding_ids(rec_is0, entity_ids_dp1)
            rec_is1 = reindex_using_encoding_ids(rec_is1, entity_ids_dp2)

    log.debug('Encoding similarities calculated')

    with new_child_span('update-comparison-progress') as scope:
        # Update the number of comparisons completed
        comparisons_computed = chunk_dp1_size * chunk_dp2_size
        save_current_progress(comparisons_computed, run_id)
        scope.span.log_kv({'comparisons': comparisons_computed, 'num_similar': len(sims)})
        log.debug("Comparisons: {}, Links above threshold: {}".format(comparisons_computed, len(sims)))

    with new_child_span('save-comparison-results-to-minio'):
        num_results = len(sims)
        if num_results:
            result_filename = Config.SIMILARITY_SCORES_FILENAME_FMT.format(generate_code(12))
            task_span.log_kv({"edges": num_results})
            log.info("Writing {} intermediate results to file: {}".format(num_results, result_filename))

            # Make index arrays for serialization
            index_1 = array.array('I', (chunk_info_dp1["datasetIndex"],)) * num_results
            index_2 = array.array('I', (chunk_info_dp2["datasetIndex"],)) * num_results
            chunk_results = sims, (index_1, index_2), (rec_is0, rec_is1),

            bytes_iter, file_size = anonlink.serialization.dump_candidate_pairs_iter(chunk_results)
            iter_stream = iterable_to_stream(bytes_iter)

            mc = connect_to_object_store()
            try:
                mc.put_object(Config.MINIO_BUCKET, result_filename, iter_stream, file_size)
            except minio.ResponseError as err:
                log.warning("Failed to store result in minio")
                raise
        else:
            result_filename = None
            file_size = None

    return num_results, file_size, result_filename
def permute_mapping_data(project_id, run_id, len_filters1, len_filters2, parent_span):
    """
    Task which will create a permutation after a mapping has been completed.

    :param project_id: The project resource id
    :param run_id: The run id
    :param len_filters1:
    :param len_filters2:
    """
    log = logger.bind(pid=project_id, run_id=run_id)

    with DBConn() as conn:
        mapping_str = get_run_result(conn, run_id)

        # Convert to int: int
        mapping = {int(k): int(mapping_str[k]) for k in mapping_str}

        log.info("Creating random permutations")
        log.debug("Entities in dataset A: {}, Entities in dataset B: {}".format(len_filters1, len_filters2))

        # Pack all the entities that match into the **same** random locations in both
        # permutations. Then fill in all the gaps!
        # Dictionaries first, then converted to lists.

        smaller_dataset_size = min(len_filters1, len_filters2)
        log.debug("Smaller dataset size is {}".format(smaller_dataset_size))
        number_in_common = len(mapping)
        a_permutation = {}  # Should be length of filters1
        b_permutation = {}  # length of filters2

        # By default mark all rows as NOT included in the mask
        mask = {i: False for i in range(smaller_dataset_size)}

        # Start with all the possible indexes
        remaining_new_indexes = list(range(smaller_dataset_size))
        log.info("Shuffling indices for matched entities")
        random.shuffle(remaining_new_indexes)
        log.info("Assigning random indexes for {} matched entities".format(number_in_common))

        for mapping_number, a_index in enumerate(mapping):
            b_index = mapping[a_index]

            # Choose the index in the new mapping (randomly)
            mapping_index = remaining_new_indexes[mapping_number]

            a_permutation[a_index] = mapping_index
            b_permutation[b_index] = mapping_index

            # Mark the row as included in the mask
            mask[mapping_index] = True

        remaining_new_indexes = set(remaining_new_indexes[number_in_common:])
        log.info("Randomly adding all non matched entities")

        # Note the a and b datasets could be of different size.
        # At this point, both still have to use the remaining_new_indexes, and any
        # indexes that go over the number_in_common
        remaining_a_values = list(set(range(smaller_dataset_size, len_filters1)).union(remaining_new_indexes))
        remaining_b_values = list(set(range(smaller_dataset_size, len_filters2)).union(remaining_new_indexes))

        log.debug("Shuffling the remaining indices")
        random.shuffle(remaining_a_values)
        random.shuffle(remaining_b_values)

        # For every element in a's permutation
        for a_index in range(len_filters1):
            # Check if it is not already present
            if a_index not in a_permutation:
                # This index isn't yet mapped

                # Choose and remove a random index from the extended list of those that remain
                # note this "could" be the same row (a NOP 1-1 permutation)
                mapping_index = remaining_a_values.pop()
                a_permutation[a_index] = mapping_index

        # For every element in b's permutation
        for b_index in range(len_filters2):
            # Check if it is not already present
            if b_index not in b_permutation:
                # This index isn't yet mapped

                # Choose and remove a random index from the extended list of those that remain
                # note this "could" be the same row (a NOP 1-1 permutation)
                mapping_index = remaining_b_values.pop()
                b_permutation[b_index] = mapping_index

        log.debug("Completed creating new permutations for each party")

        dp_ids = get_dataprovider_ids(conn, project_id)

        for i, permutation in enumerate([a_permutation, b_permutation]):
            # We convert here because celery and dicts with int keys don't play nice
            perm_list = convert_mapping_to_list(permutation)
            log.debug("Saving a permutation")
            insert_permutation(conn, dp_ids[i], run_id, perm_list)

        log.debug("Raw permutation data saved. Now saving raw mask")

        # Convert the mask dict to a list of 0/1 ints
        mask_list = convert_mapping_to_list(
            {int(key): 1 if value else 0 for key, value in mask.items()})

        log.debug("Saving the mask")
        insert_permutation_mask(conn, project_id, run_id, mask_list)
        log.info("Mask saved")
        log.info("Committing database transaction")

    mark_run_complete.delay(run_id, permute_mapping_data.get_serialized_span())
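# A minimal, self-contained illustration (not part of the original module) of the idea above:
# matched rows land at the same shuffled position in both permutations, and the mask flags
# exactly those positions. Uses a fixed seed and tiny made-up data so it can be run directly.
import random as _random

def _toy_permutations(mapping, len_a, len_b, seed=0):
    rng = _random.Random(seed)
    smaller = min(len_a, len_b)
    positions = list(range(smaller))
    rng.shuffle(positions)
    a_perm, b_perm = {}, {}
    mask = [0] * smaller
    # Matched entities share a random position in both permutations.
    for n, a_index in enumerate(mapping):
        pos = positions[n]
        a_perm[a_index] = pos
        b_perm[mapping[a_index]] = pos
        mask[pos] = 1
    # Fill the remaining rows of each side with the unused positions.
    leftovers = set(positions[len(mapping):])
    rem_a = sorted(set(range(smaller, len_a)) | leftovers)
    rem_b = sorted(set(range(smaller, len_b)) | leftovers)
    rng.shuffle(rem_a)
    rng.shuffle(rem_b)
    for i in range(len_a):
        if i not in a_perm:
            a_perm[i] = rem_a.pop()
    for i in range(len_b):
        if i not in b_perm:
            b_perm[i] = rem_b.pop()
    return a_perm, b_perm, mask

a_perm, b_perm, mask = _toy_permutations({0: 2, 3: 1}, len_a=4, len_b=3)
# Every matched pair shares a position, and the mask flags exactly those positions.
assert a_perm[0] == b_perm[2] and a_perm[3] == b_perm[1]
assert sum(mask) == 2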
def compute_filter_similarity(chunk_info, project_id, run_id, threshold, encoding_size, parent_span=None):
    """Compute filter similarity between a chunk of filters in dataprovider 1,
    and a chunk of filters in dataprovider 2.

    :param chunk_info: Chunk info returned by ``anonlink.concurrency.split_to_chunks``.
        Additionally, "storeFilename" is added to each dataset chunk.
    :param project_id:
    :param threshold:
    :param encoding_size: The size in bytes of each encoded entry
    :param parent_span: A serialized opentracing span context.
    @returns A 3-tuple: (num_results, result size in bytes, results_filename_in_object_store)
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Computing similarity for a chunk of filters")
    span = compute_filter_similarity.span
    log.debug("Checking that the resource exists (in case of job being canceled)")
    with DBConn() as db:
        if not check_project_exists(db, project_id) or not check_run_exists(db, project_id, run_id):
            log.info("Failing task as project or run not found in database.")
            raise DBResourceMissing("project or run not found in database")

    chunk_info_dp1, chunk_info_dp2 = chunk_info

    t0 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 1")
    chunk_dp1, chunk_dp1_size = get_chunk_from_object_store(chunk_info_dp1, encoding_size)

    t1 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 2")
    chunk_dp2, chunk_dp2_size = get_chunk_from_object_store(chunk_info_dp2, encoding_size)

    t2 = time.time()
    span.log_kv({'event': 'chunks are fetched and deserialized'})
    log.debug("Calculating filter similarity")
    span.log_kv({'size1': chunk_dp1_size, 'size2': chunk_dp2_size})
    chunk_results = anonlink.concurrency.process_chunk(
        chunk_info,
        (chunk_dp1, chunk_dp2),
        anonlink.similarities.dice_coefficient_accelerated,
        threshold,
        k=min(chunk_dp1_size, chunk_dp2_size))

    t3 = time.time()
    span.log_kv({'event': 'similarities calculated'})

    # Update the number of comparisons completed
    comparisons_computed = chunk_dp1_size * chunk_dp2_size
    save_current_progress(comparisons_computed, run_id)

    t4 = time.time()

    sims, _, _ = chunk_results
    num_results = len(sims)

    if num_results:
        result_filename = Config.SIMILARITY_SCORES_FILENAME_FMT.format(generate_code(12))
        log.info("Writing {} intermediate results to file: {}".format(num_results, result_filename))

        bytes_iter, file_size = anonlink.serialization.dump_candidate_pairs_iter(chunk_results)
        iter_stream = iterable_to_stream(bytes_iter)

        mc = connect_to_object_store()
        try:
            mc.put_object(Config.MINIO_BUCKET, result_filename, iter_stream, file_size)
        except minio.ResponseError as err:
            log.warning("Failed to store result in minio")
            raise
    else:
        result_filename = None
        file_size = None
    t5 = time.time()

    # Report the number of links found, not the length of the (fixed-size) result tuple.
    log.info("run={} Comparisons: {}, Links above threshold: {}".format(
        run_id, comparisons_computed, num_results))
    log.info("Prep: {:.3f} + {:.3f}, Solve: {:.3f}, Progress: {:.3f}, Save: {:.3f}, Total: {:.3f}".format(
        t1 - t0, t2 - t1, t3 - t2, t4 - t3, t5 - t4, t5 - t0))

    return num_results, file_size, result_filename
def aggregate_comparisons(similarity_result_files, project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    if similarity_result_files is None:
        raise TypeError("Inappropriate argument type - missing results files.")

    files = []
    for res in similarity_result_files:
        if res is None:
            log.warning("Missing results during aggregation. Stopping processing.")
            raise TypeError("Inappropriate argument type - results missing at aggregation step.")
        num, filesize, filename = res
        if num:
            assert filesize is not None
            assert filename is not None
            files.append((num, filesize, filename))
        else:
            assert filesize is None
            assert filename is None
    heapq.heapify(files)

    log.debug(f"Aggregating result chunks from {len(files)} files, "
              f"total size: {sum(map(operator.itemgetter(1), files))}")

    mc = connect_to_object_store()
    while len(files) > 1:
        file0 = heapq.heappop(files)
        file1 = heapq.heappop(files)
        merged_file = _merge_files(mc, log, file0, file1)
        heapq.heappush(files, merged_file)

    if not files:
        # No results. Let's chuck in an empty file.
        empty_file = _put_placeholder_empty_file(mc, log)
        files.append(empty_file)

    (merged_num, merged_filesize, merged_filename), = files
    log.info(f"Similarity score results in {merged_filename} in bucket "
             f"{Config.MINIO_BUCKET} take up {merged_filesize} bytes.")

    with DBConn() as db:
        result_type = get_project_column(db, project_id, 'result_type')
        result_id = insert_similarity_score_file(db, run_id, merged_filename)
        log.debug(f"Saved path to similarity scores file to db with id {result_id}")

        if result_type == "similarity_scores":
            # Post similarity computation cleanup
            dp_ids = get_dataprovider_ids(db, project_id)
        else:
            # we promote the run to the next stage
            progress_stage(db, run_id)
            dataset_sizes = get_project_dataset_sizes(db, project_id)

    # DB now committed, we can fire off tasks that depend on the new db state
    if result_type == "similarity_scores":
        log.debug("Removing clk filters from redis cache")
        for dp_id in dp_ids:
            remove_from_cache(dp_id)

        # Complete the run
        log.info("Marking run as complete")
        mark_run_complete.delay(run_id, aggregate_comparisons.get_serialized_span())
    else:
        solver_task.delay(merged_filename, project_id, run_id, dataset_sizes,
                          aggregate_comparisons.get_serialized_span())
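# Hypothetical sketch (not the project's actual helper) of what ``_merge_files`` could do,
# reusing the same anonlink and minio calls that appear elsewhere in this module: stream the
# two candidate-pair files, merge them into one ordered stream, upload the result, and delete
# the inputs. The returned tuple matches the (num, filesize, filename) shape the heap expects.
def _merge_files_sketch(mc, log, file0, file1):
    num0, size0, name0 = file0
    num1, size1, name1 = file1
    streams = [mc.get_object(Config.MINIO_BUCKET, name) for name in (name0, name1)]
    merged_iter, merged_size = anonlink.serialization.merge_streams_iter(streams, sizes=[size0, size1])
    merged_name = Config.SIMILARITY_SCORES_FILENAME_FMT.format(generate_code(12))
    mc.put_object(Config.MINIO_BUCKET, merged_name, iterable_to_stream(merged_iter), merged_size)
    log.debug(f"Merged {name0} and {name1} into {merged_name}")
    # Clean up the now-redundant intermediate files, consuming the lazy error iterator.
    for del_err in mc.remove_objects(Config.MINIO_BUCKET, [name0, name1]):
        log.warning(f"Deletion error: {del_err}")
    return num0 + num1, merged_size, merged_name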
def create_comparison_jobs(project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    with DBConn() as conn:
        dp_ids = get_dataprovider_ids(conn, project_id)
        assert len(dp_ids) >= 2, "Expected at least 2 data providers"

        log.info(f"Starting comparison of CLKs from data provider ids: "
                 f"{', '.join(map(str, dp_ids))}")
        current_span = create_comparison_jobs.span

        if not check_project_exists(conn, project_id) or not check_run_exists(conn, project_id, run_id):
            log.info("Skipping as project or run not found in database.")
            return

        run_info = get_run(conn, run_id)
        threshold = run_info['threshold']

        dataset_sizes = get_project_dataset_sizes(conn, project_id)

        if len(dataset_sizes) < 2:
            log.warning("Unexpected number of dataset sizes in db. Stopping")
            update_run_mark_failure(conn, run_id)
            return

        encoding_size = get_project_encoding_size(conn, project_id)

        log.info(f"Computing similarity for "
                 f"{' x '.join(map(str, dataset_sizes))} entities")
        current_span.log_kv({"event": 'get-dataset-sizes'})

        filters_object_filenames = tuple(
            get_filter_metadata(conn, dp_id) for dp_id in dp_ids)
        current_span.log_kv({"event": 'get-metadata'})

        log.debug("Chunking computation task")

    chunk_infos = tuple(anonlink.concurrency.split_to_chunks(Config.CHUNK_SIZE_AIM,
                                                             dataset_sizes=dataset_sizes))

    # Save filenames with chunk information.
    for chunk_info in chunk_infos:
        for chunk_dp_info in chunk_info:
            chunk_dp_index = chunk_dp_info['datasetIndex']
            chunk_dp_store_filename = filters_object_filenames[chunk_dp_index]
            chunk_dp_info['storeFilename'] = chunk_dp_store_filename

    log.info(f"Chunking into {len(chunk_infos)} computation tasks")
    current_span.log_kv({"event": "chunking", 'num_chunks': len(chunk_infos)})
    span_serialized = create_comparison_jobs.get_serialized_span()

    # Prepare the Celery Chord that will compute all the similarity scores:
    scoring_tasks = [
        compute_filter_similarity.si(chunk_info, project_id, run_id, threshold,
                                     encoding_size, span_serialized)
        for chunk_info in chunk_infos
    ]

    if len(scoring_tasks) == 1:
        scoring_tasks.append(celery_bug_fix.si())

    callback_task = aggregate_comparisons.s(
        project_id, run_id, parent_span=span_serialized
    ).on_error(on_chord_error.s(run_id=run_id))
    future = chord(scoring_tasks)(callback_task)
def handle_raw_upload(project_id, dp_id, receipt_token, parent_span=None):
    # User has uploaded base64 encodings as JSON
    log = logger.bind(pid=project_id, dp_id=dp_id)
    log.info("Handling user provided base64 encodings")

    with DBConn() as db:
        if not check_project_exists(db, project_id):
            log.info("Project deleted, stopping immediately")
            return
        expected_count = get_number_of_hashes(db, dp_id)

    log.info(f"Expecting to handle {expected_count} encodings")
    mc = connect_to_object_store()

    # Input file is line separated base64 record encodings.
    raw_file = Config.RAW_FILENAME_FMT.format(receipt_token)
    raw_data_response = mc.get_object(Config.MINIO_BUCKET, raw_file)

    # Set up streaming processing pipeline
    buffered_stream = iterable_to_stream(raw_data_response.stream())
    text_stream = io.TextIOWrapper(buffered_stream, newline='\n')

    clkcounts = []

    def filter_generator():
        log.debug("Deserializing json filters")
        first_encoding_size = None
        for i, line in enumerate(text_stream):
            ba = deserialize_bitarray(line)
            yield (ba, i, ba.count())
            clkcounts.append(ba.count())
            encsize = len(ba)
            if i == 0:
                first_encoding_size = encsize
            if encsize != first_encoding_size:
                raise ValueError("Encodings were not all the same size")
        log.info(f"Processed {len(clkcounts)} hashes")

    # We peek at the first element as we need the encoding size
    # for the rest of our processing pipeline
    python_filters = more_itertools.peekable(filter_generator())
    # Note the len of a bitarray is returned in bits but we require
    # this to be a multiple of 8 so we use bytes.
    uploaded_encoding_size = len(python_filters.peek()[0]) // 8

    # This is the first time we've seen the encoding size from this data provider
    try:
        check_dataproviders_encoding(project_id, uploaded_encoding_size)
    except InvalidEncodingError as e:
        log.warning(e.args[0])
        handle_invalid_encoding_data(project_id, dp_id)

    with DBConn() as db:
        # Save the encoding size as metadata
        update_encoding_metadata_set_encoding_size(db, dp_id, uploaded_encoding_size)

    # Output file is our custom binary packed file
    filename = Config.BIN_FILENAME_FMT.format(receipt_token)
    bit_packed_element_size = binary_format(uploaded_encoding_size).size
    num_bytes = expected_count * bit_packed_element_size

    # If small enough, preload the data into our redis cache
    if expected_count < Config.ENTITY_CACHE_THRESHOLD:
        log.info("Caching pickled clk data")
        python_filters = list(python_filters)
        cache.set_deserialized_filter(dp_id, python_filters)
    else:
        log.info("Not caching clk data as it is too large")

    packed_filters = binary_pack_filters(python_filters, uploaded_encoding_size)
    packed_filter_stream = iterable_to_stream(packed_filters)

    # Upload to object store
    log.info(f"Uploading {expected_count} encodings of size {uploaded_encoding_size} "
             f"to object store. Total Size: {fmt_bytes(num_bytes)}")
    mc.put_object(Config.MINIO_BUCKET, filename, data=packed_filter_stream, length=num_bytes)

    with DBConn() as conn:
        update_encoding_metadata(conn, filename, dp_id, 'ready')

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id, check_data_ready=True):
        log.info("All parties' data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id, handle_raw_upload.get_serialized_span())
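# Hypothetical sketch (not necessarily the project's actual helper) of what
# ``deserialize_bitarray`` amounts to, assuming each input line is a base64-encoded bit
# pattern: decode the text to raw bytes and load them into a bitarray, whose ``len`` is
# then measured in bits as noted above.
import base64
from bitarray import bitarray

def deserialize_bitarray_sketch(line):
    ba = bitarray()
    ba.frombytes(base64.b64decode(line.strip()))
    return ba

# 16 raw bytes encode to a 128-bit bitarray, i.e. a 16-byte encoding.
example = base64.b64encode(bytes(range(16))).decode()
assert len(deserialize_bitarray_sketch(example)) // 8 == 16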
def compute_filter_similarity(chunk_info_dp1, chunk_info_dp2, project_id, run_id, threshold,
                              encoding_size, parent_span=None):
    """Compute filter similarity between a chunk of filters in dataprovider 1,
    and a chunk of filters in dataprovider 2.

    :param chunk_info_dp1: A tuple containing:
        - object store filename
        - Chunk start index
        - Chunk stop index
    :param chunk_info_dp2:
    :param project_id:
    :param threshold:
    :param encoding_size: The size in bytes of each encoded entry
    :param parent_span: A serialized opentracing span context.
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Computing similarity for a chunk of filters")
    span = compute_filter_similarity.span
    log.debug("Checking that the resource exists (in case of job being canceled)")
    with DBConn() as db:
        if not check_project_exists(db, project_id) or not check_run_exists(db, project_id, run_id):
            log.info("Stopping as project or run not found in database.")
            return None

    t0 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 1")
    chunk_dp1, chunk_dp1_size = get_chunk_from_object_store(chunk_info_dp1, encoding_size)

    t1 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 2")
    chunk_dp2, chunk_dp2_size = get_chunk_from_object_store(chunk_info_dp2, encoding_size)

    t2 = time.time()
    span.log_kv({'event': 'chunks are fetched and deserialized'})
    log.debug("Calculating filter similarity")
    span.log_kv({'size1': chunk_dp1_size, 'size2': chunk_dp2_size})
    chunk_results = anonlink.entitymatch.calculate_filter_similarity(
        chunk_dp1, chunk_dp2,
        threshold=threshold,
        k=min(chunk_dp1_size, chunk_dp2_size),
        use_python=False)

    t3 = time.time()
    span.log_kv({'event': 'similarities calculated'})

    # Update the number of comparisons completed
    comparisons_computed = chunk_dp1_size * chunk_dp2_size
    save_current_progress(comparisons_computed, run_id)

    t4 = time.time()

    partial_sparse_result = []
    # Offset each chunk's indices back into dataset coordinates
    offset_dp1 = chunk_info_dp1[1]
    offset_dp2 = chunk_info_dp2[1]
    log.debug("Offset DP1 by: {}, DP2 by: {}".format(offset_dp1, offset_dp2))
    for (ia, score, ib) in chunk_results:
        partial_sparse_result.append((ia + offset_dp1, ib + offset_dp2, score))

    t5 = time.time()

    num_results = len(partial_sparse_result)
    if num_results > 0:
        result_filename = 'chunk-res-{}.csv'.format(generate_code(12))
        log.info("Writing {} intermediate results to file: {}".format(num_results, result_filename))

        with open(result_filename, 'wt') as f:
            csvwriter = csv.writer(f)
            csvwriter.writerows(partial_sparse_result)

        # Now write these to the object store and return the filename and summary.
        # Will write a csv file for now
        mc = connect_to_object_store()
        try:
            mc.fput_object(Config.MINIO_BUCKET, result_filename, result_filename)
        except minio.ResponseError as err:
            log.warning("Failed to store result in minio")
            raise

        # If we don't delete the file we *do* run out of space
        os.remove(result_filename)
    else:
        result_filename = None

    t6 = time.time()

    log.info("run={} Comparisons: {}, Links above threshold: {}".format(
        run_id, comparisons_computed, num_results))
    log.info("Prep: {:.3f} + {:.3f}, Solve: {:.3f}, Progress: {:.3f}, Offset: {:.3f}, Save: {:.3f}, Total: {:.3f}".format(
        t1 - t0, t2 - t1, t3 - t2, t4 - t3, t5 - t4, t6 - t5, t6 - t0))

    return num_results, result_filename