def prerun_check(project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Sanity check that we need to compute run")

    with DBConn() as conn:
        if not check_project_exists(conn, project_id):
            log.debug("Project not found. Skipping")
            raise ProjectDeleted(project_id)

        res = get_run(conn, run_id)
        if res is None:
            log.debug("Run not found. Skipping")
            raise RunDeleted(run_id)

        try:
            state = get_run_state_for_update(conn, run_id)
        except psycopg2.OperationalError:
            log.warning("Run started in another task. Skipping this race.")
            return

        if state in {'running', 'completed', 'error'}:
            log.warning("Run already started. Skipping")
            return

        log.debug("Setting run as in progress")
        update_run_set_started(conn, run_id)

        log.debug("Getting dp ids for compute similarity task")
        dp_ids = get_dataprovider_ids(conn, project_id)
        log.debug("Data providers: {}".format(dp_ids))

    create_comparison_jobs.delay(project_id, run_id,
                                 prerun_check.get_serialized_span())
    log.info("CLK similarity computation scheduled")
def assert_valid_run(project_id, run_id, log):
    if not is_run_active(run_id):
        raise InactiveRun("Run isn't marked as active")

    with DBConn() as db:
        if not check_project_exists(db, project_id) or \
                not check_run_exists(db, project_id, run_id):
            log.info("Project or run not found in database.")
            raise DBResourceMissing("project or run not found in database")
def prerun_check(project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Sanity check that we need to compute run")

    # Being very defensive here: check whether the run state is already in
    # the redis cache before starting.
    if not is_run_missing(run_id):
        log.warning("unexpectedly the run state is present in redis before starting")
        return

    with DBConn() as conn:
        if not check_project_exists(conn, project_id):
            log.debug("Project not found. Skipping")
            raise ProjectDeleted(project_id)

        res = get_run(conn, run_id)
        if res is None:
            log.debug("Run not found. Skipping")
            raise RunDeleted(run_id)

        try:
            db_state = get_run_state_for_update(conn, run_id)
        except psycopg2.OperationalError:
            log.warning("Run started in another task. Skipping this race.")
            return

        if db_state in {'running', 'completed', 'error'}:
            log.warning("Run already started. Skipping")
            return

        log.debug("Setting run state in db as 'running'")
        update_run_set_started(conn, run_id)
        log.debug("Updating redis cache for run")
        set_run_state_active(run_id)

    create_comparison_jobs.apply_async(
        kwargs={
            'project_id': project_id,
            'run_id': run_id,
            'parent_span': prerun_check.get_serialized_span()
        },
        link_error=run_failed_handler.s())
    log.info("CLK similarity computation scheduled")
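# The run-state cache helpers used above (is_run_missing, set_run_state_active)
# are referenced but not defined in this snippet. A minimal sketch of what they
# might look like, assuming redis-py and a hypothetical "run:{run_id}:state"
# key scheme -- the real implementation may differ:
import redis

redis_client = redis.Redis()  # assumed connection; configure host/port as needed


def is_run_missing(run_id):
    """True if no state has been cached for this run yet."""
    return not redis_client.exists('run:{}:state'.format(run_id))


def set_run_state_active(run_id):
    """Mark the run as active in the cache so racing workers skip it."""
    redis_client.set('run:{}:state'.format(run_id), 'active')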
def check_run_active(conn, project_id, run_id):
    """Raises InactiveRun if the project or run has been deleted from the
    database.
    """
    if not check_project_exists(conn, project_id) or \
            not check_run_exists(conn, project_id, run_id):
        raise InactiveRun("Skipping as project or run not found in database.")
def abort_if_project_doesnt_exist(project_id):
    conn = get_db()
    resource_exists = db.check_project_exists(conn, project_id)
    if not resource_exists:
        logger.info("Requested project resource with invalid identifier token")
        safe_fail_request(403, message=INVALID_ACCESS_MSG)
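# safe_fail_request is an application helper not shown in this snippet. A
# minimal sketch of the idea, assuming Flask: abort the current request with a
# JSON error body. The 403 above deliberately doesn't reveal whether the
# project exists. The real helper may do more (e.g. draining the request body):
from flask import abort, jsonify, make_response

INVALID_ACCESS_MSG = "Invalid access token or project doesn't exist"  # assumed text


def safe_fail_request(status_code, message):
    """Stop handling the current request and return a JSON error response."""
    abort(make_response(jsonify(message=message), status_code))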
def create_comparison_jobs(project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    with DBConn() as conn:
        dp_ids = get_dataprovider_ids(conn, project_id)
        assert len(dp_ids) >= 2, "Expected at least 2 data providers"

        log.info(f"Starting comparison of CLKs from data provider ids: "
                 f"{', '.join(map(str, dp_ids))}")
        current_span = create_comparison_jobs.span

        if not check_project_exists(conn, project_id) or \
                not check_run_exists(conn, project_id, run_id):
            log.info("Skipping as project or run not found in database.")
            return

        run_info = get_run(conn, run_id)
        threshold = run_info['threshold']

        dataset_sizes = get_project_dataset_sizes(conn, project_id)
        if len(dataset_sizes) < 2:
            log.warning("Unexpected number of dataset sizes in db. Stopping")
            update_run_mark_failure(conn, run_id)
            return

        encoding_size = get_project_encoding_size(conn, project_id)

        log.info(f"Computing similarity for "
                 f"{' x '.join(map(str, dataset_sizes))} entities")
        current_span.log_kv({"event": 'get-dataset-sizes'})

        filters_object_filenames = tuple(
            get_filter_metadata(conn, dp_id) for dp_id in dp_ids)
        current_span.log_kv({"event": 'get-metadata'})

        log.debug("Chunking computation task")

        chunk_infos = tuple(anonlink.concurrency.split_to_chunks(
            Config.CHUNK_SIZE_AIM, dataset_sizes=dataset_sizes))

        # Save filenames with chunk information.
        for chunk_info in chunk_infos:
            for chunk_dp_info in chunk_info:
                chunk_dp_index = chunk_dp_info['datasetIndex']
                chunk_dp_store_filename = filters_object_filenames[chunk_dp_index]
                chunk_dp_info['storeFilename'] = chunk_dp_store_filename

        log.info(f"Chunking into {len(chunk_infos)} computation tasks")
        current_span.log_kv({"event": "chunking", 'num_chunks': len(chunk_infos)})
        span_serialized = create_comparison_jobs.get_serialized_span()

        # Prepare the Celery Chord that will compute all the similarity scores:
        scoring_tasks = [compute_filter_similarity.si(chunk_info,
                                                      project_id,
                                                      run_id,
                                                      threshold,
                                                      encoding_size,
                                                      span_serialized)
                         for chunk_info in chunk_infos]

        if len(scoring_tasks) == 1:
            scoring_tasks.append(celery_bug_fix.si())

        callback_task = aggregate_comparisons.s(
            project_id, run_id, parent_span=span_serialized
        ).on_error(on_chord_error.s(run_id=run_id))
        future = chord(scoring_tasks)(callback_task)
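# For reference, anonlink.concurrency.split_to_chunks yields one entry per
# chunk, each entry holding a per-dataset dict with a 'datasetIndex' and the
# row 'range' covered -- which is why the loop above can attach a
# 'storeFilename' to each half. A small illustrative example (the exact
# chunk boundaries depend on anonlink's splitting heuristic):
import anonlink.concurrency

chunks = tuple(anonlink.concurrency.split_to_chunks(
    4,                        # aim for ~4 comparisons per chunk
    dataset_sizes=(4, 2)))    # two tiny datasets
for chunk in chunks:
    print(chunk)
# Each printed chunk looks roughly like:
# ({'datasetIndex': 0, 'range': (0, 2)}, {'datasetIndex': 1, 'range': (0, 2)})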
def compute_filter_similarity(chunk_info, project_id, run_id, threshold,
                              encoding_size, parent_span=None):
    """Compute filter similarity between a chunk of filters in dataprovider 1,
    and a chunk of filters in dataprovider 2.

    :param chunk_info: Chunk info returned by
        ``anonlink.concurrency.split_to_chunks``. Additionally,
        "storeFilename" is added to each dataset chunk.
    :param project_id:
    :param run_id:
    :param threshold:
    :param encoding_size: The size in bytes of each encoded entry
    :param parent_span: A serialized opentracing span context.
    :returns: A 3-tuple: (num_results, file_size, results_filename_in_object_store)
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Computing similarity for a chunk of filters")
    span = compute_filter_similarity.span
    log.debug("Checking that the resource exists (in case of job being canceled)")
    with DBConn() as db:
        if not check_project_exists(db, project_id) or \
                not check_run_exists(db, project_id, run_id):
            log.info("Failing task as project or run not found in database.")
            raise DBResourceMissing("project or run not found in database")

    chunk_info_dp1, chunk_info_dp2 = chunk_info

    t0 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 1")
    chunk_dp1, chunk_dp1_size = get_chunk_from_object_store(chunk_info_dp1, encoding_size)

    t1 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 2")
    chunk_dp2, chunk_dp2_size = get_chunk_from_object_store(chunk_info_dp2, encoding_size)
    t2 = time.time()
    span.log_kv({'event': 'chunks are fetched and deserialized'})

    log.debug("Calculating filter similarity")
    span.log_kv({'size1': chunk_dp1_size, 'size2': chunk_dp2_size})
    chunk_results = anonlink.concurrency.process_chunk(
        chunk_info,
        (chunk_dp1, chunk_dp2),
        anonlink.similarities.dice_coefficient_accelerated,
        threshold,
        k=min(chunk_dp1_size, chunk_dp2_size))
    t3 = time.time()
    span.log_kv({'event': 'similarities calculated'})

    # Update the number of comparisons completed
    comparisons_computed = chunk_dp1_size * chunk_dp2_size
    save_current_progress(comparisons_computed, run_id)
    t4 = time.time()

    sims, _, _ = chunk_results
    num_results = len(sims)

    if num_results:
        result_filename = Config.SIMILARITY_SCORES_FILENAME_FMT.format(
            generate_code(12))
        log.info("Writing {} intermediate results to file: {}".format(
            num_results, result_filename))

        bytes_iter, file_size = \
            anonlink.serialization.dump_candidate_pairs_iter(chunk_results)
        iter_stream = iterable_to_stream(bytes_iter)

        mc = connect_to_object_store()
        try:
            mc.put_object(Config.MINIO_BUCKET, result_filename,
                          iter_stream, file_size)
        except minio.ResponseError:
            log.warning("Failed to store result in minio")
            raise
    else:
        result_filename = None
        file_size = None
    t5 = time.time()

    log.info("run={} Comparisons: {}, Links above threshold: {}".format(
        run_id, comparisons_computed, num_results))
    log.info("Prep: {:.3f} + {:.3f}, Solve: {:.3f}, Progress: {:.3f}, "
             "Save: {:.3f}, Total: {:.3f}".format(
                 t1 - t0, t2 - t1, t3 - t2, t4 - t3, t5 - t4, t5 - t0))
    return num_results, file_size, result_filename
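# iterable_to_stream is used above to feed the serialized candidate pairs to
# minio's put_object without materialising them in memory, but its definition
# isn't part of this snippet. A common implementation wraps the iterator in a
# readable file-like object -- a sketch, not necessarily the project's version:
import io


def iterable_to_stream(iterable, buffer_size=io.DEFAULT_BUFFER_SIZE):
    """Expose an iterable of bytes chunks as a readable, buffered stream."""
    iterator = iter(iterable)

    class IterStream(io.RawIOBase):
        leftover = b''

        def readable(self):
            return True

        def readinto(self, buffer):
            try:
                chunk = self.leftover or next(iterator)
            except StopIteration:
                return 0  # signal EOF
            # Copy what fits into the caller's buffer, keep the remainder.
            output, self.leftover = chunk[:len(buffer)], chunk[len(buffer):]
            buffer[:len(output)] = output
            return len(output)

    return io.BufferedReader(IterStream(), buffer_size=buffer_size)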
def create_comparison_jobs(project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    with DBConn() as conn:
        dp_ids = get_dataprovider_ids(conn, project_id)
        # The rest of this task unpacks exactly two dataset sizes, so require
        # exactly two data providers here.
        assert len(dp_ids) == 2, "Expected exactly 2 data providers"

        log.info("Starting comparison of CLKs from data provider ids: {}, {}".format(
            dp_ids[0], dp_ids[1]))
        current_span = create_comparison_jobs.span

        if not check_project_exists(conn, project_id) or \
                not check_run_exists(conn, project_id, run_id):
            log.info("Skipping as project or run not found in database.")
            return

        run_info = get_run(conn, run_id)
        threshold = run_info['threshold']

        dataset_sizes = get_project_dataset_sizes(conn, project_id)
        if len(dataset_sizes) < 2:
            log.warning("Unexpected number of dataset sizes in db. Stopping")
            update_run_mark_failure(conn, run_id)
            return
        else:
            lenf1, lenf2 = dataset_sizes

        encoding_size = get_project_encoding_size(conn, project_id)
        size = lenf1 * lenf2

        log.info("Computing similarity for {} x {} entities".format(lenf1, lenf2))
        current_span.log_kv({"event": 'get-dataset-sizes'})

        filters1_object_filename = get_filter_metadata(conn, dp_ids[0])
        filters2_object_filename = get_filter_metadata(conn, dp_ids[1])
        current_span.log_kv({"event": 'get-metadata'})

        log.debug("Chunking computation task")
        chunk_size = Config.get_task_chunk_size(size, threshold)
        if chunk_size is None:
            chunk_size = max(lenf1, lenf2)
        log.info("Chunks will contain {} entities per task".format(chunk_size))
        update_run_chunk(conn, project_id, chunk_size)

        job_chunks = []

        dp1_chunks = []
        dp2_chunks = []

        for chunk_start_index_dp1 in range(0, lenf1, chunk_size):
            dp1_chunks.append(
                (filters1_object_filename,
                 chunk_start_index_dp1,
                 min(chunk_start_index_dp1 + chunk_size, lenf1))
            )
        for chunk_start_index_dp2 in range(0, lenf2, chunk_size):
            dp2_chunks.append(
                (filters2_object_filename,
                 chunk_start_index_dp2,
                 min(chunk_start_index_dp2 + chunk_size, lenf2))
            )

        # Every chunk in dp1 has to be run against every chunk in dp2
        for dp1_chunk in dp1_chunks:
            for dp2_chunk in dp2_chunks:
                job_chunks.append((dp1_chunk, dp2_chunk))

        log.info("Chunking into {} computation tasks each with (at most) {} entities.".format(
            len(job_chunks), chunk_size))
        current_span.log_kv({"event": "chunking",
                             "chunksize": chunk_size,
                             'num_chunks': len(job_chunks)})
        span_serialized = create_comparison_jobs.get_serialized_span()

        # Prepare the Celery Chord that will compute all the similarity scores:
        scoring_tasks = [compute_filter_similarity.si(chunk_dp1,
                                                      chunk_dp2,
                                                      project_id,
                                                      run_id,
                                                      threshold,
                                                      encoding_size,
                                                      span_serialized)
                         for chunk_dp1, chunk_dp2 in job_chunks]

        if len(scoring_tasks) == 1:
            scoring_tasks.append(celery_bug_fix.si())

        callback_task = aggregate_comparisons.s(
            project_id, run_id, parent_span=span_serialized
        ).on_error(on_chord_error.s(run_id=run_id))
        future = chord(scoring_tasks)(callback_task)
def compute_filter_similarity(chunk_info_dp1, chunk_info_dp2, project_id,
                              run_id, threshold, encoding_size,
                              parent_span=None):
    """Compute filter similarity between a chunk of filters in dataprovider 1,
    and a chunk of filters in dataprovider 2.

    :param chunk_info_dp1: A tuple containing:
        - object store filename
        - chunk start index
        - chunk stop index
    :param chunk_info_dp2: As above, for dataprovider 2.
    :param project_id:
    :param run_id:
    :param threshold:
    :param encoding_size: The size in bytes of each encoded entry
    :param parent_span: A serialized opentracing span context.
    :returns: A 2-tuple: (num_results, result_filename)
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Computing similarity for a chunk of filters")
    span = compute_filter_similarity.span
    log.debug("Checking that the resource exists (in case of job being canceled)")
    with DBConn() as db:
        if not check_project_exists(db, project_id) or \
                not check_run_exists(db, project_id, run_id):
            log.info("Stopping as project or run not found in database.")
            return None

    t0 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 1")
    chunk_dp1, chunk_dp1_size = get_chunk_from_object_store(chunk_info_dp1, encoding_size)

    t1 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 2")
    chunk_dp2, chunk_dp2_size = get_chunk_from_object_store(chunk_info_dp2, encoding_size)
    t2 = time.time()
    span.log_kv({'event': 'chunks are fetched and deserialized'})

    log.debug("Calculating filter similarity")
    span.log_kv({'size1': chunk_dp1_size, 'size2': chunk_dp2_size})
    chunk_results = anonlink.entitymatch.calculate_filter_similarity(
        chunk_dp1, chunk_dp2,
        threshold=threshold,
        k=min(chunk_dp1_size, chunk_dp2_size),
        use_python=False)
    t3 = time.time()
    span.log_kv({'event': 'similarities calculated'})

    # Update the number of comparisons completed
    comparisons_computed = chunk_dp1_size * chunk_dp2_size
    save_current_progress(comparisons_computed, run_id)
    t4 = time.time()

    partial_sparse_result = []
    # Offset the chunk-local indices into the full dataset's indices.
    offset_dp1 = chunk_info_dp1[1]
    offset_dp2 = chunk_info_dp2[1]
    log.debug("Offset DP1 by: {}, DP2 by: {}".format(offset_dp1, offset_dp2))
    for ia, score, ib in chunk_results:
        partial_sparse_result.append((ia + offset_dp1, ib + offset_dp2, score))
    t5 = time.time()

    num_results = len(partial_sparse_result)
    if num_results > 0:
        result_filename = 'chunk-res-{}.csv'.format(generate_code(12))
        log.info("Writing {} intermediate results to file: {}".format(
            num_results, result_filename))

        with open(result_filename, 'wt') as f:
            csvwriter = csv.writer(f)
            csvwriter.writerows(partial_sparse_result)

        # Now write the results to the object store and return the filename
        # and summary. A csv file will do for now.
        mc = connect_to_object_store()
        try:
            mc.fput_object(Config.MINIO_BUCKET, result_filename, result_filename)
        except minio.ResponseError:
            log.warning("Failed to store result in minio")
            raise

        # If we don't delete the local file we *do* run out of space
        os.remove(result_filename)
    else:
        result_filename = None
    t6 = time.time()

    log.info("run={} Comparisons: {}, Links above threshold: {}".format(
        run_id, comparisons_computed, num_results))
    log.info("Prep: {:.3f} + {:.3f}, Solve: {:.3f}, Progress: {:.3f}, "
             "Offset: {:.3f}, Save: {:.3f}, Total: {:.3f}".format(
                 t1 - t0, t2 - t1, t3 - t2, t4 - t3, t5 - t4, t6 - t5, t6 - t0))
    return num_results, result_filename
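# The offset step above translates chunk-local match indices into global
# dataset indices: a pair matched at (ia, ib) within chunks starting at rows
# (offset_dp1, offset_dp2) sits at (ia + offset_dp1, ib + offset_dp2) overall.
# A tiny self-contained illustration with made-up values:
chunk_results = [(0, 0.91, 3), (2, 0.87, 1)]   # (ia, score, ib), chunk-local
offset_dp1, offset_dp2 = 1000, 500             # chunk start rows in each dataset

sparse = [(ia + offset_dp1, ib + offset_dp2, score)
          for ia, score, ib in chunk_results]
print(sparse)  # [(1000, 503, 0.91), (1002, 501, 0.87)]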