def prerun_check(project_id, run_id, parent_span=None): log = logger.bind(pid=project_id, run_id=run_id) log.debug("Sanity check that we need to compute run") with DBConn() as conn: if not check_project_exists(conn, project_id): log.debug("Project not found. Skipping") raise ProjectDeleted(project_id) res = get_run(conn, run_id) if res is None: log.debug(f"Run not found. Skipping") raise RunDeleted(run_id) try: state = get_run_state_for_update(conn, run_id) except psycopg2.OperationalError: log.warning("Run started in another task. Skipping this race.") return if state in {'running', 'completed', 'error'}: log.warning("Run already started. Skipping") return log.debug("Setting run as in progress") update_run_set_started(conn, run_id) log.debug("Getting dp ids for compute similarity task") dp_ids = get_dataprovider_ids(conn, project_id) log.debug("Data providers: {}".format(dp_ids)) create_comparison_jobs.delay(project_id, run_id, prerun_check.get_serialized_span()) log.info("CLK similarity computation scheduled")
def create_comparison_jobs(project_id, run_id, parent_span=None): """Schedule all the entity comparisons as sub tasks for a run. At a high level this task: - checks if the project and run have been deleted and if so aborts. - retrieves metadata: the number and size of the datasets, the encoding size, and the number and size of blocks. - splits the work into independent "chunks" and schedules them to run in celery - schedules the follow up task to run after all the comparisons have been computed. """ log = logger.bind(pid=project_id, run_id=run_id) current_span = create_comparison_jobs.span with DBConn() as conn: check_run_active(conn, project_id, run_id) dp_ids = get_dataprovider_ids(conn, project_id) number_of_datasets = len(dp_ids) assert number_of_datasets >= 2, "Expected at least 2 data providers" log.info(f"Scheduling comparison of CLKs from data provider ids: " f"{', '.join(map(str, dp_ids))}") # Retrieve required metadata dataset_sizes, dp_block_sizes = _retrieve_blocked_dataset_sizes( conn, project_id, dp_ids) log.info("Finding blocks in common between dataproviders") common_blocks = _get_common_blocks(dp_block_sizes, dp_ids) # We pass the encoding_size and threshold to the comparison tasks to minimize their db lookups encoding_size = get_project_encoding_size(conn, project_id) threshold = get_run(conn, run_id)['threshold'] log.debug("Chunking computation task") # Create "chunks" of comparisons chunks = _create_work_chunks(common_blocks, dp_block_sizes, dp_ids, log) log.info(f"Chunking into {len(chunks)} computation tasks") current_span.log_kv({ "event": "chunking", 'num_chunks': len(chunks), 'dataset-sizes': dataset_sizes }) span_serialized = create_comparison_jobs.get_serialized_span() # Prepare the Celery Chord that will compute all the similarity scores: scoring_tasks = [ compute_filter_similarity.si(chunk_info, project_id, run_id, threshold, encoding_size, span_serialized) for chunk_info in chunks ] if len(scoring_tasks) == 1: scoring_tasks.append(celery_bug_fix.si()) callback_task = aggregate_comparisons.s( project_id=project_id, run_id=run_id, parent_span=span_serialized).on_error( run_failed_handler.s(run_id=run_id)) log.info(f"Scheduling comparison tasks") future = chord(scoring_tasks)(callback_task)
def get(project_id, run_id): log = logger.bind(pid=project_id, rid=run_id) log.info("request description of a run") authorize_run_detail(project_id, run_id) log.debug("request for run description authorized") with db.DBConn() as conn: log.debug("Retrieving run description from database") run_object = db.get_run(conn, run_id) return RunDescription().dump(run_object)
def get(project_id, run_id): log = logger.bind(pid=project_id, rid=run_id) logger.info("request description of a run") # Check the project and run resources exist abort_if_run_doesnt_exist(project_id, run_id) # Check the caller has a valid results token. Yes it should be renamed. abort_if_invalid_results_token(project_id, request.headers.get('Authorization')) log.info("request for run description authorized") db_conn = db.get_db() run_object = db.get_run(db_conn, run_id) return RunDescription().dump(run_object)
def prerun_check(project_id, run_id, parent_span=None): log = logger.bind(pid=project_id, run_id=run_id) log.debug("Sanity check that we need to compute run") # being very defensive here checking if the run state is already in the redis cache if not is_run_missing(run_id): log.warning( "unexpectedly the run state is present in redis before starting") return with DBConn() as conn: if not check_project_exists(conn, project_id): log.debug("Project not found. Skipping") raise ProjectDeleted(project_id) res = get_run(conn, run_id) if res is None: log.debug(f"Run not found. Skipping") raise RunDeleted(run_id) try: db_state = get_run_state_for_update(conn, run_id) except psycopg2.OperationalError: log.warning("Run started in another task. Skipping this race.") return if db_state in {'running', 'completed', 'error'}: log.warning("Run already started. Skipping") return log.debug("Setting run state in db as 'running'") update_run_set_started(conn, run_id) log.debug("Updating redis cache for run") set_run_state_active(run_id) create_comparison_jobs.apply_async(kwargs={ 'project_id': project_id, 'run_id': run_id, 'parent_span': prerun_check.get_serialized_span() }, link_error=run_failed_handler.s()) log.info("CLK similarity computation scheduled")
def create_comparison_jobs(project_id, run_id, parent_span=None): log = logger.bind(pid=project_id, run_id=run_id) with DBConn() as conn: dp_ids = get_dataprovider_ids(conn, project_id) assert len(dp_ids) >= 2, "Expected at least 2 data providers" log.info(f"Starting comparison of CLKs from data provider ids: " f"{', '.join(map(str, dp_ids))}") current_span = create_comparison_jobs.span if not check_project_exists(conn, project_id) or not check_run_exists( conn, project_id, run_id): log.info("Skipping as project or run not found in database.") return run_info = get_run(conn, run_id) threshold = run_info['threshold'] dataset_sizes = get_project_dataset_sizes(conn, project_id) if len(dataset_sizes) < 2: log.warning("Unexpected number of dataset sizes in db. Stopping") update_run_mark_failure(conn, run_id) return encoding_size = get_project_encoding_size(conn, project_id) log.info(f"Computing similarity for " f"{' x '.join(map(str, dataset_sizes))} entities") current_span.log_kv({"event": 'get-dataset-sizes'}) filters_object_filenames = tuple( get_filter_metadata(conn, dp_id) for dp_id in dp_ids) current_span.log_kv({"event": 'get-metadata'}) log.debug("Chunking computation task") chunk_infos = tuple( anonlink.concurrency.split_to_chunks(Config.CHUNK_SIZE_AIM, dataset_sizes=dataset_sizes)) # Save filenames with chunk information. for chunk_info in chunk_infos: for chunk_dp_info in chunk_info: chunk_dp_index = chunk_dp_info['datasetIndex'] chunk_dp_store_filename = filters_object_filenames[chunk_dp_index] chunk_dp_info['storeFilename'] = chunk_dp_store_filename log.info(f"Chunking into {len(chunk_infos)} computation tasks") current_span.log_kv({"event": "chunking", 'num_chunks': len(chunk_infos)}) span_serialized = create_comparison_jobs.get_serialized_span() # Prepare the Celery Chord that will compute all the similarity scores: scoring_tasks = [ compute_filter_similarity.si(chunk_info, project_id, run_id, threshold, encoding_size, span_serialized) for chunk_info in chunk_infos ] if len(scoring_tasks) == 1: scoring_tasks.append(celery_bug_fix.si()) callback_task = aggregate_comparisons.s( project_id, run_id, parent_span=span_serialized).on_error(on_chord_error.s(run_id=run_id)) future = chord(scoring_tasks)(callback_task)
def create_comparison_jobs(project_id, run_id, parent_span=None): log = logger.bind(pid=project_id, run_id=run_id) with DBConn() as conn: dp_ids = get_dataprovider_ids(conn, project_id) assert len(dp_ids) >= 2, "Expected at least 2 data providers" log.info("Starting comparison of CLKs from data provider ids: {}, {}".format(dp_ids[0], dp_ids[1])) current_span = create_comparison_jobs.span if not check_project_exists(conn, project_id) or not check_run_exists(conn, project_id, run_id): log.info("Skipping as project or run not found in database.") return run_info = get_run(conn, run_id) threshold = run_info['threshold'] dataset_sizes = get_project_dataset_sizes(conn, project_id) if len(dataset_sizes) < 2: log.warning("Unexpected number of dataset sizes in db. Stopping") update_run_mark_failure(conn, run_id) return else: lenf1, lenf2 = dataset_sizes encoding_size = get_project_encoding_size(conn, project_id) size = lenf1 * lenf2 log.info("Computing similarity for {} x {} entities".format(lenf1, lenf2)) current_span.log_kv({"event": 'get-dataset-sizes'}) filters1_object_filename = get_filter_metadata(conn, dp_ids[0]) filters2_object_filename = get_filter_metadata(conn, dp_ids[1]) current_span.log_kv({"event": 'get-metadata'}) log.debug("Chunking computation task") chunk_size = Config.get_task_chunk_size(size, threshold) if chunk_size is None: chunk_size = max(lenf1, lenf2) log.info("Chunks will contain {} entities per task".format(chunk_size)) update_run_chunk(conn, project_id, chunk_size) job_chunks = [] dp1_chunks = [] dp2_chunks = [] for chunk_start_index_dp1 in range(0, lenf1, chunk_size): dp1_chunks.append( (filters1_object_filename, chunk_start_index_dp1, min(chunk_start_index_dp1 + chunk_size, lenf1)) ) for chunk_start_index_dp2 in range(0, lenf2, chunk_size): dp2_chunks.append( (filters2_object_filename, chunk_start_index_dp2, min(chunk_start_index_dp2 + chunk_size, lenf2)) ) # Every chunk in dp1 has to be run against every chunk in dp2 for dp1_chunk in dp1_chunks: for dp2_chunk in dp2_chunks: job_chunks.append((dp1_chunk, dp2_chunk, )) log.info("Chunking into {} computation tasks each with (at most) {} entities.".format( len(job_chunks), chunk_size)) current_span.log_kv({"event": "chunking", "chunksize": chunk_size, 'num_chunks': len(job_chunks)}) span_serialized = create_comparison_jobs.get_serialized_span() # Prepare the Celery Chord that will compute all the similarity scores: scoring_tasks = [compute_filter_similarity.si( chunk_dp1, chunk_dp2, project_id, run_id, threshold, encoding_size, span_serialized ) for chunk_dp1, chunk_dp2 in job_chunks] if len(scoring_tasks) == 1: scoring_tasks.append(celery_bug_fix.si()) callback_task = aggregate_comparisons.s(project_id, run_id, parent_span=span_serialized).on_error( on_chord_error.s(run_id=run_id)) future = chord(scoring_tasks)(callback_task)