def save_and_permute(similarity_result, project_id, run_id, parent_span):
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Saving and possibly permuting data")
    mapping = similarity_result['mapping']

    # Note Postgres requires JSON object keys to be strings
    # Celery actually converts the json arguments in the same way
    with DBConn() as db:
        result_type = get_project_column(db, project_id, 'result_type')

        # Just save the raw "mapping"
        log.debug("Saving the resulting map data to the db")
        result_id = insert_mapping_result(db, run_id, mapping)
        dp_ids = get_dataprovider_ids(db, project_id)

    log.info("Mapping result saved to db with result id {}".format(result_id))

    if result_type == "permutations":
        log.debug("Submitting job to permute mapping")
        permute_mapping_data.apply_async(
            (project_id,
             run_id,
             similarity_result['lenf1'],
             similarity_result['lenf2'],
             save_and_permute.get_serialized_span()))
    else:
        log.debug("Mark mapping job as complete")
        mark_run_complete.delay(run_id, save_and_permute.get_serialized_span())

    # Post similarity computation cleanup
    log.debug("Removing clk filters from redis cache")
    for dp_id in dp_ids:
        cache.remove_from_cache(dp_id)
    calculate_comparison_rate.delay()


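# Aside on the "JSON object keys" note above: the stdlib json module (and, as
# the comment says, Celery's JSON argument serialization) turns integer dict
# keys into strings, which is why permute_mapping_data below rebuilds the
# stored mapping with int(). A minimal, self-contained illustration (the
# helper name here is ours, purely for demonstration):
def _json_int_key_illustration():
    import json
    original = {0: 3, 1: 7}
    round_tripped = json.loads(json.dumps(original))
    assert round_tripped == {"0": 3, "1": 7}  # keys became strings, values did not
    assert {int(k): int(v) for k, v in round_tripped.items()} == original

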
def aggregate_comparisons(similarity_result_files, project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    if similarity_result_files is None:
        return
    mc = connect_to_object_store()
    files = []
    data_size = 0

    for num, filename in similarity_result_files:
        if num > 0:
            files.append(filename)
            data_size += mc.stat_object(Config.MINIO_BUCKET, filename).size

    log.debug("Aggregating result chunks from {} files, total size: {}".format(
        len(files), fmt_bytes(data_size)))

    result_file_stream_generator = (mc.get_object(Config.MINIO_BUCKET, result_filename)
                                    for result_filename in files)

    log.info("Similarity score results are {}".format(fmt_bytes(data_size)))
    result_stream = chain_streams(result_file_stream_generator)

    with DBConn() as db:
        result_type = get_project_column(db, project_id, 'result_type')

        # Note: Storing the similarity scores for all result types
        result_filename = store_similarity_scores(result_stream, run_id, data_size, db)

        if result_type == "similarity_scores":
            # Post similarity computation cleanup
            dp_ids = get_dataprovider_ids(db, project_id)
        else:
            # we promote the run to the next stage
            progress_stage(db, run_id)
            lenf1, lenf2 = get_project_dataset_sizes(db, project_id)

    # DB now committed, we can fire off tasks that depend on the new db state
    if result_type == "similarity_scores":
        log.info("Deleting intermediate similarity score files from object store")
        mc.remove_objects(Config.MINIO_BUCKET, files)
        log.debug("Removing clk filters from redis cache")
        remove_from_cache(dp_ids[0])
        remove_from_cache(dp_ids[1])

        # Complete the run
        log.info("Marking run as complete")
        mark_run_complete.delay(run_id, aggregate_comparisons.get_serialized_span())
    else:
        solver_task.delay(result_filename, project_id, run_id, lenf1, lenf2,
                          aggregate_comparisons.get_serialized_span())


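# `chain_streams` above is imported from elsewhere in the project and is not
# shown here. A plausible sketch of such a helper (an assumption, not
# necessarily the real implementation) exposes a sequence of readable
# file-like objects - e.g. the MinIO responses from the generator above - as
# one buffered stream, so store_similarity_scores can consume the concatenated
# chunks without holding them all in memory:
import io


def chain_streams_sketch(streams, buffer_size=io.DEFAULT_BUFFER_SIZE):
    """Expose several readable file-like objects as one read-only stream."""

    class _ChainStream(io.RawIOBase):
        def __init__(self):
            self.leftover = b""
            self.stream_iter = iter(streams)
            self.current = None

        def readable(self):
            return True

        def _next_chunk(self, max_length):
            # Serve bytes left over from a previous oversized read first.
            if self.leftover:
                chunk, self.leftover = self.leftover, b""
                return chunk
            while True:
                if self.current is None:
                    try:
                        self.current = next(self.stream_iter)
                    except StopIteration:
                        return b""  # every underlying stream is exhausted
                data = self.current.read(max_length)
                if data:
                    return data
                self.current = None  # this stream is done; move to the next

        def readinto(self, b):
            chunk = self._next_chunk(len(b))
            if not chunk:
                return 0
            output, self.leftover = chunk[:len(b)], chunk[len(b):]
            b[:len(output)] = output
            return len(output)

    return io.BufferedReader(_ChainStream(), buffer_size=buffer_size)

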
def save_and_permute(similarity_result, project_id, run_id, parent_span):
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Saving and possibly permuting data")
    groups = similarity_result['groups']

    # Note Postgres requires JSON object keys to be strings
    # Celery actually converts the json arguments in the same way
    with DBConn() as db:
        result_type = get_project_column(db, project_id, 'result_type')
        if result_type == "groups":
            # Save the raw groups
            log.debug("Saving the groups in the DB")
            result_id = insert_mapping_result(db, run_id, groups)
        else:
            # Turn groups into a mapping and save that
            log.debug("Turning groups into mapping")
            mapping = groups_to_mapping(groups)
            log.debug("Saving mapping in the DB")
            result_id = insert_mapping_result(db, run_id, mapping)
        dp_ids = get_dataprovider_ids(db, project_id)

    log.info("Result saved to db with result id {}".format(result_id))

    if result_type == "permutations":
        log.debug("Submitting job to permute mapping")
        dataset0_size, dataset1_size = similarity_result['datasetSizes']
        permute_mapping_data.apply_async(
            (project_id,
             run_id,
             dataset0_size,
             dataset1_size,
             save_and_permute.get_serialized_span()))
    else:
        log.debug("Mark job as complete")
        mark_run_complete.delay(run_id, save_and_permute.get_serialized_span())

    # Post similarity computation cleanup
    log.debug("Removing clk filters from redis cache")
    for dp_id in dp_ids:
        cache.remove_from_cache(dp_id)
    calculate_comparison_rate.delay()


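# `groups_to_mapping` is also defined elsewhere. Assuming - and this is an
# assumption about the data shape, not something stated in this module - that
# each group is an iterable of (dataset_index, record_index) pairs, a rough
# sketch of the two-party conversion could look like:
def groups_to_mapping_sketch(groups):
    """Collapse two-party groups into a {str: str} record-index mapping."""
    mapping = {}
    for group in groups:
        if len(group) != 2:
            continue  # only clean one-to-one pairs become a mapping entry
        by_dataset = dict(group)  # {dataset_index: record_index}
        if set(by_dataset) == {0, 1}:
            # String keys/values, in line with the JSON note above.
            mapping[str(by_dataset[0])] = str(by_dataset[1])
    return mapping


# e.g. groups_to_mapping_sketch([[(0, 4), (1, 7)], [(0, 9), (1, 2)]])
#      -> {'4': '7', '9': '2'}

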
def aggregate_comparisons(similarity_result_files, project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    if similarity_result_files is None:
        raise TypeError("Inappropriate argument type - missing results files.")

    files = []
    for res in similarity_result_files:
        if res is None:
            log.warning("Missing results during aggregation. Stopping processing.")
            raise TypeError("Inappropriate argument type - results missing at aggregation step.")
        num, filesize, filename = res
        if num:
            assert filesize is not None
            assert filename is not None
            files.append((num, filesize, filename))
        else:
            assert filesize is None
            assert filename is None
    heapq.heapify(files)

    log.debug(f"Aggregating result chunks from {len(files)} files, "
              f"total size: {sum(map(operator.itemgetter(1), files))}")

    mc = connect_to_object_store()
    while len(files) > 1:
        file0 = heapq.heappop(files)
        file1 = heapq.heappop(files)
        merged_file = _merge_files(mc, log, file0, file1)
        heapq.heappush(files, merged_file)

    if not files:
        # No results. Let's chuck in an empty file.
        empty_file = _put_placeholder_empty_file(mc, log)
        files.append(empty_file)

    (merged_num, merged_filesize, merged_filename), = files
    log.info(f"Similarity score results in {merged_filename} in bucket "
             f"{Config.MINIO_BUCKET} take up {merged_filesize} bytes.")

    with DBConn() as db:
        result_type = get_project_column(db, project_id, 'result_type')
        result_id = insert_similarity_score_file(db, run_id, merged_filename)
        log.debug(f"Saved path to similarity scores file to db with id {result_id}")

        if result_type == "similarity_scores":
            # Post similarity computation cleanup
            dp_ids = get_dataprovider_ids(db, project_id)
        else:
            # we promote the run to the next stage
            progress_stage(db, run_id)
            dataset_sizes = get_project_dataset_sizes(db, project_id)

    # DB now committed, we can fire off tasks that depend on the new db state
    if result_type == "similarity_scores":
        log.debug("Removing clk filters from redis cache")
        for dp_id in dp_ids:
            remove_from_cache(dp_id)

        # Complete the run
        log.info("Marking run as complete")
        mark_run_complete.delay(run_id, aggregate_comparisons.get_serialized_span())
    else:
        solver_task.delay(merged_filename, project_id, run_id, dataset_sizes,
                          aggregate_comparisons.get_serialized_span())


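# `_merge_files` and `_put_placeholder_empty_file` work on the serialized
# result files in the object store and are defined elsewhere. The merge loop
# above is worth calling out: the heap orders the (num, filesize, filename)
# tuples by their first element, so the two chunks with the fewest results are
# always merged first, which resembles the classic optimal-merge pattern and
# keeps the total volume of data re-read and re-written small. A
# self-contained illustration of that scheduling, with in-memory sorted lists
# standing in for the files:
import heapq


def pairwise_heap_merge_sketch(chunks):
    """Repeatedly merge the two smallest sorted lists until one remains."""
    heap = [(len(chunk), i, chunk) for i, chunk in enumerate(chunks)]
    heapq.heapify(heap)
    tie_breaker = len(heap)  # keeps tuple comparison away from the lists
    while len(heap) > 1:
        n0, _, c0 = heapq.heappop(heap)
        n1, _, c1 = heapq.heappop(heap)
        heapq.heappush(heap, (n0 + n1, tie_breaker, list(heapq.merge(c0, c1))))
        tie_breaker += 1
    return heap[0][2] if heap else []


# pairwise_heap_merge_sketch([[1, 4], [2, 3, 9], [5]]) -> [1, 2, 3, 4, 5, 9]

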
def permute_mapping_data(project_id, run_id, len_filters1, len_filters2, parent_span):
    """
    Task which will create a permutation after a mapping has been completed.

    :param project_id: The project resource id
    :param run_id: The run id
    :param len_filters1: Number of entities in dataset A
    :param len_filters2: Number of entities in dataset B

    """
    log = logger.bind(pid=project_id, run_id=run_id)

    with DBConn() as conn:
        mapping_str = get_run_result(conn, run_id)

        # Convert the stored JSON mapping ({str: str}) back to {int: int}
        mapping = {int(k): int(mapping_str[k]) for k in mapping_str}

        log.info("Creating random permutations")
        log.debug("Entities in dataset A: {}, Entities in dataset B: {}".format(
            len_filters1, len_filters2))

        """
        Pack all the entities that match in the **same** random locations in both permutations.
        Then fill in all the gaps!

        Dictionaries first, then converted to lists.
        """
        smaller_dataset_size = min(len_filters1, len_filters2)
        log.debug("Smaller dataset size is {}".format(smaller_dataset_size))
        number_in_common = len(mapping)
        a_permutation = {}  # Should be length of filters1
        b_permutation = {}  # length of filters2

        # By default mark all rows as NOT included in the mask
        mask = {i: False for i in range(smaller_dataset_size)}

        # start with all the possible indexes
        remaining_new_indexes = list(range(smaller_dataset_size))
        log.info("Shuffling indices for matched entities")
        random.shuffle(remaining_new_indexes)
        log.info("Assigning random indexes for {} matched entities".format(number_in_common))

        for mapping_number, a_index in enumerate(mapping):
            b_index = mapping[a_index]

            # Choose the index in the new mapping (randomly)
            mapping_index = remaining_new_indexes[mapping_number]

            a_permutation[a_index] = mapping_index
            b_permutation[b_index] = mapping_index

            # Mark the row included in the mask
            mask[mapping_index] = True

        remaining_new_indexes = set(remaining_new_indexes[number_in_common:])
        log.info("Randomly adding all non-matched entities")

        # Note the a and b datasets could be of different size.
        # At this point, both still have to use the remaining_new_indexes, and any
        # indexes that go over the number_in_common
        remaining_a_values = list(
            set(range(smaller_dataset_size, len_filters1)).union(remaining_new_indexes))
        remaining_b_values = list(
            set(range(smaller_dataset_size, len_filters2)).union(remaining_new_indexes))

        log.debug("Shuffle the remaining indices")
        random.shuffle(remaining_a_values)
        random.shuffle(remaining_b_values)

        # For every element in a's permutation
        for a_index in range(len_filters1):
            # Check if it is not already present
            if a_index not in a_permutation:
                # This index isn't yet mapped

                # choose and remove a random index from the extended list of those that remain
                # note this "could" be the same row (a NOP 1-1 permutation)
                mapping_index = remaining_a_values.pop()
                a_permutation[a_index] = mapping_index

        # For every eventual element in b's permutation
        for b_index in range(len_filters2):
            # Check if it is not already present
            if b_index not in b_permutation:
                # This index isn't yet mapped

                # choose and remove a random index from the extended list of those that remain
                # note this "could" be the same row (a NOP 1-1 permutation)
                mapping_index = remaining_b_values.pop()
                b_permutation[b_index] = mapping_index

        log.debug("Completed creating new permutations for each party")

        dp_ids = get_dataprovider_ids(conn, project_id)

        for i, permutation in enumerate([a_permutation, b_permutation]):
            # We convert here because celery and dicts with int keys don't play nice
            perm_list = convert_mapping_to_list(permutation)
            log.debug("Saving a permutation")
            insert_permutation(conn, dp_ids[i], run_id, perm_list)

        log.debug("Raw permutation data saved. Now saving raw mask")

        # Convert the mask dict to a list of 0/1 ints
        mask_list = convert_mapping_to_list(
            {int(key): 1 if value else 0 for key, value in mask.items()})

        log.debug("Saving the mask")
        insert_permutation_mask(conn, project_id, run_id, mask_list)
        log.info("Mask saved")
        log.info("Committing database transaction")

    mark_run_complete.delay(run_id, permute_mapping_data.get_serialized_span())


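# `convert_mapping_to_list` is imported from elsewhere. Given how it is used
# above (dicts whose keys are exactly 0..n-1, converted into something Celery
# and JSON are happy with), a plausible sketch - an assumption, not the
# project's implementation - is simply:
def convert_mapping_to_list_sketch(mapping):
    """Return a list l such that l[index] == mapping[index] for keys 0..n-1."""
    return [mapping[index] for index in range(len(mapping))]


# e.g. convert_mapping_to_list_sketch({0: 2, 1: 0, 2: 1}) -> [2, 0, 1]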