Example #1
0
def save_and_permute(similarity_result, project_id, run_id, parent_span):
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Saving and possibly permuting data")
    mapping = similarity_result['mapping']

    # Note: Postgres requires JSON object keys to be strings.
    # Celery serializes its JSON task arguments the same way, so int keys arrive as strings.

    with DBConn() as db:
        result_type = get_project_column(db, project_id, 'result_type')

        # Just save the raw "mapping"
        log.debug("Saving the resulting map data to the db")
        result_id = insert_mapping_result(db, run_id, mapping)
        dp_ids = get_dataprovider_ids(db, project_id)

    log.info("Mapping result saved to db with result id {}".format(result_id))

    if result_type == "permutations":
        log.debug("Submitting job to permute mapping")
        permute_mapping_data.apply_async(
            (project_id, run_id,
             similarity_result['lenf1'], similarity_result['lenf2'],
             save_and_permute.get_serialized_span()))
    else:
        log.debug("Mark mapping job as complete")
        mark_run_complete.delay(run_id, save_and_permute.get_serialized_span())

    # Post similarity computation cleanup
    log.debug("Removing clk filters from redis cache")

    for dp_id in dp_ids:
        cache.remove_from_cache(dp_id)
    calculate_comparison_rate.delay()
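The note about string keys is worth seeing in action: JSON objects cannot have integer keys, so an int-keyed mapping that passes through Celery's JSON serializer or a Postgres json column comes back with string keys, which is why later tasks convert them back with int(). A minimal, self-contained illustration (the sample mapping is made up):

import json

mapping = {0: 2, 1: 0}
round_tripped = json.loads(json.dumps(mapping))
print(round_tripped)   # {'0': 2, '1': 0} -- keys are now strings
restored = {int(k): int(v) for k, v in round_tripped.items()}
print(restored)        # {0: 2, 1: 0}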
Example #2
0
def aggregate_comparisons(similarity_result_files, project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    if similarity_result_files is None:
        return
    mc = connect_to_object_store()
    files = []
    data_size = 0

    for num, filename in similarity_result_files:
        if num > 0:
            files.append(filename)
            data_size += mc.stat_object(Config.MINIO_BUCKET, filename).size

    log.debug("Aggregating result chunks from {} files, total size: {}".format(
        len(files), fmt_bytes(data_size)))

    result_file_stream_generator = (mc.get_object(Config.MINIO_BUCKET, result_filename)
                                    for result_filename in files)

    log.info("Similarity score results are {}".format(fmt_bytes(data_size)))
    result_stream = chain_streams(result_file_stream_generator)

    with DBConn() as db:
        result_type = get_project_column(db, project_id, 'result_type')

        # Note: Storing the similarity scores for all result types
        result_filename = store_similarity_scores(result_stream, run_id, data_size, db)

        if result_type == "similarity_scores":
            # Post similarity computation cleanup
            dp_ids = get_dataprovider_ids(db, project_id)

        else:
            # we promote the run to the next stage
            progress_stage(db, run_id)
            lenf1, lenf2 = get_project_dataset_sizes(db, project_id)

    # DB now committed, we can fire off tasks that depend on the new db state
    if result_type == "similarity_scores":
        log.info("Deleting intermediate similarity score files from object store")
        mc.remove_objects(Config.MINIO_BUCKET, files)
        log.debug("Removing clk filters from redis cache")
        remove_from_cache(dp_ids[0])
        remove_from_cache(dp_ids[1])

        # Complete the run
        log.info("Marking run as complete")
        mark_run_complete.delay(run_id, aggregate_comparisons.get_serialized_span())
    else:
        solver_task.delay(result_filename, project_id, run_id, lenf1, lenf2,
                          aggregate_comparisons.get_serialized_span())
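chain_streams is not defined in this example. A rough sketch, assuming it only needs to present the object-store responses as a single iterable of byte chunks (the chunk size and cleanup details are guesses, not the service's actual helper; the real helper may instead return a file-like object, but the streaming idea is the same):

def chain_streams(streams, chunk_size=4 * 1024 * 1024):
    # Hypothetical sketch: read each object-store response in fixed-size
    # chunks and yield them in order, so the combined result is never
    # held in memory all at once.
    for stream in streams:
        try:
            while True:
                chunk = stream.read(chunk_size)
                if not chunk:
                    break
                yield chunk
        finally:
            stream.close()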
Example #3
0
def save_and_permute(similarity_result, project_id, run_id, parent_span):
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Saving and possibly permuting data")
    groups = similarity_result['groups']

    # Note: Postgres requires JSON object keys to be strings.
    # Celery serializes its JSON task arguments the same way, so int keys arrive as strings.

    with DBConn() as db:
        result_type = get_project_column(db, project_id, 'result_type')

        if result_type == "groups":
            # Save the raw groups
            log.debug("Saving the groups in the DB")
            result_id = insert_mapping_result(db, run_id, groups)
        else:
            # Turn groups into mapping and save that
            log.debug("Turning groups into mapping")
            mapping = groups_to_mapping(groups)
            log.debug("Saving mappuing in the DB")
            result_id = insert_mapping_result(db, run_id, mapping)

        dp_ids = get_dataprovider_ids(db, project_id)

    log.info("Result saved to db with result id {}".format(result_id))

    if result_type == "permutations":
        log.debug("Submitting job to permute mapping")
        dataset0_size, dataset1_size = similarity_result['datasetSizes']
        permute_mapping_data.apply_async(
            (project_id, run_id, dataset0_size, dataset1_size,
             save_and_permute.get_serialized_span()))
    else:
        log.debug("Mark job as complete")
        mark_run_complete.delay(run_id, save_and_permute.get_serialized_span())

    # Post similarity computation cleanup
    log.debug("Removing clk filters from redis cache")

    for dp_id in dp_ids:
        cache.remove_from_cache(dp_id)
    calculate_comparison_rate.delay()
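groups_to_mapping is also not shown. A hedged sketch, assuming each group is an iterable of (dataset_index, record_index) pairs and that only groups linking dataset 0 to dataset 1 contribute mapping entries (that record format is an assumption, not confirmed by the code above):

def groups_to_mapping(groups):
    # Assumed record format: (dataset_index, record_index) pairs.
    # A group containing a record from dataset 0 and a record from
    # dataset 1 becomes one entry of the mapping.
    mapping = {}
    for group in groups:
        by_dataset = {}
        for dataset_index, record_index in group:
            by_dataset.setdefault(dataset_index, record_index)
        if 0 in by_dataset and 1 in by_dataset:
            mapping[by_dataset[0]] = by_dataset[1]
    return mapping

# e.g. groups_to_mapping([[(0, 3), (1, 7)], [(0, 5), (1, 1)]]) == {3: 7, 5: 1}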
Example #4
0
def aggregate_comparisons(similarity_result_files,
                          project_id,
                          run_id,
                          parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    if similarity_result_files is None:
        raise TypeError("Inappropriate argument type - missing results files.")

    files = []
    for res in similarity_result_files:
        if res is None:
            log.warning(
                "Missing results during aggregation. Stopping processing.")
            raise TypeError(
                "Inappropriate argument type - results missing at aggregation step."
            )
        num, filesize, filename = res
        if num:
            assert filesize is not None
            assert filename is not None
            files.append((num, filesize, filename))
        else:
            assert filesize is None
            assert filename is None
    heapq.heapify(files)

    log.debug(f"Aggregating result chunks from {len(files)} files, "
              f"total size: {sum(map(operator.itemgetter(1), files))}")

    mc = connect_to_object_store()
    while len(files) > 1:
        file0 = heapq.heappop(files)
        file1 = heapq.heappop(files)
        merged_file = _merge_files(mc, log, file0, file1)
        heapq.heappush(files, merged_file)

    if not files:
        # No results. Let's chuck in an empty file.
        empty_file = _put_placeholder_empty_file(mc, log)
        files.append(empty_file)

    (merged_num, merged_filesize, merged_filename), = files
    log.info(f"Similarity score results in {merged_filename} in bucket "
             f"{Config.MINIO_BUCKET} take up {merged_filesize} bytes.")

    with DBConn() as db:
        result_type = get_project_column(db, project_id, 'result_type')
        result_id = insert_similarity_score_file(db, run_id, merged_filename)
        log.debug(f"Saved path to similarity scores file to db with id "
                  f"{result_id}")

        if result_type == "similarity_scores":
            # Post similarity computation cleanup
            dp_ids = get_dataprovider_ids(db, project_id)

        else:
            # we promote the run to the next stage
            progress_stage(db, run_id)
            dataset_sizes = get_project_dataset_sizes(db, project_id)

    # DB now committed, we can fire off tasks that depend on the new db state
    if result_type == "similarity_scores":
        log.debug("Removing clk filters from redis cache")
        for dp_id in dp_ids:
            remove_from_cache(dp_id)

        # Complete the run
        log.info("Marking run as complete")
        mark_run_complete.delay(run_id,
                                aggregate_comparisons.get_serialized_span())
    else:
        solver_task.delay(merged_filename, project_id, run_id, dataset_sizes,
                          aggregate_comparisons.get_serialized_span())
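The heap-driven loop above always merges the two chunks with the fewest comparisons first, so large intermediate files are only produced near the end. A toy illustration of that scheduling pattern (file names and counts are made up; the real tuples also carry a file size):

import heapq

chunks = [(120, "chunk-a"), (15, "chunk-b"), (40, "chunk-c"), (5, "chunk-d")]
heapq.heapify(chunks)
while len(chunks) > 1:
    num0, name0 = heapq.heappop(chunks)
    num1, name1 = heapq.heappop(chunks)
    print(f"merging {name0} ({num0}) with {name1} ({num1})")
    heapq.heappush(chunks, (num0 + num1, f"merge({name0},{name1})"))
print(chunks)  # one entry left, describing the fully merged result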
Example #5
0
def permute_mapping_data(project_id, run_id, len_filters1, len_filters2,
                         parent_span):
    """
    Task which will create a permutation after a mapping has been completed.

    :param project_id: The project resource id
    :param run_id: The run id
    :param len_filters1: The number of records in dataset A.
    :param len_filters2: The number of records in dataset B.
    :param parent_span: The serialized parent tracing span.

    """
    log = logger.bind(pid=project_id, run_id=run_id)

    with DBConn() as conn:

        mapping_str = get_run_result(conn, run_id)

        # Convert the JSON string keys and values back to an int -> int mapping
        mapping = {int(k): int(v) for k, v in mapping_str.items()}

        log.info("Creating random permutations")
        log.debug(
            "Entities in dataset A: {}, Entities in dataset B: {}".format(
                len_filters1, len_filters2))
        """
        Pack all the entities that match in the **same** random locations in both permutations.
        Then fill in all the gaps!
    
        Dictionaries first, then converted to lists.
        """
        smaller_dataset_size = min(len_filters1, len_filters2)
        log.debug("Smaller dataset size is {}".format(smaller_dataset_size))
        number_in_common = len(mapping)
        a_permutation = {}  # will have length len_filters1
        b_permutation = {}  # will have length len_filters2

        # By default mark all rows as NOT included in the mask
        mask = {i: False for i in range(smaller_dataset_size)}

        # start with all the possible indexes
        remaining_new_indexes = list(range(smaller_dataset_size))
        log.info("Shuffling indices for matched entities")
        random.shuffle(remaining_new_indexes)
        log.info("Assigning random indexes for {} matched entities".format(
            number_in_common))

        for mapping_number, a_index in enumerate(mapping):
            b_index = mapping[a_index]

            # Choose the index in the new mapping (randomly)
            mapping_index = remaining_new_indexes[mapping_number]

            a_permutation[a_index] = mapping_index
            b_permutation[b_index] = mapping_index

            # Mark the row included in the mask
            mask[mapping_index] = True

        remaining_new_indexes = set(remaining_new_indexes[number_in_common:])
        log.info("Randomly adding all non matched entities")

        # Note the a and b datasets could be of different sizes.
        # At this point both must draw from the unused remaining_new_indexes,
        # plus any indexes beyond the smaller dataset size.
        remaining_a_values = list(
            set(range(smaller_dataset_size,
                      len_filters1)).union(remaining_new_indexes))
        remaining_b_values = list(
            set(range(smaller_dataset_size,
                      len_filters2)).union(remaining_new_indexes))

        log.debug("Shuffle the remaining indices")
        random.shuffle(remaining_a_values)
        random.shuffle(remaining_b_values)

        # For every element in a's permutation
        for a_index in range(len_filters1):
            # Check if it is not already present
            if a_index not in a_permutation:
                # This index isn't yet mapped

                # choose and remove a random index from the extended list of those that remain
                # note this "could" be the same row (a NOP 1-1 permutation)
                mapping_index = remaining_a_values.pop()

                a_permutation[a_index] = mapping_index

        # For every element in b's permutation
        for b_index in range(len_filters2):
            # Check if it is not already present
            if b_index not in b_permutation:
                # This index isn't yet mapped

                # choose and remove a random index from the extended list of those that remain
                # note this "could" be the same row (a NOP 1-1 permutation)
                mapping_index = remaining_b_values.pop()
                b_permutation[b_index] = mapping_index

        log.debug("Completed creating new permutations for each party")

        dp_ids = get_dataprovider_ids(conn, project_id)

        for i, permutation in enumerate([a_permutation, b_permutation]):
            # We convert here because Celery and dicts with int keys don't play nicely

            perm_list = convert_mapping_to_list(permutation)
            log.debug("Saving a permutation")

            insert_permutation(conn, dp_ids[i], run_id, perm_list)

        log.debug("Raw permutation data saved. Now saving raw mask")

        # Convert the mask dict to a list of 0/1 ints
        mask_list = convert_mapping_to_list(
            {int(key): 1 if value else 0
             for key, value in mask.items()})
        log.debug("Saving the mask")
        insert_permutation_mask(conn, project_id, run_id, mask_list)
        log.info("Mask saved")
        log.info("Committing database transaction")

    mark_run_complete.delay(run_id, permute_mapping_data.get_serialized_span())
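To make the permutation and mask construction above concrete, here is a condensed, self-contained restatement of the same steps run on a toy mapping (the function name, seed, and sample sizes are illustrative only):

import random

def build_permutations(mapping, len_a, len_b, seed=42):
    # Condensed restatement of the permutation logic above for a toy input.
    random.seed(seed)
    smaller = min(len_a, len_b)
    mask = {i: False for i in range(smaller)}
    new_indexes = list(range(smaller))
    random.shuffle(new_indexes)

    a_perm, b_perm = {}, {}
    for n, (a_index, b_index) in enumerate(mapping.items()):
        a_perm[a_index] = b_perm[b_index] = new_indexes[n]
        mask[new_indexes[n]] = True

    leftover = set(new_indexes[len(mapping):])
    remaining_a = list(set(range(smaller, len_a)) | leftover)
    remaining_b = list(set(range(smaller, len_b)) | leftover)
    random.shuffle(remaining_a)
    random.shuffle(remaining_b)

    for a_index in range(len_a):
        if a_index not in a_perm:
            a_perm[a_index] = remaining_a.pop()
    for b_index in range(len_b):
        if b_index not in b_perm:
            b_perm[b_index] = remaining_b.pop()
    return a_perm, b_perm, [1 if mask[i] else 0 for i in range(smaller)]

# Matched rows 0 -> 1 and 2 -> 0 in datasets of size 3 and 4:
print(build_permutations({0: 1, 2: 0}, len_a=3, len_b=4))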