Example #1
def get_result(dbinstance, project_id, run_id, token):
    result_type = db.get_project_column(dbinstance, project_id, 'result_type')
    # Aborts the request if the token is not authorized for this project
    auth_token_type = get_authorization_token_type_or_abort(project_id, token)

    if result_type == 'mapping':
        logger.info("Mapping result being returned")
        result = db.get_run_result(dbinstance, run_id)
        return {"mapping": result}

    elif result_type == 'groups':
        logger.info("Groups result being returned")
        result = db.get_run_result(dbinstance, run_id)
        return {"groups": result}

    elif result_type == 'similarity_scores':
        logger.info("Similarity result being returned")
        return get_similarity_score_result(dbinstance, run_id)

    elif result_type == 'permutations':
        logger.info("Permutation result being returned")
        return get_permutations_result(project_id, run_id, dbinstance, token,
                                       auth_token_type)
    else:
        logger.warning("Unimplemented result type")
        safe_fail_request(500, message='Project has unknown result type')
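
Note: safe_fail_request is defined elsewhere in the project. As a rough illustration of the behavior its call site implies (end the request with an HTTP status and an error message), here is a minimal Flask-style sketch; the name and body are assumptions, and the project's actual helper may differ:

from flask import abort, jsonify, make_response

def safe_fail_request(status_code, message):
    # Assumed behavior: abort the current request with the given HTTP
    # status and a JSON body carrying the error message.
    abort(make_response(jsonify(message=message), status_code))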
Example #2
def permute_mapping_data(project_id, run_id, len_filters1, len_filters2,
                         parent_span):
    """
    Task which will create a permutation after a mapping has been completed.

    :param project_id: The project resource id
    :param run_id: The run id
    :param len_filters1: Number of entities in dataset A
    :param len_filters2: Number of entities in dataset B
    :param parent_span: Serialized parent span for tracing

    """
    log = logger.bind(pid=project_id, run_id=run_id)

    with DBConn() as conn:

        mapping_str = get_run_result(conn, run_id)

        # Convert the mapping's string keys and values back to ints
        mapping = {int(k): int(mapping_str[k]) for k in mapping_str}

        log.info("Creating random permutations")
        log.debug(
            "Entities in dataset A: {}, Entities in dataset B: {}".format(
                len_filters1, len_filters2))
        """
        Pack all the entities that match in the **same** random locations in both permutations.
        Then fill in all the gaps!
    
        Dictionaries first, then converted to lists.
        """
        smaller_dataset_size = min(len_filters1, len_filters2)
        log.debug("Smaller dataset size is {}".format(smaller_dataset_size))
        number_in_common = len(mapping)
        a_permutation = {}  # will end up with len_filters1 entries
        b_permutation = {}  # will end up with len_filters2 entries

        # By default mark all rows as NOT included in the mask
        mask = {i: False for i in range(smaller_dataset_size)}

        # start with all the possible indexes
        remaining_new_indexes = list(range(smaller_dataset_size))
        log.info("Shuffling indices for matched entities")
        random.shuffle(remaining_new_indexes)
        log.info("Assigning random indexes for {} matched entities".format(
            number_in_common))

        for mapping_number, a_index in enumerate(mapping):
            b_index = mapping[a_index]

            # Choose the index in the new mapping (randomly)
            mapping_index = remaining_new_indexes[mapping_number]

            a_permutation[a_index] = mapping_index
            b_permutation[b_index] = mapping_index

            # Mark the row included in the mask
            mask[mapping_index] = True

        # The first number_in_common shuffled indexes were consumed above;
        # the rest remain unused in both permutations.
        remaining_new_indexes = set(remaining_new_indexes[number_in_common:])
        log.info("Randomly adding all non matched entities")

        # Note the a and b datasets can be of different sizes. At this point,
        # both must still use the leftover shuffled indexes, plus any indexes
        # beyond the smaller dataset size (up to their own dataset's length).
        remaining_a_values = list(
            set(range(smaller_dataset_size,
                      len_filters1)).union(remaining_new_indexes))
        remaining_b_values = list(
            set(range(smaller_dataset_size,
                      len_filters2)).union(remaining_new_indexes))

        log.debug("Shuffle the remaining indices")
        random.shuffle(remaining_a_values)
        random.shuffle(remaining_b_values)

        # For every element in a's permutation
        for a_index in range(len_filters1):
            # Check if it is not already present
            if a_index not in a_permutation:
                # This index isn't yet mapped

                # choose and remove a random index from the extended list of those that remain
                # note this "could" be the same row (a NOP 1-1 permutation)
                mapping_index = remaining_a_values.pop()

                a_permutation[a_index] = mapping_index

        # For every element in b's permutation
        for b_index in range(len_filters2):
            # Check if it is not already present
            if b_index not in b_permutation:
                # This index isn't yet mapped

                # choose and remove a random index from the extended list of those that remain
                # note this "could" be the same row (a NOP 1-1 permutation)
                mapping_index = remaining_b_values.pop()
                b_permutation[b_index] = mapping_index

        log.debug("Completed creating new permutations for each party")

        # One permutation per data provider; dp_ids is assumed to be in the
        # same order as the datasets (dataset A's provider first, then B's).
        dp_ids = get_dataprovider_ids(conn, project_id)

        for i, permutation in enumerate([a_permutation, b_permutation]):
            # We convert here because celery and dicts with int keys don't play nice

            perm_list = convert_mapping_to_list(permutation)
            log.debug("Saving a permutation")

            insert_permutation(conn, dp_ids[i], run_id, perm_list)

        log.debug("Raw permutation data saved. Now saving raw mask")

        # Convert the mask dict to a list of 0/1 ints
        mask_list = convert_mapping_to_list(
            {int(key): 1 if value else 0
             for key, value in mask.items()})
        log.debug("Saving the mask")
        insert_permutation_mask(conn, project_id, run_id, mask_list)
        log.info("Mask saved")
        log.info("Committing database transaction")

    mark_run_complete.delay(run_id, permute_mapping_data.get_serialized_span())
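
The helper convert_mapping_to_list is also defined elsewhere. From how it is used above (dicts keyed by consecutive ints starting at 0, flattened before storage), a minimal sketch under that assumption could be:

def convert_mapping_to_list(mapping):
    # Assumed behavior: turn a dict keyed 0..n-1 into the list
    # [mapping[0], mapping[1], ..., mapping[n - 1]].
    return [mapping[i] for i in range(len(mapping))]

For example, convert_mapping_to_list({0: 2, 1: 0, 2: 1}) returns [2, 0, 1].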