Example #1
def get_similarity_scores(filename):
    """
    Read a CSV file from the object store containing the similarity scores and return
    a response that will stream the similarity scores.

    :param filename: name of the CSV file, obtained from the `similarity_scores` table
    :return: the similarity scores in a streaming JSON response.
    """

    mc = connect_to_object_store()

    details = mc.stat_object(config.MINIO_BUCKET, filename)
    logger.info("Starting download stream of similarity scores.",
                filename=filename,
                filesize=details.size)

    try:
        sims_data_stream = mc.get_object(config.MINIO_BUCKET, filename)
        # TODO: Below is an Anonlink 'private' API. It should be made
        # public.
        sims_iter, *_ = anonlink.serialization._load_to_iterable(
            sims_data_stream)

        return Response(generate_scores(sims_iter),
                        mimetype='application/json')

    except urllib3.exceptions.ResponseError:
        logger.warning(
            "Attempt to read the similarity scores file failed with an error response.",
            filename=filename)
        safe_fail_request(500, "Failed to retrieve similarity scores")
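The `generate_scores` helper used above is not shown in this example. A minimal sketch of what such a streaming JSON generator could look like is given below; the function name is taken from the example, but the record shape and the exact output format are assumptions, not the project's actual implementation.

import json

def generate_scores(sims_iter):
    # Hypothetical sketch: emit a JSON document piece by piece so Flask can
    # stream it without holding every similarity score in memory.
    yield '{"similarity_scores": ['
    first = True
    for record in sims_iter:
        prefix = '' if first else ', '
        first = False
        yield prefix + json.dumps(list(record))
    yield ']}'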
Example #2
def get_similarity_scores(filename):
    """
    Read a CSV file from the object store containing the similarity scores and return
    a response that will stream the similarity scores.

    :param filename: name of the CSV file, obtained from the `similarity_scores` table
    :return: the similarity scores in a streaming JSON response.
    """

    mc = connect_to_object_store()

    details = mc.stat_object(config.MINIO_BUCKET, filename)
    logger.info("Starting download stream of similarity scores.",
                filename=filename,
                filesize=details.size)

    try:
        candidate_pair_binary_stream = mc.get_object(config.MINIO_BUCKET,
                                                     filename)

        return Response(generate_scores(candidate_pair_binary_stream),
                        mimetype='application/json')

    except urllib3.exceptions.ResponseError:
        logger.warning(
            "Attempt to read the similarity scores file failed with an error response.",
            filename=filename)
        safe_fail_request(500, "Failed to retrieve similarity scores")
Example #3
def get_deserialized_filter(dp_id):
    """Cached, deserialized version.
    """
    logger.debug("Getting filters")
    key = 'clk-pkl-{}'.format(dp_id)
    r = connect_to_redis(read_only=True)

    # Check whether this dp_id is already cached in redis
    if r.exists(key):
        logger.debug("returning filters from cache")
        return pickle.loads(r.get(key))
    else:
        logger.debug("Looking up popcounts and filename from database")
        with DBConn() as db:
            serialized_filters_file, encoding_size = get_filter_metadata(
                db, dp_id)
        mc = connect_to_object_store()
        logger.debug("Getting filters from object store")

        # Note this uses already calculated popcounts unlike
        # serialization.deserialize_filters()
        raw_data_response = mc.get_object(config.MINIO_BUCKET,
                                          serialized_filters_file)
        python_filters = binary_unpack_filters(
            raw_data_response.stream(encoding_size))

        set_deserialized_filter(dp_id, python_filters)
        return python_filters
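The matching cache writer `set_deserialized_filter` is not shown. A sketch of what it presumably does, mirroring the read path above (the redis connection arguments and the absence of a TTL are assumptions):

import pickle

def set_deserialized_filter(dp_id, python_filters):
    # Sketch: pickle the filters under the same 'clk-pkl-<dp_id>' key that
    # get_deserialized_filter reads from.
    logger.debug("Saving filters to redis cache")
    key = 'clk-pkl-{}'.format(dp_id)
    r = connect_to_redis()
    r.set(key, pickle.dumps(python_filters))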
Example #4
def delete_minio_objects(filenames, project_id):
    log = logger.bind(pid=project_id)
    mc = connect_to_object_store()
    log.info(f"Deleting {len(filenames)} files from object store")
    try:
        mc.remove_objects(Config.MINIO_BUCKET, filenames)
    except MinioError as e:
        log.warning(
            f"Error occurred while removing object {filenames}. Ignoring.")
Example #5
def upload_clk_data_binary(project_id, dp_id, raw_stream, count, size=128):
    """
    Save the user provided raw CLK data.

    """
    receipt_token = generate_code()
    filename = Config.BIN_FILENAME_FMT.format(receipt_token)
    # Set the state to 'pending' in the bloomingdata table
    with DBConn() as conn:
        db.insert_encoding_metadata(conn, filename, dp_id, receipt_token,
                                    count)
        db.update_encoding_metadata_set_encoding_size(conn, dp_id, size)
    logger.info(
        f"Storing supplied binary clks of individual size {size} in file: {filename}"
    )

    num_bytes = count * (size + 6)

    logger.debug(
        "Directly storing binary file with index, base64 encoded CLK, popcount"
    )

    # Upload to object store
    logger.info(
        f"Uploading {count} binary encodings to object store. Total size: {fmt_bytes(num_bytes)}"
    )
    parent_span = g.flask_tracer.get_span()

    with opentracing.tracer.start_span('save-to-minio',
                                       child_of=parent_span) as span:
        mc = connect_to_object_store()
        try:
            mc.put_object(Config.MINIO_BUCKET,
                          filename,
                          data=raw_stream,
                          length=num_bytes)
        except (minio.error.InvalidSizeError, minio.error.InvalidArgumentError,
                minio.error.ResponseError):
            logger.info(
                "Mismatch between expected stream length and header info")
            raise ValueError(
                "Mismatch between expected stream length and header info")

    with opentracing.tracer.start_span('update-database',
                                       child_of=parent_span) as span:
        with DBConn() as conn:
            db.update_encoding_metadata(conn, filename, dp_id, 'ready')
            db.set_dataprovider_upload_state(conn, dp_id, True)

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id):
        logger.info("All parties data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id,
                                        serialize_span(parent_span))

    return receipt_token
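The `size + 6` above implies six bytes of per-record overhead on top of each encoding. The real binary layout comes from a `binary_format` helper that is not shown here; a plausible sketch that reproduces the same arithmetic (the field order and types are assumptions) is:

import struct

def binary_format(encoding_size=128):
    # Assumed layout: 4-byte unsigned entity index, the raw encoding bytes,
    # then a 2-byte popcount. Record size is encoding_size + 6, matching
    # num_bytes = count * (size + 6) above.
    return struct.Struct('!I{}sH'.format(encoding_size))

fmt = binary_format(128)
assert fmt.size == 128 + 6
packed_record = fmt.pack(0, b'\x00' * 128, 64)  # (index, encoding, popcount)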
Example #6
def get_chunk_from_object_store(chunk_info, encoding_size=128):
    mc = connect_to_object_store()
    bit_packed_element_size = binary_format(encoding_size).size
    chunk_length = chunk_info[2] - chunk_info[1]
    chunk_bytes = bit_packed_element_size * chunk_length
    chunk_stream = mc.get_partial_object(
        config.MINIO_BUCKET, chunk_info[0],
        bit_packed_element_size * chunk_info[1], chunk_bytes)

    chunk_data = binary_unpack_filters(chunk_stream, chunk_bytes,
                                       encoding_size)

    return chunk_data, chunk_length
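For concreteness, `chunk_info` here is (object store filename, start index, stop index), and the byte-range arithmetic above works out as in this small example (the filename and indices are hypothetical, and the 134-byte record size assumes 128-byte encodings):

chunk_info = ('encodings.bin', 1000, 2000)            # hypothetical chunk
bit_packed_element_size = 134                         # 128-byte encoding + 6
chunk_length = chunk_info[2] - chunk_info[1]          # 1000 records
offset = bit_packed_element_size * chunk_info[1]      # read starts at byte 134000
chunk_bytes = bit_packed_element_size * chunk_length  # 134000 bytes requested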
Example #7
def handle_raw_upload(project_id, dp_id, receipt_token, parent_span=None):
    """
    The user has uploaded base64 encodings as JSON; this task copies the data into
    our internal binary format.
    """
    log = logger.bind(pid=project_id, dp_id=dp_id)
    log.info("Handling user provided base64 encodings")
    new_child_span = lambda name: handle_raw_upload.tracer.start_active_span(name, child_of=handle_raw_upload.span)
    with DBConn() as db:
        if not check_project_exists(db, project_id):
            log.info("Project deleted, stopping immediately")
            return
        # Get number of blocks + total number of encodings from database
        expected_count, block_count = get_encoding_metadata(db, dp_id)

    log.info(f"Expecting to handle {expected_count} encodings in {block_count} blocks")
    mc = connect_to_object_store()
    input_filename = Config.RAW_FILENAME_FMT.format(receipt_token)
    raw_data = mc.get_object(Config.MINIO_BUCKET, input_filename)

    with new_child_span('upload-encodings-to-db'):
        # stream encodings with block ids from uploaded file
        # convert each encoding to our internal binary format
        # output into database for each block (temp or direct to minio?)
        encoding_size, pipeline = convert_encodings_from_base64_to_binary(stream_json_clksnblocks(raw_data))
        log.info(f"Starting pipeline to store {encoding_size}B sized encodings in database")
        with DBConn() as db:
            store_encodings_in_db(db, dp_id, pipeline, encoding_size)

    log.info(f"Converted uploaded encodings of size {fmt_bytes(encoding_size)} into internal binary format. Number of blocks: {block_count}")

    # As this is the first time we've seen the encoding size actually uploaded from this data provider,
    # we check that it complies with the project's encoding size.
    try:
        check_dataproviders_encoding(project_id, encoding_size)
    except InvalidEncodingError as e:
        log.warning(e.args[0])
        handle_invalid_encoding_data(project_id, dp_id)

    with DBConn() as conn:
        with new_child_span('save-encoding-metadata'):
            # Save the encoding size as metadata for this data provider
            update_encoding_metadata_set_encoding_size(conn, dp_id, encoding_size)
            update_encoding_metadata(conn, None, dp_id, 'ready')

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id, check_data_ready=True):
        log.info("All parties' data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id, handle_raw_upload.get_serialized_span())
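The `stream_json_clksnblocks` helper consumed by the pipeline above is not shown. Given the upload shape documented in Example #16 ({'clknblocks': [['<base64>', 'block1', ...], ...]}), a sketch of such a streaming parser might look like the following; the use of ijson and the yielded tuple shape are assumptions.

import base64
import ijson

def stream_json_clksnblocks(raw_data):
    # Sketch: lazily walk the uploaded JSON and yield one
    # (encoding_bytes, block_ids) pair per record, without loading the
    # whole document into memory.
    for clknblock in ijson.items(raw_data, 'clknblocks.item'):
        b64_encoding, *block_ids = clknblock
        yield base64.b64decode(b64_encoding), block_ids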
Example #8
def get_chunk_from_object_store(chunk_info, encoding_size=128):
    mc = connect_to_object_store()
    bit_packed_element_size = binary_format(encoding_size).size
    chunk_range_start, chunk_range_stop = chunk_info['range']
    chunk_length = chunk_range_stop - chunk_range_start
    chunk_bytes = bit_packed_element_size * chunk_length
    chunk_stream = mc.get_partial_object(
        config.MINIO_BUCKET, chunk_info['storeFilename'],
        bit_packed_element_size * chunk_range_start, chunk_bytes)

    chunk_data = binary_unpack_filters(
        chunk_stream.stream(bit_packed_element_size), chunk_bytes,
        encoding_size)

    return chunk_data, chunk_length
Example #9
def delete_minio_objects(filenames, project_id, parent_span=None):
    log = logger.bind(pid=project_id)
    mc = connect_to_object_store()
    log.info(f"Deleting {len(filenames)} files from object store")
    try:
        for del_err in mc.remove_objects(Config.MINIO_BUCKET, filenames):
            log.debug("Deletion error: {}".format(del_err))
    except MinioError as e:
        log.warning(
            f"Error occurred while removing object {filenames}. Ignoring.")

    if Config.UPLOAD_OBJECT_STORE_ENABLED:
        log.debug("Deleting everything uploaded to object store for project")
        delete_object_store_folder(mc, Config.UPLOAD_OBJECT_STORE_BUCKET,
                                   f"{project_id}/")
Example #10
def aggregate_comparisons(similarity_result_files, project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    if similarity_result_files is None: return
    mc = connect_to_object_store()
    files = []
    data_size = 0

    for num, filename in similarity_result_files:
        if num > 0:
            files.append(filename)
            data_size += mc.stat_object(Config.MINIO_BUCKET, filename).size

    log.debug("Aggregating result chunks from {} files, total size: {}".format(
        len(files), fmt_bytes(data_size)))

    result_file_stream_generator = (mc.get_object(Config.MINIO_BUCKET, result_filename) for result_filename in files)

    log.info("Similarity score results are {}".format(fmt_bytes(data_size)))
    result_stream = chain_streams(result_file_stream_generator)

    with DBConn() as db:
        result_type = get_project_column(db, project_id, 'result_type')

        # Note: Storing the similarity scores for all result types
        result_filename = store_similarity_scores(result_stream, run_id, data_size, db)

        if result_type == "similarity_scores":
            # Post similarity computation cleanup
            dp_ids = get_dataprovider_ids(db, project_id)

        else:
            # we promote the run to the next stage
            progress_stage(db, run_id)
            lenf1, lenf2 = get_project_dataset_sizes(db, project_id)

    # DB now committed, we can fire off tasks that depend on the new db state
    if result_type == "similarity_scores":
        log.info("Deleting intermediate similarity score files from object store")
        mc.remove_objects(Config.MINIO_BUCKET, files)
        log.debug("Removing clk filters from redis cache")
        remove_from_cache(dp_ids[0])
        remove_from_cache(dp_ids[1])

        # Complete the run
        log.info("Marking run as complete")
        mark_run_complete.delay(run_id, aggregate_comparisons.get_serialized_span())
    else:
        solver_task.delay(result_filename, project_id, run_id, lenf1, lenf2, aggregate_comparisons.get_serialized_span())
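`chain_streams` is not defined in this example; it presumably turns the generator of object store responses into a single readable stream. One common way to write such a helper (a sketch, not the project's code):

import io

def chain_streams(streams, buffer_size=64 * 1024):
    # Sketch: expose several file-like objects as one readable stream by
    # serving bytes from each in turn until all are exhausted.
    class _Chained(io.RawIOBase):
        def __init__(self):
            self._streams = iter(streams)
            self._current = None

        def readable(self):
            return True

        def readinto(self, b):
            while True:
                if self._current is None:
                    self._current = next(self._streams, None)
                    if self._current is None:
                        return 0                    # every stream exhausted
                data = self._current.read(len(b))
                if data:
                    b[:len(data)] = data
                    return len(data)
                self._current = None                # move on to the next stream

    return io.BufferedReader(_Chained(), buffer_size=buffer_size)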
Example #11
def upload_json_clk_data(dp_id, clk_json, parent_span):
    """
    Convert user provided encodings from json array of base64 data into
    a newline separated file of base64 data.

    Note this implementation is non-streaming.
    """
    if 'clks' not in clk_json or len(clk_json['clks']) < 1:
        safe_fail_request(400, message="Missing CLKs information")

    receipt_token = generate_code()

    filename = Config.RAW_FILENAME_FMT.format(receipt_token)
    logger.info("Storing user {} supplied clks from json".format(dp_id))

    with opentracing.tracer.start_span('clk-splitting',
                                       child_of=parent_span) as span:
        count = len(clk_json['clks'])
        span.set_tag("clks", count)
        data = b''.join(''.join(clk.split('\n')).encode() + b'\n'
                        for clk in clk_json['clks'])

        num_bytes = len(data)
        span.set_tag("num_bytes", num_bytes)
        buffer = BytesIO(data)

    logger.info(
        f"Received {count} encodings. Uploading {fmt_bytes(num_bytes)} to object store"
    )
    with opentracing.tracer.start_span('save-to-quarantine',
                                       child_of=parent_span) as span:
        span.set_tag('filename', filename)
        mc = connect_to_object_store()
        mc.put_object(Config.MINIO_BUCKET,
                      filename,
                      data=buffer,
                      length=num_bytes)

    with opentracing.tracer.start_span('update-db',
                                       child_of=parent_span) as span:
        with DBConn() as conn:
            db.insert_encoding_metadata(conn, filename, dp_id, receipt_token,
                                        count)

    return receipt_token, filename
Example #12
    def test_temp_credentials_minio(self):

        upload_endpoint = Config.UPLOAD_OBJECT_STORE_SERVER
        bucket_name = "uploads"

        root_mc_client = connect_to_object_store()
        upload_restricted_minio_client = connect_to_upload_object_store()
        if not root_mc_client.bucket_exists(bucket_name):
            root_mc_client.make_bucket(bucket_name)

        with pytest.raises(minio.error.AccessDenied):
            upload_restricted_minio_client.list_buckets()

        # Should be able to put an object though
        upload_restricted_minio_client.put_object(bucket_name,
                                                  'testobject',
                                                  io.BytesIO(b'data'),
                                                  length=4)

        credentials_provider = AssumeRoleProvider(
            upload_restricted_minio_client, Policy=restricted_upload_policy)
        temp_creds = Credentials(provider=credentials_provider)

        newly_restricted_mc_client = Minio(upload_endpoint,
                                           credentials=temp_creds,
                                           region='us-east-1',
                                           secure=False)

        with pytest.raises(minio.error.AccessDenied):
            newly_restricted_mc_client.list_buckets()

        # Note this put_object worked with the earlier credentials,
        # but it should fail now that the more restrictive policy is applied
        with pytest.raises(minio.error.AccessDenied):
            newly_restricted_mc_client.put_object(bucket_name,
                                                  'testobject2',
                                                  io.BytesIO(b'data'),
                                                  length=4)

        # this path is allowed in the policy however
        newly_restricted_mc_client.put_object(bucket_name,
                                              '2020/testobject',
                                              io.BytesIO(b'data'),
                                              length=4)
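The `restricted_upload_policy` passed to AssumeRoleProvider is not shown. Given what the test asserts (listing buckets denied, puts allowed only under the 2020/ prefix), it is presumably an S3-style policy along these lines; the exact document is an assumption:

# Assumed policy shape matching the assertions in the test above.
restricted_upload_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": ["s3:PutObject"],
            "Resource": ["arn:aws:s3:::uploads/2020/*"]
        }
    ]
}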
Example #13
def solver_task(similarity_scores_filename, project_id, run_id, dataset_sizes,
                parent_span):
    log = logger.bind(pid=project_id, run_id=run_id)
    mc = connect_to_object_store()
    solver_task.span.log_kv({
        'datasetSizes': dataset_sizes,
        'filename': similarity_scores_filename
    })
    score_file = mc.get_object(config.MINIO_BUCKET, similarity_scores_filename)
    log.debug("Creating python sparse matrix from bytes data")
    candidate_pairs = anonlink.serialization.load_candidate_pairs(score_file)
    log.info("Calculating the optimal mapping from similarity matrix")

    groups = anonlink.solving.greedy_solve(candidate_pairs)

    log.info("Entity groups have been computed")

    res = {"groups": groups, "datasetSizes": dataset_sizes}
    save_and_permute.delay(res, project_id, run_id,
                           solver_task.get_serialized_span())
Example #14
def solver_task(similarity_scores_filename, project_id, run_id, dataset_sizes,
                parent_span):
    log = logger.bind(pid=project_id, run_id=run_id)
    mc = connect_to_object_store()
    solver_task.span.log_kv({
        'datasetSizes': dataset_sizes,
        'filename': similarity_scores_filename
    })
    score_file = mc.get_object(config.MINIO_BUCKET, similarity_scores_filename)
    log.debug("Creating python sparse matrix from bytes data")
    candidate_pairs_with_duplicates = anonlink.serialization.load_candidate_pairs(
        score_file)
    similarity_scores, (dset_is0,
                        dset_is1), (rec_is0,
                                    rec_is1) = candidate_pairs_with_duplicates

    log.info(
        f"Number of candidate pairs before deduplication: {len(candidate_pairs_with_duplicates[0])}"
    )
    if len(candidate_pairs_with_duplicates[0]) > 0:
        # TODO use public interface when available
        # https://github.com/data61/anonlink/issues/271
        candidate_pairs = _merge_similarities(
            [zip(similarity_scores, dset_is0, dset_is1, rec_is0, rec_is1)],
            k=None)
        log.info(
            f"Number of candidate pairs after deduplication: {len(candidate_pairs[0])}"
        )

        log.info("Calculating the optimal mapping from similarity matrix")
        groups = anonlink.solving.greedy_solve(candidate_pairs)
    else:
        groups = []

    log.info("Entity groups have been computed")

    res = {"groups": groups, "datasetSizes": dataset_sizes}
    save_and_permute.delay(res, project_id, run_id,
                           solver_task.get_serialized_span())
Example #15
def solver_task(similarity_scores_filename, project_id, run_id, lenf1, lenf2,
                parent_span):
    log = logger.bind(pid=project_id, run_id=run_id)
    mc = connect_to_object_store()
    solver_task.span.log_kv({
        'lenf1': lenf1,
        'lenf2': lenf2,
        'filename': similarity_scores_filename
    })
    score_file = mc.get_object(config.MINIO_BUCKET, similarity_scores_filename)
    log.debug("Creating python sparse matrix from bytes data")
    sparse_matrix = similarity_matrix_from_csv_bytes(score_file.data)
    log.info("Calculating the optimal mapping from similarity matrix")
    mapping = anonlink.entitymatch.greedy_solver(sparse_matrix)

    log.debug("Converting all indices to strings")
    for key in mapping:
        mapping[key] = str(mapping[key])

    log.info("Entity mapping has been computed")

    res = {"mapping": mapping, "lenf1": lenf1, "lenf2": lenf2}
    save_and_permute.delay(res, project_id, run_id,
                           solver_task.get_serialized_span())
Example #16
def handle_encoding_upload_json(project_id, dp_id, clk_json, receipt_token,
                                uses_blocking, parent_span):
    """
    Take user-provided upload information - accepting multiple formats - and eventually
    ingest it into the database.

    Encodings uploaded directly in the JSON are first quarantined in the object store,
    and a background task deserializes them.

    Encodings that are in an object store are streamed directly into the database by
    a background task.
    """
    log = logger.bind(pid=project_id)
    log.info("Checking json is consistent")
    try:
        abort_if_inconsistent_upload(uses_blocking, clk_json)
    except ValueError as e:
        safe_fail_request(403, e.args[0])

    if "encodings" in clk_json and 'file' in clk_json['encodings']:
        # external encodings
        log.info("External encodings uploaded")
        encoding_object_info = clk_json['encodings']['file']
        object_name = encoding_object_info['path']
        _check_object_path_allowed(project_id, dp_id, object_name, log)

        encoding_credentials = clk_json['encodings'].get('credentials')
        # Schedule a background task to pull the encodings from the object store
        # This background task updates the database with encoding metadata assuming
        # that there are no blocks.
        if 'blocks' not in clk_json:
            log.info("scheduling task to pull encodings from object store")
            pull_external_data_encodings_only.delay(
                project_id,
                dp_id,
                encoding_object_info,
                encoding_credentials,
                receipt_token,
                parent_span=serialize_span(parent_span))
        else:
            # Need to deal with both encodings and blocks
            if 'file' in clk_json['blocks']:
                object_name = clk_json['blocks']['file']['path']
                _check_object_path_allowed(project_id, dp_id, object_name, log)
                # Blocks are in an external file
                blocks_object_info = clk_json['blocks']['file']
                blocks_credentials = clk_json['blocks'].get('credentials')
                log.info(
                    "scheduling task to pull both encodings and blocking data from object store"
                )
                pull_external_data.delay(
                    project_id,
                    dp_id,
                    encoding_object_info,
                    encoding_credentials,
                    blocks_object_info,
                    blocks_credentials,
                    receipt_token,
                    parent_span=serialize_span(parent_span))
            else:
                raise NotImplementedError(
                    "Don't currently handle combination of external encodings and blocks"
                )

        return

    # Convert uploaded JSON to common schema.
    #
    # The original JSON API simply accepted "clks", then came a combined encoding and
    # blocking API expecting the top level element "clknblocks". Finally an API that
    # specifies both "encodings" and "blocks" independently at the top level.
    #
    # We rewrite all into the "clknblocks" format.
    if "encodings" in clk_json:
        logger.debug(
            "converting from 'encodings' & 'blocks' format to 'clknblocks'")
        clk_json = convert_encoding_upload_to_clknblock(clk_json)

    is_valid_clks = not uses_blocking and 'clks' in clk_json
    element = 'clks' if is_valid_clks else 'clknblocks'

    if len(clk_json[element]) < 1:
        safe_fail_request(400, message="Missing CLKs information")

    filename = Config.RAW_FILENAME_FMT.format(receipt_token)
    logger.info("Storing user {} supplied {} from json".format(dp_id, element))

    with opentracing.tracer.start_span('splitting-json-clks',
                                       child_of=parent_span) as span:
        encoding_count = len(clk_json[element])
        span.set_tag(element, encoding_count)
        logger.debug(f"Received {encoding_count} {element}")

    if element == 'clks':
        logger.info("Rewriting provided json into clknsblocks format")
        clk_json = convert_clks_to_clknblocks(clk_json)
        element = 'clknblocks'

    logger.info("Counting block sizes and number of blocks")
    # {'clknblocks': [['UG9vcA==', '001', '211'], [...]]}
    block_sizes = {}
    for _, *elements_blocks in clk_json[element]:
        for el_block in elements_blocks:
            block_sizes[el_block] = block_sizes.setdefault(el_block, 0) + 1
    block_count = len(block_sizes)

    logger.info(f"Received {encoding_count} encodings in {block_count} blocks")
    for block in block_sizes:
        logger.info(f"Block {block} has {block_sizes[block]} elements")

    # write clk_json into a temp file
    tmp = tempfile.NamedTemporaryFile(mode='w')
    json.dump(clk_json, tmp)
    tmp.flush()
    with opentracing.tracer.start_span('save-clk-file-to-quarantine',
                                       child_of=parent_span) as span:
        span.set_tag('filename', filename)
        mc = connect_to_object_store()
        mc.fput_object(Config.MINIO_BUCKET,
                       filename,
                       tmp.name,
                       content_type='application/json')
    logger.info('Saved uploaded {} JSON to file {} in object store.'.format(
        element.upper(), filename))

    with opentracing.tracer.start_span('update-encoding-metadata',
                                       child_of=parent_span):
        with DBConn() as conn:
            db.insert_encoding_metadata(conn, filename, dp_id, receipt_token,
                                        encoding_count, block_count)
            db.insert_blocking_metadata(conn, dp_id, block_sizes)

    # Schedule a task to deserialize the encodings
    handle_raw_upload.delay(project_id,
                            dp_id,
                            receipt_token,
                            parent_span=serialize_span(parent_span))
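A tiny worked example of the block-size counting loop above, using the 'clknblocks' shape shown in the inline comment (the encodings and block ids are made up):

clk_json = {'clknblocks': [['UG9vcA==', '001', '211'],
                           ['QmxhaA==', '001'],
                           ['Rm9v', '211']]}

block_sizes = {}
for _, *elements_blocks in clk_json['clknblocks']:
    for el_block in elements_blocks:
        block_sizes[el_block] = block_sizes.setdefault(el_block, 0) + 1

assert block_sizes == {'001': 2, '211': 2}
assert len(block_sizes) == 2      # block_count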
Example #17
def handle_raw_upload(project_id, dp_id, receipt_token, parent_span=None):
    # User has uploaded base64 encodings as JSON
    log = logger.bind(pid=project_id, dp_id=dp_id)
    log.info("Handling user provided base64 encodings")

    with DBConn() as db:
        if not check_project_exists(db, project_id):
            log.info("Project deleted, stopping immediately")
            return
        expected_count = get_number_of_hashes(db, dp_id)

    log.info(f"Expecting to handle {expected_count} encodings")
    mc = connect_to_object_store()

    # Input file is line separated base64 record encodings.
    raw_file = Config.RAW_FILENAME_FMT.format(receipt_token)
    raw_data_response = mc.get_object(Config.MINIO_BUCKET, raw_file)

    # Set up streaming processing pipeline
    buffered_stream = iterable_to_stream(raw_data_response.stream())
    text_stream = io.TextIOWrapper(buffered_stream, newline='\n')

    clkcounts = []

    def filter_generator():
        log.debug("Deserializing json filters")
        first_encoding_size = None
        for i, line in enumerate(text_stream):
            ba = deserialize_bitarray(line)
            yield (ba, i, ba.count())
            clkcounts.append(ba.count())
            encsize = len(ba)
            if i == 0:
                first_encoding_size = encsize
            if encsize != first_encoding_size:
                raise ValueError("Encodings were not all the same size")

        log.info(f"Processed {len(clkcounts)} hashes")

    # We peek at the first element as we need the encoding size
    # for the rest of our processing pipeline
    python_filters = more_itertools.peekable(filter_generator())
    # Note the length of a bitarray is returned in bits, but we need
    # the encoding size in bytes, so we divide by 8.
    uploaded_encoding_size = len(python_filters.peek()[0]) // 8

    # This is the first time we've seen the encoding size from this data provider
    try:
        check_dataproviders_encoding(project_id, uploaded_encoding_size)
    except InvalidEncodingError as e:
        log.warning(e.args[0])
        handle_invalid_encoding_data(project_id, dp_id)

    with DBConn() as db:
        # Save the encoding size as metadata
        update_encoding_metadata_set_encoding_size(db, dp_id,
                                                   uploaded_encoding_size)

    # Output file is our custom binary packed file
    filename = Config.BIN_FILENAME_FMT.format(receipt_token)
    bit_packed_element_size = binary_format(uploaded_encoding_size).size
    num_bytes = expected_count * bit_packed_element_size

    # If small enough preload the data into our redis cache
    if expected_count < Config.ENTITY_CACHE_THRESHOLD:
        log.info("Caching pickled clk data")
        python_filters = list(python_filters)
        cache.set_deserialized_filter(dp_id, python_filters)
    else:
        log.info("Not caching clk data as it is too large")

    packed_filters = binary_pack_filters(python_filters,
                                         uploaded_encoding_size)
    packed_filter_stream = iterable_to_stream(packed_filters)

    # Upload to object store
    log.info(
        f"Uploading {expected_count} encodings of size {uploaded_encoding_size} "
        + f"to object store. Total Size: {fmt_bytes(num_bytes)}")
    mc.put_object(Config.MINIO_BUCKET,
                  filename,
                  data=packed_filter_stream,
                  length=num_bytes)

    with DBConn() as conn:
        update_encoding_metadata(conn, filename, dp_id, 'ready')

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id, check_data_ready=True):
        log.info("All parties' data present. Scheduling any queued runs")
        check_for_executable_runs.delay(
            project_id, handle_raw_upload.get_serialized_span())
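`iterable_to_stream` is used twice above, to wrap both the object store response and the packed filter generator as file-like objects. The helper itself is not shown; a common recipe for it looks like this (a sketch, not necessarily the project's implementation):

import io

def iterable_to_stream(iterable, buffer_size=io.DEFAULT_BUFFER_SIZE):
    # Sketch: adapt an iterable of byte chunks into a readable, buffered
    # stream suitable for APIs that expect .read().
    class _IterStream(io.RawIOBase):
        def __init__(self):
            self.leftover = b''
            self.iterator = iter(iterable)

        def readable(self):
            return True

        def readinto(self, b):
            try:
                chunk = self.leftover or next(self.iterator)
            except StopIteration:
                return 0                          # EOF
            output, self.leftover = chunk[:len(b)], chunk[len(b):]
            b[:len(output)] = output
            return len(output)

    return io.BufferedReader(_IterStream(), buffer_size=buffer_size)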
Example #18
def aggregate_comparisons(similarity_result_files,
                          project_id,
                          run_id,
                          parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    if similarity_result_files is None:
        raise TypeError("Inappropriate argument type - missing results files.")

    files = []
    for res in similarity_result_files:
        if res is None:
            log.warning(
                "Missing results during aggregation. Stopping processing.")
            raise TypeError(
                "Inappropriate argument type - results missing at aggregation step."
            )
        num, filesize, filename = res
        if num:
            assert filesize is not None
            assert filename is not None
            files.append((num, filesize, filename))
        else:
            assert filesize is None
            assert filename is None
    heapq.heapify(files)

    log.debug(f"Aggregating result chunks from {len(files)} files, "
              f"total size: {sum(map(operator.itemgetter(1), files))}")

    mc = connect_to_object_store()
    while len(files) > 1:
        file0 = heapq.heappop(files)
        file1 = heapq.heappop(files)
        merged_file = _merge_files(mc, log, file0, file1)
        heapq.heappush(files, merged_file)

    if not files:
        # No results. Let's chuck in an empty file.
        empty_file = _put_placeholder_empty_file(mc, log)
        files.append(empty_file)

    (merged_num, merged_filesize, merged_filename), = files
    log.info(f"Similarity score results in {merged_filename} in bucket "
             f"{Config.MINIO_BUCKET} take up {merged_filesize} bytes.")

    with DBConn() as db:
        result_type = get_project_column(db, project_id, 'result_type')
        result_id = insert_similarity_score_file(db, run_id, merged_filename)
        log.debug(f"Saved path to similarity scores file to db with id "
                  f"{result_id}")

        if result_type == "similarity_scores":
            # Post similarity computation cleanup
            dp_ids = get_dataprovider_ids(db, project_id)

        else:
            # we promote the run to the next stage
            progress_stage(db, run_id)
            dataset_sizes = get_project_dataset_sizes(db, project_id)

    # DB now committed, we can fire off tasks that depend on the new db state
    if result_type == "similarity_scores":
        log.debug("Removing clk filters from redis cache")
        for dp_id in dp_ids:
            remove_from_cache(dp_id)

        # Complete the run
        log.info("Marking run as complete")
        mark_run_complete.delay(run_id,
                                aggregate_comparisons.get_serialized_span())
    else:
        solver_task.delay(merged_filename, project_id, run_id, dataset_sizes,
                          aggregate_comparisons.get_serialized_span())
Example #19
def compute_filter_similarity(package,
                              project_id,
                              run_id,
                              threshold,
                              encoding_size,
                              parent_span=None):
    """Compute filter similarity between a chunk of filters in dataprovider 1,
    and a chunk of filters in dataprovider 2.

    :param dict chunk_info:
        A chunk returned by ``anonlink.concurrency.split_to_chunks``.
    :param project_id:
    :param run_id:
    :param threshold:
    :param encoding_size: The size in bytes of each encoded entry
    :param parent_span: A serialized opentracing span context.
    :return: A 3-tuple: (num_results, result size in bytes, results_filename_in_object_store)
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    task_span = compute_filter_similarity.span

    def new_child_span(name, parent_scope=None):
        if parent_scope is None:
            parent_scope = compute_filter_similarity
        return compute_filter_similarity.tracer.start_active_span(
            name, child_of=parent_scope.span)

    log.debug(f"Computing similarities for {len(package)} chunks of filters")
    log.debug(
        "Checking that the resource exists (in case of run being canceled/deleted)"
    )
    assert_valid_run(project_id, run_id, log)

    #chunk_info_dp1, chunk_info_dp2 = chunk_info
    def reindex_using_encoding_ids(recordarray, encoding_id_list):
        # Map results from "index in chunk" to encoding id.
        return array.array('I', [encoding_id_list[i] for i in recordarray])

    num_results = 0
    num_comparisons = 0
    sim_results = []

    with DBConn() as conn:
        if len(package) > 1:  # multiple full blocks in one package
            with new_child_span(
                    f'fetching-encodings of package of size {len(package)}'):
                package = get_encoding_chunks(conn,
                                              package,
                                              encoding_size=encoding_size)
        else:  # this chunk is all part of one block
            with new_child_span(f'fetching-encodings of package with 1 chunk'):
                chunk_info_dp1, chunk_info_dp2 = package[0]
                chunk_with_ids_dp1, chunk_dp1_size = get_encoding_chunk(
                    conn, chunk_info_dp1, encoding_size)
                entity_ids_dp1, chunk_dp1 = zip(*chunk_with_ids_dp1)
                chunk_info_dp1['encodings'] = chunk_dp1
                chunk_info_dp1['entity_ids'] = entity_ids_dp1
                chunk_with_ids_dp2, chunk_dp2_size = get_encoding_chunk(
                    conn, chunk_info_dp2, encoding_size)
                entity_ids_dp2, chunk_dp2 = zip(*chunk_with_ids_dp2)
                chunk_info_dp2['encodings'] = chunk_dp2
                chunk_info_dp2['entity_ids'] = entity_ids_dp2
    log.debug('All encodings for package are fetched and deserialized')
    log.debug("Calculating filter similarities for work package")

    with new_child_span('comparing-encodings') as parent_scope:
        for chunk_dp1, chunk_dp2 in package:
            enc_dp1 = chunk_dp1['encodings']
            enc_dp1_size = len(enc_dp1)
            enc_dp2 = chunk_dp2['encodings']
            enc_dp2_size = len(enc_dp2)
            assert enc_dp1_size > 0, "Zero sized chunk in dp1"
            assert enc_dp2_size > 0, "Zero sized chunk in dp2"
            try:
                sims, (rec_is0, rec_is1
                       ) = anonlink.similarities.dice_coefficient_accelerated(
                           datasets=(enc_dp1, enc_dp2),
                           threshold=threshold,
                           k=min(enc_dp1_size, enc_dp2_size))
            except NotImplementedError as e:
                log.warning(
                    f"Encodings couldn't be compared using anonlink. {e}")
                return
            rec_is0 = reindex_using_encoding_ids(rec_is0,
                                                 chunk_dp1['entity_ids'])
            rec_is1 = reindex_using_encoding_ids(rec_is1,
                                                 chunk_dp2['entity_ids'])
            num_results += len(sims)
            num_comparisons += enc_dp1_size * enc_dp2_size
            sim_results.append(
                (sims, (rec_is0, rec_is1), chunk_dp1['datasetIndex'],
                 chunk_dp2['datasetIndex']))
        log.debug(
            f'comparison is done. {num_comparisons} comparisons got {num_results} pairs above the threshold'
        )

##### progress reporting
    log.debug('Encoding similarities calculated')

    with new_child_span('update-comparison-progress') as scope:
        # Update the number of comparisons completed
        save_current_progress(num_comparisons, run_id)
        scope.span.log_kv({
            'comparisons': num_comparisons,
            'num_similar': num_results
        })
        log.debug("Comparisons: {}, Links above threshold: {}".format(
            num_comparisons, num_results))


###### write results to a file in minio
    with new_child_span('save-comparison-results-to-minio'):

        file_iters = []
        file_sizes = []
        for sims, (rec_is0, rec_is1), dp1_ds_idx, dp2_ds_idx in sim_results:
            num_sims = len(sims)

            if num_sims:
                # Make index arrays for serialization
                index_1 = array.array('I', (dp1_ds_idx, )) * num_sims
                index_2 = array.array('I', (dp2_ds_idx, )) * num_sims
                chunk_results = sims, (index_1, index_2), (rec_is0, rec_is1),
                bytes_iter, file_size \
                    = anonlink.serialization.dump_candidate_pairs_iter(chunk_results)
                file_iters.append(iterable_to_stream(bytes_iter))
                file_sizes.append(file_size)

        if len(file_iters) > 1:
            # we need to merge them first into one ordered stream
            merged_file_iter, merged_file_size \
                = anonlink.serialization.merge_streams_iter(file_iters, sizes=file_sizes)
            merged_file_iter = iterable_to_stream(merged_file_iter)
        elif len(file_iters) == 1:
            merged_file_iter = file_iters[0]
            merged_file_size = file_sizes[0]
        else:
            return 0, None, None

        result_filename = Config.SIMILARITY_SCORES_FILENAME_FMT.format(
            generate_code(12))
        task_span.log_kv({"edges": num_results})
        log.info("Writing {} intermediate results to file: {}".format(
            num_results, result_filename))

        mc = connect_to_object_store()
        try:
            mc.put_object(Config.MINIO_BUCKET, result_filename,
                          merged_file_iter, merged_file_size)
        except minio.ResponseError as err:
            log.warning("Failed to store result in minio: {}".format(err))
            raise

    return num_results, merged_file_size, result_filename
Example #20
def pull_external_data(project_id, dp_id,
                       encoding_object_info,
                       blocks_object_info,
                       receipt_token, parent_span=None):
    """
    Load encoding and blocking data from object store.

    - pull blocking map into memory, create blocks in db
    - stream encodings into DB and add encoding + blocks from in memory dict.

    :param project_id: identifier for the project
    :param dp_id:
    :param encoding_object_info: a dictionary containing the bucket and path of the uploaded encodings
    :param blocks_object_info: a dictionary containing the bucket and path of the uploaded blocks
    :param receipt_token: token used to insert into database

    """
    env_credentials = parse_minio_credentials({
        'AccessKeyId': config.MINIO_ACCESS_KEY,
        'SecretAccessKey': config.MINIO_SECRET_KEY
    })
    log = logger.bind(pid=project_id, dp_id=dp_id)
    with DBConn() as conn:
        if not check_project_exists(conn, project_id):
            log.info("Project deleted, stopping immediately")
            return

        mc = connect_to_object_store(env_credentials)

    log.debug("Pulling blocking information from object store")
    response = mc.get_object(bucket_name=blocks_object_info['bucket'], object_name=blocks_object_info['path'])
    encoding_to_block_map = json.load(response)['blocks']

    log.debug("Counting the blocks")
    block_sizes = {}
    for encoding_id in encoding_to_block_map:
        _blocks = encoding_to_block_map[encoding_id]
        for block_id in _blocks:
            block_id = str(block_id)
            block_sizes[block_id] = block_sizes.setdefault(block_id, 0) + 1

    block_count = len(block_sizes)
    log.debug(f"Processing {block_count} blocks")

    # stream the encodings
    bucket_name = encoding_object_info['bucket']
    object_name = encoding_object_info['path']

    stat, encodings_stream = stat_and_stream_object(bucket_name, object_name, env_credentials)
    count = int(stat.metadata['X-Amz-Meta-Hash-Count'])
    size = int(stat.metadata['X-Amz-Meta-Hash-Size'])
    log.debug(f"Processing {count} encodings of size {size}")
    assert count == len(encoding_to_block_map), f"Expected {count} encodings in blocks got {len(encoding_to_block_map)}"

    with DBConn() as conn:
        with opentracing.tracer.start_span('update-metadata-db', child_of=parent_span):
            insert_encoding_metadata(conn, None, dp_id, receipt_token, encoding_count=count, block_count=block_count)
            update_encoding_metadata_set_encoding_size(conn, dp_id, size)
        with opentracing.tracer.start_span('create-block-entries-in-db', child_of=parent_span):
            log.debug("Adding blocks to db")
            insert_blocking_metadata(conn, dp_id, block_sizes)

        def ijson_encoding_iterator(encoding_stream):
            binary_formatter = binary_format(size)
            for encoding_id, encoding in zip(range(count), encoding_stream):
                yield (
                    str(encoding_id),
                    binary_formatter.pack(encoding_id, deserialize_bytes(encoding)),
                    encoding_to_block_map[str(encoding_id)]
                    )

        def encoding_iterator(encoding_stream):
            binary_formatter = binary_format(size)
            for encoding_id in range(count):
                yield (
                    str(encoding_id),
                    binary_formatter.pack(encoding_id, encoding_stream.read(size)),
                    encoding_to_block_map[str(encoding_id)]
                    )

        if object_name.endswith('.json'):
            encodings_stream = ijson.items(io.BytesIO(encodings_stream.data), 'clks.item')
            encoding_generator = ijson_encoding_iterator(encodings_stream)
        else:
            encoding_generator = encoding_iterator(encodings_stream)

        with opentracing.tracer.start_span('upload-encodings-to-db', child_of=parent_span):
            log.debug("Adding encodings and associated blocks to db")
            try:
                store_encodings_in_db(conn, dp_id, encoding_generator, size)
            except Exception as e:
                update_dataprovider_uploaded_state(conn, project_id, dp_id, 'error')
                log.warning(e)

        with opentracing.tracer.start_span('update-encoding-metadata', child_of=parent_span):
            update_encoding_metadata(conn, None, dp_id, 'ready')
            update_blocks_state(conn, dp_id, block_sizes.keys(), 'ready')

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id):
        logger.info("All parties data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id, serialize_span(parent_span))
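The `X-Amz-Meta-Hash-Count` and `X-Amz-Meta-Hash-Size` headers read above would have been attached as user metadata when the client uploaded the encodings. A hedged sketch of such an upload (the bucket, object name, and metadata keys are assumptions):

from minio import Minio

def upload_binary_encodings(mc: Minio, data_stream, num_bytes, count, size):
    # MinIO stores user metadata under X-Amz-Meta-<key>, which is how the
    # pull task above recovers the encoding count and size.
    mc.put_object('uploads',                      # hypothetical bucket
                  'dp1/encodings.bin',            # hypothetical object name
                  data=data_stream,
                  length=num_bytes,
                  metadata={'hash-count': str(count),
                            'hash-size': str(size)})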
Example #21
def compute_filter_similarity(chunk_info,
                              project_id,
                              run_id,
                              threshold,
                              encoding_size,
                              parent_span=None):
    """Compute filter similarity between a chunk of filters in dataprovider 1,
    and a chunk of filters in dataprovider 2.

    :param dict chunk_info:
        A chunk returned by ``anonlink.concurrency.split_to_chunks``.
    :param project_id:
    :param run_id:
    :param threshold:
    :param encoding_size: The size in bytes of each encoded entry
    :param parent_span: A serialized opentracing span context.
    :return: A 3-tuple: (num_results, result size in bytes, results_filename_in_object_store)
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    task_span = compute_filter_similarity.span

    def new_child_span(name, parent_scope=None):
        if parent_scope is None:
            parent_scope = compute_filter_similarity
        return compute_filter_similarity.tracer.start_active_span(
            name, child_of=parent_scope.span)

    log.debug("Computing similarity for a chunk of filters")
    log.debug(
        "Checking that the resource exists (in case of run being canceled/deleted)"
    )
    assert_valid_run(project_id, run_id, log)

    chunk_info_dp1, chunk_info_dp2 = chunk_info

    with DBConn() as conn:
        with new_child_span('fetching-encodings') as parent_scope:
            with new_child_span('fetching-left-encodings', parent_scope):
                log.debug(
                    "Fetching and deserializing chunk of filters for dataprovider 1"
                )
                chunk_with_ids_dp1, chunk_dp1_size = get_encoding_chunk(
                    conn, chunk_info_dp1, encoding_size)
                entity_ids_dp1, chunk_dp1 = zip(*chunk_with_ids_dp1)

            with new_child_span('fetching-right-encodings', parent_scope):
                log.debug(
                    "Fetching and deserializing chunk of filters for dataprovider 2"
                )
                chunk_with_ids_dp2, chunk_dp2_size = get_encoding_chunk(
                    conn, chunk_info_dp2, encoding_size)
                entity_ids_dp2, chunk_dp2 = zip(*chunk_with_ids_dp2)

    log.debug('Both chunks are fetched and deserialized')
    task_span.log_kv({
        'size1': chunk_dp1_size,
        'size2': chunk_dp2_size,
        'chunk_info': chunk_info
    })

    assert chunk_dp1_size > 0, "Zero sized chunk in dp1"
    assert chunk_dp2_size > 0, "Zero sized chunk in dp2"

    with new_child_span('comparing-encodings') as parent_scope:

        log.debug("Calculating filter similarity")
        with new_child_span('dice-call', parent_scope):
            try:
                sims, (rec_is0, rec_is1
                       ) = anonlink.similarities.dice_coefficient_accelerated(
                           datasets=(chunk_dp1, chunk_dp2),
                           threshold=threshold,
                           k=min(chunk_dp1_size, chunk_dp2_size))
            except NotImplementedError as e:
                log.warning("Encodings couldn't be compared using anonlink.")
                return

        with new_child_span('reindex-call', parent_scope):

            def reindex_using_encoding_ids(recordarray, encoding_id_list):
                # Map results from "index in chunk" to encoding id.
                return array.array('I',
                                   [encoding_id_list[i] for i in recordarray])

            rec_is0 = reindex_using_encoding_ids(rec_is0, entity_ids_dp1)
            rec_is1 = reindex_using_encoding_ids(rec_is1, entity_ids_dp2)

    log.debug('Encoding similarities calculated')

    with new_child_span('update-comparison-progress') as scope:
        # Update the number of comparisons completed
        comparisons_computed = chunk_dp1_size * chunk_dp2_size
        save_current_progress(comparisons_computed, run_id)
        scope.span.log_kv({
            'comparisons': comparisons_computed,
            'num_similar': len(sims)
        })
        log.debug("Comparisons: {}, Links above threshold: {}".format(
            comparisons_computed, len(sims)))

    with new_child_span('save-comparison-results-to-minio'):
        num_results = len(sims)

        if num_results:
            result_filename = Config.SIMILARITY_SCORES_FILENAME_FMT.format(
                generate_code(12))
            task_span.log_kv({"edges": num_results})
            log.info("Writing {} intermediate results to file: {}".format(
                num_results, result_filename))

            # Make index arrays for serialization
            index_1 = array.array(
                'I', (chunk_info_dp1["datasetIndex"], )) * num_results
            index_2 = array.array(
                'I', (chunk_info_dp2["datasetIndex"], )) * num_results

            chunk_results = sims, (index_1, index_2), (rec_is0, rec_is1),

            bytes_iter, file_size \
                = anonlink.serialization.dump_candidate_pairs_iter(chunk_results)
            iter_stream = iterable_to_stream(bytes_iter)

            mc = connect_to_object_store()
            try:
                mc.put_object(Config.MINIO_BUCKET, result_filename,
                              iter_stream, file_size)
            except minio.ResponseError as err:
                log.warning("Failed to store result in minio")
                raise
        else:
            result_filename = None
            file_size = None

    return num_results, file_size, result_filename
Example #22
def pull_external_data(project_id, dp_id,
                                      encoding_object_info, encoding_credentials,
                                      blocks_object_info, blocks_credentials,
                                      receipt_token, parent_span=None):
    """
    Load encoding and blocking data from object store.

    - pull blocking map into memory, create blocks in db
    - stream encodings into DB and add encoding + blocks from in memory dict.

    """
    log = logger.bind(pid=project_id, dp_id=dp_id)
    with DBConn() as conn:
        if not check_project_exists(conn, project_id):
            log.info("Project deleted, stopping immediately")
            return

        mc = connect_to_object_store(parse_minio_credentials(blocks_credentials))

    log.debug("Pulling blocking information from object store")
    response = mc.get_object(bucket_name=blocks_object_info['bucket'], object_name=blocks_object_info['path'])
    encoding_to_block_map = json.load(response)

    log.debug("Counting the blocks")
    block_sizes = {}
    for encoding_id in encoding_to_block_map:
        _blocks = encoding_to_block_map[encoding_id]
        for block_id in _blocks:
            block_id = str(block_id)
            block_sizes[block_id] = block_sizes.setdefault(block_id, 0) + 1

    block_count = len(block_sizes)
    log.debug(f"Processing {block_count} blocks")

    # stream the encodings
    bucket_name = encoding_object_info['bucket']
    object_name = encoding_object_info['path']

    stat, encodings_stream = stat_and_stream_object(bucket_name, object_name, parse_minio_credentials(encoding_credentials))
    count = int(stat.metadata['X-Amz-Meta-Hash-Count'])
    size = int(stat.metadata['X-Amz-Meta-Hash-Size'])
    log.debug(f"Processing {count} encodings of size {size}")
    assert count == len(encoding_to_block_map), f"Expected {count} encodings in blocks got {len(encoding_to_block_map)}"

    with DBConn() as conn:
        with opentracing.tracer.start_span('update-metadata-db', child_of=parent_span):
            insert_encoding_metadata(conn, None, dp_id, receipt_token, encoding_count=count, block_count=block_count)
            update_encoding_metadata_set_encoding_size(conn, dp_id, size)
        with opentracing.tracer.start_span('create-block-entries-in-db', child_of=parent_span):
            log.debug("Adding blocks to db")
            insert_blocking_metadata(conn, dp_id, block_sizes)

        def encoding_iterator(encoding_stream):
            binary_formatter = binary_format(size)
            for encoding_id in range(count):
                yield (
                    str(encoding_id),
                    binary_formatter.pack(encoding_id, encoding_stream.read(size)),
                    encoding_to_block_map[str(encoding_id)]
                    )

        with opentracing.tracer.start_span('upload-encodings-to-db', child_of=parent_span):
            log.debug("Adding encodings and associated blocks to db")
            try:
                store_encodings_in_db(conn, dp_id, encoding_iterator(encodings_stream), size)
            except Exception as e:
                update_dataprovider_uploaded_state(conn, project_id, dp_id, 'error')
                log.warning(e)

        with opentracing.tracer.start_span('update-encoding-metadata', child_of=parent_span):
            update_encoding_metadata(conn, None, dp_id, 'ready')
            update_blocks_state(conn, dp_id, block_sizes.keys(), 'ready')
Example #23
def compute_filter_similarity(chunk_info,
                              project_id,
                              run_id,
                              threshold,
                              encoding_size,
                              parent_span=None):
    """Compute filter similarity between a chunk of filters in dataprovider 1,
    and a chunk of filters in dataprovider 2.

    :param chunk_info:
        Chunk info returned by ``anonlink.concurrency.split_to_chunks``.
        Additionally, "storeFilename" is added to each dataset chunk.
    :param project_id:
    :param threshold:
    :param encoding_size: The size in bytes of each encoded entry
    :param parent_span: A serialized opentracing span context.
    :return: A 3-tuple: (num_results, result size in bytes, results_filename_in_object_store)
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Computing similarity for a chunk of filters")
    span = compute_filter_similarity.span
    log.debug(
        "Checking that the resource exists (in case of job being canceled)")
    with DBConn() as db:
        if not check_project_exists(db, project_id) or not check_run_exists(
                db, project_id, run_id):
            log.info("Failing task as project or run not found in database.")
            raise DBResourceMissing("project or run not found in database")

    chunk_info_dp1, chunk_info_dp2 = chunk_info

    t0 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 1")
    chunk_dp1, chunk_dp1_size = get_chunk_from_object_store(
        chunk_info_dp1, encoding_size)

    t1 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 2")
    chunk_dp2, chunk_dp2_size = get_chunk_from_object_store(
        chunk_info_dp2, encoding_size)
    t2 = time.time()
    span.log_kv({'event': 'chunks are fetched and deserialized'})
    log.debug("Calculating filter similarity")
    span.log_kv({'size1': chunk_dp1_size, 'size2': chunk_dp2_size})
    chunk_results = anonlink.concurrency.process_chunk(
        chunk_info, (chunk_dp1, chunk_dp2),
        anonlink.similarities.dice_coefficient_accelerated,
        threshold,
        k=min(chunk_dp1_size, chunk_dp2_size))
    t3 = time.time()
    span.log_kv({'event': 'similarities calculated'})

    # Update the number of comparisons completed
    comparisons_computed = chunk_dp1_size * chunk_dp2_size
    save_current_progress(comparisons_computed, run_id)

    t4 = time.time()

    sims, _, _ = chunk_results
    num_results = len(sims)

    if num_results:
        result_filename = Config.SIMILARITY_SCORES_FILENAME_FMT.format(
            generate_code(12))
        log.info("Writing {} intermediate results to file: {}".format(
            num_results, result_filename))

        bytes_iter, file_size \
            = anonlink.serialization.dump_candidate_pairs_iter(chunk_results)
        iter_stream = iterable_to_stream(bytes_iter)

        mc = connect_to_object_store()
        try:
            mc.put_object(Config.MINIO_BUCKET, result_filename, iter_stream,
                          file_size)
        except minio.ResponseError as err:
            log.warning("Failed to store result in minio")
            raise
    else:
        result_filename = None
        file_size = None
    t5 = time.time()

    log.info("run={} Comparisons: {}, Links above threshold: {}".format(
        run_id, comparisons_computed, len(chunk_results)))
    log.info(
        "Prep: {:.3f} + {:.3f}, Solve: {:.3f}, Progress: {:.3f}, Save: {:.3f}, Total: {:.3f}"
        .format(t1 - t0, t2 - t1, t3 - t2, t4 - t3, t5 - t4, t5 - t0))
    return num_results, file_size, result_filename
Example #24
def compute_filter_similarity(chunk_info_dp1, chunk_info_dp2, project_id, run_id, threshold, encoding_size, parent_span=None):
    """Compute filter similarity between a chunk of filters in dataprovider 1,
    and a chunk of filters in dataprovider 2.

    :param chunk_info_dp1:
        A tuple containing:
            - object store filename
            - Chunk start index
            - Chunk stop index
    :param chunk_info_dp2:
    :param project_id:
    :param threshold:
    :param encoding_size: The size in bytes of each encoded entry
    :param parent_span: A serialized opentracing span context.
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Computing similarity for a chunk of filters")
    span = compute_filter_similarity.span
    log.debug("Checking that the resource exists (in case of job being canceled)")
    with DBConn() as db:
        if not check_project_exists(db, project_id) or not check_run_exists(db, project_id, run_id):
            log.info("Stopping as project or run not found in database.")
            return None

    t0 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 1")
    chunk_dp1, chunk_dp1_size = get_chunk_from_object_store(chunk_info_dp1, encoding_size)

    t1 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 2")
    chunk_dp2, chunk_dp2_size = get_chunk_from_object_store(chunk_info_dp2, encoding_size)
    t2 = time.time()
    span.log_kv({'event': 'chunks are fetched and deserialized'})
    log.debug("Calculating filter similarity")
    span.log_kv({'size1': chunk_dp1_size, 'size2': chunk_dp2_size})
    chunk_results = anonlink.entitymatch.calculate_filter_similarity(chunk_dp1, chunk_dp2,
                                                                     threshold=threshold,
                                                                     k=min(chunk_dp1_size, chunk_dp2_size),
                                                                     use_python=False)
    t3 = time.time()
    span.log_kv({'event': 'similarities calculated'})

    # Update the number of comparisons completed
    comparisons_computed = chunk_dp1_size * chunk_dp2_size
    save_current_progress(comparisons_computed, run_id)

    t4 = time.time()

    partial_sparse_result = []
    # offset chunk's index
    offset_dp1 = chunk_info_dp1[1]
    offset_dp2 = chunk_info_dp2[1]

    log.debug("Offset DP1 by: {}, DP2 by: {}".format(offset_dp1, offset_dp2))
    for (ia, score, ib) in chunk_results:
        partial_sparse_result.append((ia + offset_dp1, ib + offset_dp2, score))

    t5 = time.time()

    num_results = len(partial_sparse_result)
    if num_results > 0:
        result_filename = 'chunk-res-{}.csv'.format(generate_code(12))
        log.info("Writing {} intermediate results to file: {}".format(num_results, result_filename))

        with open(result_filename, 'wt') as f:
            csvwriter = csv.writer(f)
            csvwriter.writerows(partial_sparse_result)

        # Now write these to the object store and return the filename and summary.
        # We write a CSV file for now.
        mc = connect_to_object_store()
        try:
            mc.fput_object(Config.MINIO_BUCKET, result_filename, result_filename)
        except minio.ResponseError as err:
            log.warning("Failed to store result in minio")
            raise

        # If we don't delete the file we *do* run out of space
        os.remove(result_filename)
    else:
        result_filename = None
    t6 = time.time()

    log.info("run={} Comparisons: {}, Links above threshold: {}".format(run_id, comparisons_computed, len(chunk_results)))
    log.info("Prep: {:.3f} + {:.3f}, Solve: {:.3f}, Progress: {:.3f}, Offset: {:.3f}, Save: {:.3f}, Total: {:.3f}".format(
        t1 - t0,
        t2 - t1,
        t3 - t2,
        t4 - t3,
        t5 - t4,
        t6 - t5,
        t6 - t0)
    )
    return num_results, result_filename
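For clarity, the offsetting step above converts chunk-local indices into positions within the full datasets; a tiny worked example with hypothetical values:

# chunk_info = (object store filename, chunk start index, chunk stop index)
chunk_info_dp1 = ('dp1.bin', 100, 200)
chunk_info_dp2 = ('dp2.bin', 500, 600)
chunk_results = [(3, 0.91, 7), (10, 0.85, 42)]    # (ia, score, ib) within the chunks

offset_dp1, offset_dp2 = chunk_info_dp1[1], chunk_info_dp2[1]
partial_sparse_result = [(ia + offset_dp1, ib + offset_dp2, score)
                         for (ia, score, ib) in chunk_results]
assert partial_sparse_result == [(103, 507, 0.91), (110, 542, 0.85)]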