def abort_if_run_doesnt_exist(project_id, run_id):
    with DBConn() as conn:
        resource_exists = db.check_run_exists(conn, project_id, run_id)
    if not resource_exists:
        logger.info(
            "Requested project or run resource with invalid identifier token")
        safe_fail_request(403, message=INVALID_ACCESS_MSG)
def assert_valid_run(project_id, run_id, log):
    if not is_run_active(run_id):
        raise InactiveRun("Run isn't marked as active")

    with DBConn() as db:
        if not check_project_exists(db, project_id) or not check_run_exists(
                db, project_id, run_id):
            log.info("Project or run not found in database.")
            raise DBResourceMissing("project or run not found in database")
Example #3
0
def check_run_active(conn, project_id, run_id):
    """Raises InactiveRun if the project or run has been deleted from the database.
    """
    if not check_project_exists(conn, project_id) or not check_run_exists(
            conn, project_id, run_id):
        raise InactiveRun("Skipping as project or run not found in database.")
def create_comparison_jobs(project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    with DBConn() as conn:

        dp_ids = get_dataprovider_ids(conn, project_id)
        assert len(dp_ids) >= 2, "Expected at least 2 data providers"
        log.info(f"Starting comparison of CLKs from data provider ids: "
                 f"{', '.join(map(str, dp_ids))}")
        current_span = create_comparison_jobs.span

        if not check_project_exists(conn, project_id) or not check_run_exists(
                conn, project_id, run_id):
            log.info("Skipping as project or run not found in database.")
            return

        run_info = get_run(conn, run_id)
        threshold = run_info['threshold']

        dataset_sizes = get_project_dataset_sizes(conn, project_id)

        if len(dataset_sizes) < 2:
            log.warning("Unexpected number of dataset sizes in db. Stopping")
            update_run_mark_failure(conn, run_id)
            return

        encoding_size = get_project_encoding_size(conn, project_id)

        log.info(f"Computing similarity for "
                 f"{' x '.join(map(str, dataset_sizes))} entities")
        current_span.log_kv({"event": 'get-dataset-sizes'})

        filters_object_filenames = tuple(
            get_filter_metadata(conn, dp_id) for dp_id in dp_ids)
        current_span.log_kv({"event": 'get-metadata'})

        log.debug("Chunking computation task")

    chunk_infos = tuple(
        anonlink.concurrency.split_to_chunks(Config.CHUNK_SIZE_AIM,
                                             dataset_sizes=dataset_sizes))

    # Save filenames with chunk information.
    for chunk_info in chunk_infos:
        for chunk_dp_info in chunk_info:
            chunk_dp_index = chunk_dp_info['datasetIndex']
            chunk_dp_store_filename = filters_object_filenames[chunk_dp_index]
            chunk_dp_info['storeFilename'] = chunk_dp_store_filename

    log.info(f"Chunking into {len(chunk_infos)} computation tasks")
    current_span.log_kv({"event": "chunking", 'num_chunks': len(chunk_infos)})
    span_serialized = create_comparison_jobs.get_serialized_span()

    # Prepare the Celery Chord that will compute all the similarity scores:
    scoring_tasks = [
        compute_filter_similarity.si(chunk_info, project_id, run_id, threshold,
                                     encoding_size, span_serialized)
        for chunk_info in chunk_infos
    ]

    if len(scoring_tasks) == 1:
        scoring_tasks.append(celery_bug_fix.si())

    callback_task = aggregate_comparisons.s(
        project_id, run_id,
        parent_span=span_serialized).on_error(on_chord_error.s(run_id=run_id))
    future = chord(scoring_tasks)(callback_task)
def compute_filter_similarity(chunk_info,
                              project_id,
                              run_id,
                              threshold,
                              encoding_size,
                              parent_span=None):
    """Compute filter similarity between a chunk of filters in dataprovider 1,
    and a chunk of filters in dataprovider 2.

    :param chunk_info:
        Chunk info returned by ``anonlink.concurrency.split_to_chunks``.
        Additionally, "storeFilename" is added to each dataset chunk.
    :param project_id:
    :param threshold:
    :param encoding_size: The size in bytes of each encoded entry
    :param parent_span: A serialized opentracing span context.
    @returns A 2-tuple: (num_results, results_filename_in_object_store)
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Computing similarity for a chunk of filters")
    span = compute_filter_similarity.span
    log.debug(
        "Checking that the resource exists (in case of job being canceled)")
    with DBConn() as db:
        if not check_project_exists(db, project_id) or not check_run_exists(
                db, project_id, run_id):
            log.info("Failing task as project or run not found in database.")
            raise DBResourceMissing("project or run not found in database")

    chunk_info_dp1, chunk_info_dp2 = chunk_info

    t0 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 1")
    chunk_dp1, chunk_dp1_size = get_chunk_from_object_store(
        chunk_info_dp1, encoding_size)

    t1 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 2")
    chunk_dp2, chunk_dp2_size = get_chunk_from_object_store(
        chunk_info_dp2, encoding_size)
    t2 = time.time()
    span.log_kv({'event': 'chunks are fetched and deserialized'})
    log.debug("Calculating filter similarity")
    span.log_kv({'size1': chunk_dp1_size, 'size2': chunk_dp2_size})
    chunk_results = anonlink.concurrency.process_chunk(
        chunk_info, (chunk_dp1, chunk_dp2),
        anonlink.similarities.dice_coefficient_accelerated,
        threshold,
        k=min(chunk_dp1_size, chunk_dp2_size))
    t3 = time.time()
    span.log_kv({'event': 'similarities calculated'})

    # Update the number of comparisons completed
    comparisons_computed = chunk_dp1_size * chunk_dp2_size
    save_current_progress(comparisons_computed, run_id)

    t4 = time.time()

    sims, _, _ = chunk_results
    num_results = len(sims)

    if num_results:
        result_filename = Config.SIMILARITY_SCORES_FILENAME_FMT.format(
            generate_code(12))
        log.info("Writing {} intermediate results to file: {}".format(
            num_results, result_filename))

        bytes_iter, file_size \
            = anonlink.serialization.dump_candidate_pairs_iter(chunk_results)
        iter_stream = iterable_to_stream(bytes_iter)

        mc = connect_to_object_store()
        try:
            mc.put_object(Config.MINIO_BUCKET, result_filename, iter_stream,
                          file_size)
        except minio.ResponseError as err:
            log.warning("Failed to store result in minio")
            raise
    else:
        result_filename = None
        file_size = None
    t5 = time.time()

    log.info("run={} Comparisons: {}, Links above threshold: {}".format(
        run_id, comparisons_computed, len(chunk_results)))
    log.info(
        "Prep: {:.3f} + {:.3f}, Solve: {:.3f}, Progress: {:.3f}, Save: {:.3f}, Total: {:.3f}"
        .format(t1 - t0, t2 - t1, t3 - t2, t4 - t3, t5 - t4, t5 - t0))
    return num_results, file_size, result_filename
Example #6
0
def create_comparison_jobs(project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    with DBConn() as conn:

        dp_ids = get_dataprovider_ids(conn, project_id)
        assert len(dp_ids) >= 2, "Expected at least 2 data providers"
        log.info("Starting comparison of CLKs from data provider ids: {}, {}".format(dp_ids[0], dp_ids[1]))
        current_span = create_comparison_jobs.span

        if not check_project_exists(conn, project_id) or not check_run_exists(conn, project_id, run_id):
            log.info("Skipping as project or run not found in database.")
            return

        run_info = get_run(conn, run_id)
        threshold = run_info['threshold']

        dataset_sizes = get_project_dataset_sizes(conn, project_id)

        if len(dataset_sizes) < 2:
            log.warning("Unexpected number of dataset sizes in db. Stopping")
            update_run_mark_failure(conn, run_id)
            return
        else:
            lenf1, lenf2 = dataset_sizes

        encoding_size = get_project_encoding_size(conn, project_id)

        size = lenf1 * lenf2

        log.info("Computing similarity for {} x {} entities".format(lenf1, lenf2))
        current_span.log_kv({"event": 'get-dataset-sizes'})

        filters1_object_filename = get_filter_metadata(conn, dp_ids[0])
        filters2_object_filename = get_filter_metadata(conn, dp_ids[1])
        current_span.log_kv({"event": 'get-metadata'})

        log.debug("Chunking computation task")
        chunk_size = Config.get_task_chunk_size(size, threshold)
        if chunk_size is None:
            chunk_size = max(lenf1, lenf2)
        log.info("Chunks will contain {} entities per task".format(chunk_size))
        update_run_chunk(conn, project_id, chunk_size)
    job_chunks = []

    dp1_chunks = []
    dp2_chunks = []

    for chunk_start_index_dp1 in range(0, lenf1, chunk_size):
        dp1_chunks.append(
            (filters1_object_filename, chunk_start_index_dp1, min(chunk_start_index_dp1 + chunk_size, lenf1))
        )
    for chunk_start_index_dp2 in range(0, lenf2, chunk_size):
        dp2_chunks.append(
            (filters2_object_filename, chunk_start_index_dp2, min(chunk_start_index_dp2 + chunk_size, lenf2))
        )

    # Every chunk in dp1 has to be run against every chunk in dp2
    for dp1_chunk in dp1_chunks:
        for dp2_chunk in dp2_chunks:
            job_chunks.append((dp1_chunk, dp2_chunk, ))

    log.info("Chunking into {} computation tasks each with (at most) {} entities.".format(
        len(job_chunks), chunk_size))
    current_span.log_kv({"event": "chunking", "chunksize": chunk_size, 'num_chunks': len(job_chunks)})
    span_serialized = create_comparison_jobs.get_serialized_span()

    # Prepare the Celery Chord that will compute all the similarity scores:
    scoring_tasks = [compute_filter_similarity.si(
        chunk_dp1,
        chunk_dp2,
        project_id,
        run_id,
        threshold,
        encoding_size,
        span_serialized
    ) for chunk_dp1, chunk_dp2 in job_chunks]

    if len(scoring_tasks) == 1:
        scoring_tasks.append(celery_bug_fix.si())

    callback_task = aggregate_comparisons.s(project_id, run_id, parent_span=span_serialized).on_error(
        on_chord_error.s(run_id=run_id))
    future = chord(scoring_tasks)(callback_task)
Example #7
0
def compute_filter_similarity(chunk_info_dp1, chunk_info_dp2, project_id, run_id, threshold, encoding_size, parent_span=None):
    """Compute filter similarity between a chunk of filters in dataprovider 1,
    and a chunk of filters in dataprovider 2.

    :param chunk_info_dp1:
        A tuple containing:
            - object store filename
            - Chunk start index
            - Chunk stop index
    :param chunk_info_dp2:
    :param project_id:
    :param threshold:
    :param encoding_size: The size in bytes of each encoded entry
    :param parent_span: A serialized opentracing span context.
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Computing similarity for a chunk of filters")
    span = compute_filter_similarity.span
    log.debug("Checking that the resource exists (in case of job being canceled)")
    with DBConn() as db:
        if not check_project_exists(db, project_id) or not check_run_exists(db, project_id, run_id):
            log.info("Stopping as project or run not found in database.")
            return None

    t0 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 1")
    chunk_dp1, chunk_dp1_size = get_chunk_from_object_store(chunk_info_dp1, encoding_size)

    t1 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 2")
    chunk_dp2, chunk_dp2_size = get_chunk_from_object_store(chunk_info_dp2, encoding_size)
    t2 = time.time()
    span.log_kv({'event': 'chunks are fetched and deserialized'})
    log.debug("Calculating filter similarity")
    span.log_kv({'size1': chunk_dp1_size, 'size2': chunk_dp2_size})
    chunk_results = anonlink.entitymatch.calculate_filter_similarity(chunk_dp1, chunk_dp2,
                                                                     threshold=threshold,
                                                                     k=min(chunk_dp1_size, chunk_dp2_size),
                                                                     use_python=False)
    t3 = time.time()
    span.log_kv({'event': 'similarities calculated'})

    # Update the number of comparisons completed
    comparisons_computed = chunk_dp1_size * chunk_dp2_size
    save_current_progress(comparisons_computed, run_id)

    t4 = time.time()

    partial_sparse_result = []
    # offset chunk's index
    offset_dp1 = chunk_info_dp1[1]
    offset_dp2 = chunk_info_dp2[1]

    log.debug("Offset DP1 by: {}, DP2 by: {}".format(offset_dp1, offset_dp2))
    for (ia, score, ib) in chunk_results:
        partial_sparse_result.append((ia + offset_dp1, ib + offset_dp2, score))

    t5 = time.time()

    num_results = len(partial_sparse_result)
    if num_results > 0:
        result_filename = 'chunk-res-{}.csv'.format(generate_code(12))
        log.info("Writing {} intermediate results to file: {}".format(num_results, result_filename))

        with open(result_filename, 'wt') as f:
            csvwriter = csv.writer(f)
            csvwriter.writerows(partial_sparse_result)

        # Now write these to the object store. and return the filename and summary
        # Will write a csv file for now
        mc = connect_to_object_store()
        try:
            mc.fput_object(Config.MINIO_BUCKET, result_filename, result_filename)
        except minio.ResponseError as err:
            log.warning("Failed to store result in minio")
            raise

        # If we don't delete the file we *do* run out of space
        os.remove(result_filename)
    else:
        result_filename = None
    t6 = time.time()

    log.info("run={} Comparisons: {}, Links above threshold: {}".format(run_id, comparisons_computed, len(chunk_results)))
    log.info("Prep: {:.3f} + {:.3f}, Solve: {:.3f}, Progress: {:.3f}, Offset: {:.3f}, Save: {:.3f}, Total: {:.3f}".format(
        t1 - t0,
        t2 - t1,
        t3 - t2,
        t4 - t3,
        t4 - t4,
        t6 - t5,
        t6 - t0)
    )
    return num_results, result_filename