Example 1
def prerun_check(project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Sanity check that we need to compute run")

    with DBConn() as conn:
        if not check_project_exists(conn, project_id):
            log.debug("Project not found. Skipping")
            raise ProjectDeleted(project_id)

        res = get_run(conn, run_id)
        if res is None:
            log.debug(f"Run not found. Skipping")
            raise RunDeleted(run_id)

        try:
            state = get_run_state_for_update(conn, run_id)
        except psycopg2.OperationalError:
            log.warning("Run started in another task. Skipping this race.")
            return

        if state in {'running', 'completed', 'error'}:
            log.warning("Run already started. Skipping")
            return

        log.debug("Setting run as in progress")
        update_run_set_started(conn, run_id)

        log.debug("Getting dp ids for compute similarity task")
        dp_ids = get_dataprovider_ids(conn, project_id)
        log.debug("Data providers: {}".format(dp_ids))

    create_comparison_jobs.delay(project_id, run_id,
                                 prerun_check.get_serialized_span())
    log.info("CLK similarity computation scheduled")
Example 2
def create_comparison_jobs(project_id, run_id, parent_span=None):
    """Schedule all the entity comparisons as sub tasks for a run.

    At a high level this task:
    - checks if the project and run have been deleted and if so aborts.
    - retrieves metadata: the number and size of the datasets, the encoding size,
      and the number and size of blocks.
    - splits the work into independent "chunks" and schedules them to run in celery
    - schedules the follow up task to run after all the comparisons have been computed.
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    current_span = create_comparison_jobs.span
    with DBConn() as conn:
        check_run_active(conn, project_id, run_id)

        dp_ids = get_dataprovider_ids(conn, project_id)
        number_of_datasets = len(dp_ids)
        assert number_of_datasets >= 2, "Expected at least 2 data providers"
        log.info(f"Scheduling comparison of CLKs from data provider ids: "
                 f"{', '.join(map(str, dp_ids))}")

        # Retrieve required metadata
        dataset_sizes, dp_block_sizes = _retrieve_blocked_dataset_sizes(
            conn, project_id, dp_ids)

        log.info("Finding blocks in common between dataproviders")
        common_blocks = _get_common_blocks(dp_block_sizes, dp_ids)

        # We pass the encoding_size and threshold to the comparison tasks to minimize their db lookups
        encoding_size = get_project_encoding_size(conn, project_id)
        threshold = get_run(conn, run_id)['threshold']

    log.debug("Chunking computation task")
    # Create "chunks" of comparisons
    chunks = _create_work_chunks(common_blocks, dp_block_sizes, dp_ids, log)

    log.info(f"Chunking into {len(chunks)} computation tasks")
    current_span.log_kv({
        "event": "chunking",
        'num_chunks': len(chunks),
        'dataset-sizes': dataset_sizes
    })
    span_serialized = create_comparison_jobs.get_serialized_span()

    # Prepare the Celery Chord that will compute all the similarity scores:
    scoring_tasks = [
        compute_filter_similarity.si(chunk_info, project_id, run_id, threshold,
                                     encoding_size, span_serialized)
        for chunk_info in chunks
    ]

    if len(scoring_tasks) == 1:
        scoring_tasks.append(celery_bug_fix.si())

    callback_task = aggregate_comparisons.s(
        project_id=project_id, run_id=run_id,
        parent_span=span_serialized).on_error(
            run_failed_handler.s(run_id=run_id))
    log.info(f"Scheduling comparison tasks")
    future = chord(scoring_tasks)(callback_task)
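The fan-out and fan-in at the end of these tasks is a standard Celery chord: a group of header tasks whose results feed one callback, with an error callback attached via on_error. A minimal, self-contained sketch of the same shape using toy task names (not the project's real tasks):

from celery import Celery, chord

app = Celery('sketch', broker='memory://', backend='cache+memory://')

@app.task
def score_chunk(chunk_id):
    # Stand-in for compute_filter_similarity: score one chunk of work.
    return chunk_id * 10

@app.task
def aggregate(results):
    # Stand-in for aggregate_comparisons: runs once every header task finishes.
    return sum(results)

@app.task
def chord_error(request, exc, traceback):
    # Stand-in for run_failed_handler / on_chord_error.
    print(f"task {request.id} failed: {exc!r}")

header = [score_chunk.si(i) for i in range(3)]
callback = aggregate.s().on_error(chord_error.s())
result = chord(header)(callback)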
Example 3
def get(project_id, run_id):
    log = logger.bind(pid=project_id, rid=run_id)
    log.info("request description of a run")
    authorize_run_detail(project_id, run_id)
    log.debug("request for run description authorized")

    with db.DBConn() as conn:
        log.debug("Retrieving run description from database")
        run_object = db.get_run(conn, run_id)

    return RunDescription().dump(run_object)
Example 4
def get(project_id, run_id):
    log = logger.bind(pid=project_id, rid=run_id)
    log.info("request description of a run")
    # Check the project and run resources exist
    abort_if_run_doesnt_exist(project_id, run_id)

    # Check the caller has a valid results token. Yes it should be renamed.
    abort_if_invalid_results_token(project_id,
                                   request.headers.get('Authorization'))

    log.info("request for run description authorized")

    db_conn = db.get_db()
    run_object = db.get_run(db_conn, run_id)

    return RunDescription().dump(run_object)
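abort_if_run_doesnt_exist and abort_if_invalid_results_token are project helpers; a rough sketch of what the token check might look like in a Flask view, with a hypothetical token_matches_project lookup (not the service's actual implementation):

from flask import abort

def token_matches_project(project_id, token):
    # Placeholder lookup; the real service compares against the results
    # token stored for the project in its database.
    ...

def abort_if_invalid_results_token(project_id, token):
    # Hypothetical: reject the request unless the Authorization header
    # carries the results token stored for this project.
    if token is None or not token_matches_project(project_id, token):
        abort(403, description="Invalid results token")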
Example 5
def prerun_check(project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Sanity check that we need to compute run")

    # being very defensive here checking if the run state is already in the redis cache
    if not is_run_missing(run_id):
        log.warning(
            "unexpectedly the run state is present in redis before starting")
        return

    with DBConn() as conn:
        if not check_project_exists(conn, project_id):
            log.debug("Project not found. Skipping")
            raise ProjectDeleted(project_id)

        res = get_run(conn, run_id)
        if res is None:
            log.debug(f"Run not found. Skipping")
            raise RunDeleted(run_id)

        try:
            db_state = get_run_state_for_update(conn, run_id)
        except psycopg2.OperationalError:
            log.warning("Run started in another task. Skipping this race.")
            return

        if db_state in {'running', 'completed', 'error'}:
            log.warning("Run already started. Skipping")
            return

        log.debug("Setting run state in db as 'running'")
        update_run_set_started(conn, run_id)

        log.debug("Updating redis cache for run")
        set_run_state_active(run_id)

    create_comparison_jobs.apply_async(
        kwargs={
            'project_id': project_id,
            'run_id': run_id,
            'parent_span': prerun_check.get_serialized_span()
        },
        link_error=run_failed_handler.s())
    log.info("CLK similarity computation scheduled")
Example 6
def create_comparison_jobs(project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    with DBConn() as conn:

        dp_ids = get_dataprovider_ids(conn, project_id)
        assert len(dp_ids) >= 2, "Expected at least 2 data providers"
        log.info(f"Starting comparison of CLKs from data provider ids: "
                 f"{', '.join(map(str, dp_ids))}")
        current_span = create_comparison_jobs.span

        if not check_project_exists(conn, project_id) or not check_run_exists(
                conn, project_id, run_id):
            log.info("Skipping as project or run not found in database.")
            return

        run_info = get_run(conn, run_id)
        threshold = run_info['threshold']

        dataset_sizes = get_project_dataset_sizes(conn, project_id)

        if len(dataset_sizes) < 2:
            log.warning("Unexpected number of dataset sizes in db. Stopping")
            update_run_mark_failure(conn, run_id)
            return

        encoding_size = get_project_encoding_size(conn, project_id)

        log.info(f"Computing similarity for "
                 f"{' x '.join(map(str, dataset_sizes))} entities")
        current_span.log_kv({"event": 'get-dataset-sizes'})

        filters_object_filenames = tuple(
            get_filter_metadata(conn, dp_id) for dp_id in dp_ids)
        current_span.log_kv({"event": 'get-metadata'})

        log.debug("Chunking computation task")

    chunk_infos = tuple(
        anonlink.concurrency.split_to_chunks(Config.CHUNK_SIZE_AIM,
                                             dataset_sizes=dataset_sizes))

    # Save filenames with chunk information.
    for chunk_info in chunk_infos:
        for chunk_dp_info in chunk_info:
            chunk_dp_index = chunk_dp_info['datasetIndex']
            chunk_dp_store_filename = filters_object_filenames[chunk_dp_index]
            chunk_dp_info['storeFilename'] = chunk_dp_store_filename

    log.info(f"Chunking into {len(chunk_infos)} computation tasks")
    current_span.log_kv({"event": "chunking", 'num_chunks': len(chunk_infos)})
    span_serialized = create_comparison_jobs.get_serialized_span()

    # Prepare the Celery Chord that will compute all the similarity scores:
    scoring_tasks = [
        compute_filter_similarity.si(chunk_info, project_id, run_id, threshold,
                                     encoding_size, span_serialized)
        for chunk_info in chunk_infos
    ]

    if len(scoring_tasks) == 1:
        scoring_tasks.append(celery_bug_fix.si())

    callback_task = aggregate_comparisons.s(
        project_id, run_id,
        parent_span=span_serialized).on_error(on_chord_error.s(run_id=run_id))
    future = chord(scoring_tasks)(callback_task)
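In Examples 5 and 6 the chunking is delegated to anonlink.concurrency.split_to_chunks, and the loop above decorates each per-dataset chunk descriptor with the object-store filename of its encodings. A rough illustration of the shape that loop expects; the exact fields produced by anonlink may differ from this made-up example:

# Hypothetical chunk layout: one dict per data provider, carrying the
# dataset index that the filename lookup below relies on.
chunk_infos = (
    ({'datasetIndex': 0, 'range': (0, 20000)},
     {'datasetIndex': 1, 'range': (0, 20000)}),
)
filters_object_filenames = ('dp0_filters.bin', 'dp1_filters.bin')

for chunk_info in chunk_infos:
    for chunk_dp_info in chunk_info:
        chunk_dp_info['storeFilename'] = \
            filters_object_filenames[chunk_dp_info['datasetIndex']]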
Example 7
def create_comparison_jobs(project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    with DBConn() as conn:

        dp_ids = get_dataprovider_ids(conn, project_id)
        assert len(dp_ids) >= 2, "Expected at least 2 data providers"
        log.info("Starting comparison of CLKs from data provider ids: {}, {}".format(dp_ids[0], dp_ids[1]))
        current_span = create_comparison_jobs.span

        if not check_project_exists(conn, project_id) or not check_run_exists(conn, project_id, run_id):
            log.info("Skipping as project or run not found in database.")
            return

        run_info = get_run(conn, run_id)
        threshold = run_info['threshold']

        dataset_sizes = get_project_dataset_sizes(conn, project_id)

        if len(dataset_sizes) < 2:
            log.warning("Unexpected number of dataset sizes in db. Stopping")
            update_run_mark_failure(conn, run_id)
            return
        else:
            lenf1, lenf2 = dataset_sizes

        encoding_size = get_project_encoding_size(conn, project_id)

        size = lenf1 * lenf2

        log.info("Computing similarity for {} x {} entities".format(lenf1, lenf2))
        current_span.log_kv({"event": 'get-dataset-sizes'})

        filters1_object_filename = get_filter_metadata(conn, dp_ids[0])
        filters2_object_filename = get_filter_metadata(conn, dp_ids[1])
        current_span.log_kv({"event": 'get-metadata'})

        log.debug("Chunking computation task")
        chunk_size = Config.get_task_chunk_size(size, threshold)
        if chunk_size is None:
            chunk_size = max(lenf1, lenf2)
        log.info("Chunks will contain {} entities per task".format(chunk_size))
        update_run_chunk(conn, project_id, chunk_size)
    job_chunks = []

    dp1_chunks = []
    dp2_chunks = []

    for chunk_start_index_dp1 in range(0, lenf1, chunk_size):
        dp1_chunks.append(
            (filters1_object_filename, chunk_start_index_dp1, min(chunk_start_index_dp1 + chunk_size, lenf1))
        )
    for chunk_start_index_dp2 in range(0, lenf2, chunk_size):
        dp2_chunks.append(
            (filters2_object_filename, chunk_start_index_dp2, min(chunk_start_index_dp2 + chunk_size, lenf2))
        )

    # Every chunk in dp1 has to be run against every chunk in dp2
    for dp1_chunk in dp1_chunks:
        for dp2_chunk in dp2_chunks:
            job_chunks.append((dp1_chunk, dp2_chunk, ))

    log.info("Chunking into {} computation tasks each with (at most) {} entities.".format(
        len(job_chunks), chunk_size))
    current_span.log_kv({"event": "chunking", "chunksize": chunk_size, 'num_chunks': len(job_chunks)})
    span_serialized = create_comparison_jobs.get_serialized_span()

    # Prepare the Celery Chord that will compute all the similarity scores:
    scoring_tasks = [compute_filter_similarity.si(
        chunk_dp1,
        chunk_dp2,
        project_id,
        run_id,
        threshold,
        encoding_size,
        span_serialized
    ) for chunk_dp1, chunk_dp2 in job_chunks]

    if len(scoring_tasks) == 1:
        scoring_tasks.append(celery_bug_fix.si())

    callback_task = aggregate_comparisons.s(project_id, run_id, parent_span=span_serialized).on_error(
        on_chord_error.s(run_id=run_id))
    future = chord(scoring_tasks)(callback_task)
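The hand-rolled chunking in Example 7 is a plain cross product: every chunk of the first dataset is compared with every chunk of the second, so the number of comparison tasks is ceil(lenf1 / chunk_size) * ceil(lenf2 / chunk_size). A small worked example with made-up sizes:

import math

# Made-up sizes, for illustration only.
lenf1, lenf2, chunk_size = 1000, 500, 200

dp1_chunks = [(start, min(start + chunk_size, lenf1))
              for start in range(0, lenf1, chunk_size)]  # 5 chunks
dp2_chunks = [(start, min(start + chunk_size, lenf2))
              for start in range(0, lenf2, chunk_size)]  # 3 chunks

num_tasks = len(dp1_chunks) * len(dp2_chunks)
assert num_tasks == math.ceil(lenf1 / chunk_size) * math.ceil(lenf2 / chunk_size)
print(num_tasks)  # 15 tasks, each comparing at most 200 x 200 entities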