Example #1
def upload_clk_data_binary(project_id, dp_id, raw_stream, count, size=128):
    """
    Save the user provided raw CLK data.

    """
    receipt_token = generate_code()
    filename = Config.BIN_FILENAME_FMT.format(receipt_token)
    # Set the state to 'pending' in the bloomingdata table
    with DBConn() as conn:
        db.insert_encoding_metadata(conn, filename, dp_id, receipt_token,
                                    count)
        db.update_encoding_metadata_set_encoding_size(conn, dp_id, size)
    logger.info(
        f"Storing supplied binary clks of individual size {size} in file: {filename}"
    )

    num_bytes = count * (size + 6)

    logger.debug(
        "Directly storing binary file with index, base64 encoded CLK, popcount"
    )

    # Upload to object store
    logger.info(
        f"Uploading {count} binary encodings to object store. Total size: {fmt_bytes(num_bytes)}"
    )
    parent_span = g.flask_tracer.get_span()

    with opentracing.tracer.start_span('save-to-minio',
                                       child_of=parent_span) as span:
        mc = connect_to_object_store()
        try:
            mc.put_object(Config.MINIO_BUCKET,
                          filename,
                          data=raw_stream,
                          length=num_bytes)
        except (minio.error.InvalidSizeError, minio.error.InvalidArgumentError,
                minio.error.ResponseError):
            logger.info(
                "Mismatch between expected stream length and header info")
            raise ValueError(
                "Mismatch between expected stream length and header info")

    with opentracing.tracer.start_span('update-database',
                                       child_of=parent_span) as span:
        with DBConn() as conn:
            db.update_encoding_metadata(conn, filename, dp_id, 'ready')
            db.set_dataprovider_upload_state(conn, dp_id, True)

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id):
        logger.info("All parties data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id,
                                        serialize_span(parent_span))

    return receipt_token
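upload_clk_data_binary expects the raw stream to be exactly count * (size + 6) bytes; a shorter or longer stream makes Minio raise, which the function converts into a ValueError. A minimal caller-side sketch of that pre-check (prepare_binary_upload is a hypothetical helper, not part of the service):

from io import BytesIO

def prepare_binary_upload(payload: bytes, count: int, size: int = 128) -> BytesIO:
    # Each encoding occupies size + 6 bytes in the packed binary format used above.
    expected = count * (size + 6)
    if len(payload) != expected:
        raise ValueError(
            f"Expected {expected} bytes for {count} encodings of {size} B, got {len(payload)}")
    return BytesIO(payload)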
Example #2
def prerun_check(project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Sanity check that we need to compute run")

    with DBConn() as conn:
        if not check_project_exists(conn, project_id):
            log.debug("Project not found. Skipping")
            raise ProjectDeleted(project_id)

        res = get_run(conn, run_id)
        if res is None:
            log.debug(f"Run not found. Skipping")
            raise RunDeleted(run_id)

        try:
            state = get_run_state_for_update(conn, run_id)
        except psycopg2.OperationalError:
            log.warning("Run started in another task. Skipping this race.")
            return

        if state in {'running', 'completed', 'error'}:
            log.warning("Run already started. Skipping")
            return

        log.debug("Setting run as in progress")
        update_run_set_started(conn, run_id)

        log.debug("Getting dp ids for compute similarity task")
        dp_ids = get_dataprovider_ids(conn, project_id)
        log.debug("Data providers: {}".format(dp_ids))

    create_comparison_jobs.delay(project_id, run_id,
                                 prerun_check.get_serialized_span())
    log.info("CLK similarity computation scheduled")
Example #3
def create_comparison_jobs(project_id, run_id, parent_span=None):
    """Schedule all the entity comparisons as sub tasks for a run.

    At a high level this task:
    - checks if the project and run have been deleted and if so aborts.
    - retrieves metadata: the number and size of the datasets, the encoding size,
      and the number and size of blocks.
    - splits the work into independent "chunks" and schedules them to run in celery
    - schedules the follow up task to run after all the comparisons have been computed.
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    current_span = create_comparison_jobs.span
    with DBConn() as conn:
        check_run_active(conn, project_id, run_id)

        dp_ids = get_dataprovider_ids(conn, project_id)
        number_of_datasets = len(dp_ids)
        assert number_of_datasets >= 2, "Expected at least 2 data providers"
        log.info(f"Scheduling comparison of CLKs from data provider ids: "
                 f"{', '.join(map(str, dp_ids))}")

        # Retrieve required metadata
        dataset_sizes, dp_block_sizes = _retrieve_blocked_dataset_sizes(
            conn, project_id, dp_ids)

        log.info("Finding blocks in common between dataproviders")
        common_blocks = _get_common_blocks(dp_block_sizes, dp_ids)

        # We pass the encoding_size and threshold to the comparison tasks to minimize their db lookups
        encoding_size = get_project_encoding_size(conn, project_id)
        threshold = get_run(conn, run_id)['threshold']

    log.debug("Chunking computation task")
    # Create "chunks" of comparisons
    chunks = _create_work_chunks(common_blocks, dp_block_sizes, dp_ids, log)

    log.info(f"Chunking into {len(chunks)} computation tasks")
    current_span.log_kv({
        "event": "chunking",
        'num_chunks': len(chunks),
        'dataset-sizes': dataset_sizes
    })
    span_serialized = create_comparison_jobs.get_serialized_span()

    # Prepare the Celery Chord that will compute all the similarity scores:
    scoring_tasks = [
        compute_filter_similarity.si(chunk_info, project_id, run_id, threshold,
                                     encoding_size, span_serialized)
        for chunk_info in chunks
    ]

    if len(scoring_tasks) == 1:
        scoring_tasks.append(celery_bug_fix.si())

    callback_task = aggregate_comparisons.s(
        project_id=project_id, run_id=run_id,
        parent_span=span_serialized).on_error(
            run_failed_handler.s(run_id=run_id))
    log.info(f"Scheduling comparison tasks")
    future = chord(scoring_tasks)(callback_task)
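The fan-out/fan-in above is a standard Celery chord: a header of immutable .si() signatures plus a callback that receives the list of all their results. A self-contained sketch of the same pattern with placeholder task names and an illustrative broker URL:

from celery import Celery, chord

app = Celery('sketch', broker='redis://localhost:6379/0',
             backend='redis://localhost:6379/0')

@app.task
def compare_chunk(chunk_id):
    # Stand-in for compute_filter_similarity: produce one result per chunk.
    return chunk_id * 2

@app.task
def aggregate(results):
    # Stand-in for aggregate_comparisons: receives all chunk results at once.
    return sum(results)

if __name__ == '__main__':
    # .si() keeps each signature immutable, so chunk results flow only to the
    # chord callback rather than being chained into the next task.
    chord([compare_chunk.si(i) for i in range(4)])(aggregate.s())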
Example #4
def save_and_permute(similarity_result, project_id, run_id, parent_span):
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Saving and possibly permuting data")
    mapping = similarity_result['mapping']

    # Note Postgres requires JSON object keys to be strings
    # Celery actually converts the json arguments in the same way

    with DBConn() as db:
        result_type = get_project_column(db, project_id, 'result_type')

        # Just save the raw "mapping"
        log.debug("Saving the resulting map data to the db")
        result_id = insert_mapping_result(db, run_id, mapping)
        dp_ids = get_dataprovider_ids(db, project_id)

    log.info("Mapping result saved to db with result id {}".format(result_id))

    if result_type == "permutations":
        log.debug("Submitting job to permute mapping")
        permute_mapping_data.apply_async(
            (project_id, run_id,
             similarity_result['lenf1'], similarity_result['lenf2'],
             save_and_permute.get_serialized_span()))
    else:
        log.debug("Mark mapping job as complete")
        mark_run_complete.delay(run_id, save_and_permute.get_serialized_span())

    # Post similarity computation cleanup
    log.debug("Removing clk filters from redis cache")

    for dp_id in dp_ids:
        cache.remove_from_cache(dp_id)
    calculate_comparison_rate.delay()
Example #5
def projects_post(project):
    """Create a new project

    There are multiple result types, see documentation for how these affect information leakage
    and the resulting data.
    """
    logger.debug("Processing request to add a new project", project=project)
    try:
        project_model = models.Project.from_json(project)
    except models.InvalidProjectParametersException as e:
        logger.info(f"Denied request to add a new project - {e.msg}",
                    project=project)
        safe_fail_request(400, message=e.msg)

    # Persist the new project
    log = logger.bind(pid=project_model.project_id)
    log.info("Adding new project to database")
    try:
        with DBConn() as conn:
            project_model.save(conn)
    except Exception as e:
        log.warn(e)
        safe_fail_request(500, 'Problem creating new project')

    return NewProjectResponse().dump(project_model), 201
Example #6
def check_for_executable_runs(project_id, parent_span=None):
    """
    This is called when a run is posted (if the project is ready for runs), and also
    after all data providers have uploaded CLKs and the CLKs are ready.
    """
    log = logger.bind(pid=project_id)
    log.debug("Checking for runs that need to be executed")
    if not clks_uploaded_to_project(project_id, check_data_ready=True):
        return

    with DBConn() as conn:
        try:
            check_and_set_project_encoding_size(project_id, conn)
        except ValueError as e:
            log.warning(e.args[0])
            # make sure this error can be exposed to user by marking the run/s as failed
            update_project_mark_all_runs_failed(conn, project_id)
            return
        new_runs = get_created_runs_and_queue(conn, project_id)

        log.debug("Progressing run stages")
        for qr in new_runs:
            # Record that the run has reached a new stage
            run_id = qr[0]
            progress_stage(conn, run_id)

    # commit db changes before scheduling following tasks
    log.debug("Creating tasks for {} created runs for project {}".format(len(new_runs), project_id))
    for qr in new_runs:
        run_id = qr[0]
        log.info('Queueing run for computation', run_id=run_id)
        prerun_check.delay(project_id, run_id, check_for_executable_runs.get_serialized_span())
Example #7
def get_deserialized_filter(dp_id):
    """Cached, deserialized version.
    """
    logger.debug("Getting filters")
    key = 'clk-pkl-{}'.format(dp_id)
    r = connect_to_redis(read_only=True)

    # Check whether the filters for this dp_id are already cached in redis
    if r.exists(key):
        logger.debug("returning filters from cache")
        return pickle.loads(r.get(key))
    else:
        logger.debug("Looking up popcounts and filename from database")
        with DBConn() as db:
            serialized_filters_file, encoding_size = get_filter_metadata(
                db, dp_id)
        mc = connect_to_object_store()
        logger.debug("Getting filters from object store")

        # Note this uses already calculated popcounts unlike
        # serialization.deserialize_filters()
        raw_data_response = mc.get_object(config.MINIO_BUCKET,
                                          serialized_filters_file)
        python_filters = binary_unpack_filters(
            raw_data_response.stream(encoding_size))

        set_deserialized_filter(dp_id, python_filters)
        return python_filters
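get_deserialized_filter is a cache-aside lookup: try redis, fall back to the object store, then repopulate the cache via set_deserialized_filter. A generic sketch of the same pattern (the connection and TTL here are illustrative, not the service's configuration):

import pickle

import redis

r = redis.Redis()  # illustrative; the service uses connect_to_redis()

def cached_get(key, load_from_source, ttl_seconds=3600):
    # Cache-aside: return the cached value if present, otherwise load and cache it.
    raw = r.get(key)
    if raw is not None:
        return pickle.loads(raw)
    value = load_from_source()
    r.set(key, pickle.dumps(value), ex=ttl_seconds)
    return value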
def abort_if_invalid_dataprovider_token(update_token):
    logger.debug("checking authorization token to upload data")
    with DBConn() as conn:
        resource_exists = db.check_update_auth(conn, update_token)
    if not resource_exists:
        logger.debug("authorization token invalid")
        safe_fail_request(403, message=INVALID_ACCESS_MSG)
def abort_if_run_doesnt_exist(project_id, run_id):
    with DBConn() as conn:
        resource_exists = db.check_run_exists(conn, project_id, run_id)
    if not resource_exists:
        logger.info(
            "Requested project or run resource with invalid identifier token")
        safe_fail_request(403, message=INVALID_ACCESS_MSG)
def get_authorization_token_type_or_abort(project_id, token):
    """
    In case of a permutation with an unencrypted mask, we are using both the result token and the receipt tokens.
    The result token reveals the mask. The receipt tokens are used by the data providers to get their permutations.
    However, we do not know the type of token we have before checking.
    """
    logger.debug("checking if provided authorization is a results_token")
    # If the token is not a valid result token, it should be a receipt token.
    if not is_results_token_valid(project_id, token):
        logger.debug("checking if provided authorization is receipt_token")
        # If the token is not a valid receipt token, we abort.
        if not is_receipt_token_valid(project_id, token):
            safe_fail_request(403, message=INVALID_ACCESS_MSG)
        token_type = 'receipt_token'
    else:
        token_type = 'result_token'

    # Note that at this stage we have EITHER a receipt or result token, and depending on the result_type
    # that might mean the caller is not authorized.
    with DBConn() as conn:
        result_type = get_project_column(conn, project_id, 'result_type')
    if result_type in {'groups', 'similarity_scores'} and token_type == 'receipt_token':
        logger.info("Caller provided receipt token to get results")
        safe_fail_request(403, message=INVALID_ACCESS_MSG)
    return token_type
Example #11
def calculate_comparison_rate():
    with DBConn() as dbinstance:
        logger.info("Calculating global comparison rate")

        total_comparisons = 0
        total_time = timedelta(0)
        for run in get_elapsed_run_times(dbinstance):

            comparisons = get_total_comparisons_for_project(dbinstance, run['project_id'])

            if comparisons != 'NA':
                total_comparisons += comparisons
            else:
                logger.debug("Skipping run as it hasn't completed")
            total_time += run['elapsed']

        if total_time.total_seconds() > 0:
            rate = total_comparisons/total_time.total_seconds()
            logger.info("Total comparisons: {}".format(total_comparisons))
            logger.info("Total time:        {}".format(total_time.total_seconds()))
            logger.info("Comparison rate:   {:.0f}".format(rate))

            with dbinstance.cursor() as cur:
                insert_comparison_rate(cur, rate)

        else:
            logger.warning("Can't compute comparison rate yet")
Example #12
def mark_run_complete(run_id, parent_span=None):
    log = logger.bind(run_id=run_id)
    log.debug("Marking run complete")
    with DBConn() as db:
        update_run_mark_complete(db, run_id)
    calculate_comparison_rate.delay()
    log.info("Run marked as complete")
def mark_run_complete(run_id, parent_span=None):
    log = logger.bind(run_id=run_id)
    log.debug("Marking run complete")
    with DBConn() as db:
        update_run_mark_complete(db, run_id)
    set_run_state_complete(run_id)
    log.info("Run marked as complete")
Example #14
def upload_clk_data_binary(project_id,
                           dp_id,
                           encoding_iter,
                           receipt_token,
                           count,
                           size=128):
    """
    Save the user provided binary-packed CLK data.

    """
    filename = None
    # Set the state to 'pending' in the uploads table
    with DBConn() as conn:
        db.insert_encoding_metadata(conn,
                                    filename,
                                    dp_id,
                                    receipt_token,
                                    encoding_count=count,
                                    block_count=1)
        db.update_encoding_metadata_set_encoding_size(conn, dp_id, size)
    num_bytes = binary_format(size).size * count

    logger.debug(
        "Directly storing binary file with index, base64 encoded CLK, popcount"
    )

    # Upload to database
    logger.info(
        f"Uploading {count} binary encodings to database. Total size: {fmt_bytes(num_bytes)}"
    )
    parent_span = g.flask_tracer.get_span()

    with DBConn() as conn:
        db.update_encoding_metadata_set_encoding_size(conn, dp_id, size)

        with opentracing.tracer.start_span('create-default-block-in-db',
                                           child_of=parent_span):
            db.insert_blocking_metadata(conn, dp_id, {DEFAULT_BLOCK_ID: count})

        with opentracing.tracer.start_span('upload-encodings-to-db',
                                           child_of=parent_span):
            store_encodings_in_db(conn, dp_id, encoding_iter, size)

        with opentracing.tracer.start_span('update-encoding-metadata',
                                           child_of=parent_span):
            db.update_encoding_metadata(conn, filename, dp_id, 'ready')
def dataprovider_id_if_authorize(resource_id, receipt_token):
    logger.debug("checking authorization token to fetch mask data")
    if not is_receipt_token_valid(resource_id, receipt_token):
        safe_fail_request(403, message=INVALID_ACCESS_MSG)

    with DBConn() as conn:
        dp_id = db.select_dataprovider_id(conn, resource_id, receipt_token)
    return dp_id
def assert_valid_run(project_id, run_id, log):
    if not is_run_active(run_id):
        raise InactiveRun("Run isn't marked as active")

    with DBConn() as db:
        if not check_project_exists(db, project_id) or not check_run_exists(
                db, project_id, run_id):
            log.info("Project or run not found in database.")
            raise DBResourceMissing("project or run not found in database")
Example #17
    def __init__(self, project_id, threshold, name, notes):
        self.project_id = project_id
        self.name = name
        self.notes = notes
        self.threshold = threshold
        self.run_id = generate_code()
        logger.info("Created run id", rid=self.run_id)

        with DBConn() as conn:
            self.type = 'no_mapping' \
                if db.get_project_column(conn, project_id, 'result_type') == 'similarity_scores' \
                else 'default'
def clks_uploaded_to_project(project_id, check_data_ready=False):
    """ See if the given project has had all parties contribute data.
    """
    log = logger.bind(pid=project_id)
    log.debug("Counting contributing parties")
    with DBConn() as conn:
        if check_data_ready:
            parties_contributed = get_number_parties_ready(conn, project_id)
            log.info("Parties where data is ready: {}".format(parties_contributed))
        else:
            parties_contributed = get_number_parties_uploaded(conn, project_id)
            log.info("Parties where data is uploaded: {}".format(parties_contributed))
        number_parties = get_project_column(conn, project_id, 'parties')
    log.info("{}/{} parties have contributed clks".format(parties_contributed, number_parties))
    return parties_contributed == number_parties
def run_failed_handler(*args, **kwargs):
    """
    Record that a task has encountered an error, mark the run as failed.

    :param args: A 1-tuple containing the id of the failed task.
    :param kwargs: Keyword arguments to the task e.g. {'run_id': '...', }
    """
    task_id = args[0]
    log = logger.bind(run_id=kwargs['run_id']) if 'run_id' in kwargs else logger
    log.info("An error occurred while processing task", task_id=task_id)

    with DBConn() as db:
        update_run_mark_failure(db, kwargs['run_id'])
    log.warning("Marked run as failure")
Example #20
def aggregate_comparisons(similarity_result_files, project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    if similarity_result_files is None: return
    mc = connect_to_object_store()
    files = []
    data_size = 0

    for num, filename in similarity_result_files:
        if num > 0:
            files.append(filename)
            data_size += mc.stat_object(Config.MINIO_BUCKET, filename).size

    log.debug("Aggregating result chunks from {} files, total size: {}".format(
        len(files), fmt_bytes(data_size)))

    result_file_stream_generator = (mc.get_object(Config.MINIO_BUCKET, result_filename) for result_filename in files)

    log.info("Similarity score results are {}".format(fmt_bytes(data_size)))
    result_stream = chain_streams(result_file_stream_generator)

    with DBConn() as db:
        result_type = get_project_column(db, project_id, 'result_type')

        # Note: Storing the similarity scores for all result types
        result_filename = store_similarity_scores(result_stream, run_id, data_size, db)

        if result_type == "similarity_scores":
            # Post similarity computation cleanup
            dp_ids = get_dataprovider_ids(db, project_id)

        else:
            # we promote the run to the next stage
            progress_stage(db, run_id)
            lenf1, lenf2 = get_project_dataset_sizes(db, project_id)

    # DB now committed, we can fire off tasks that depend on the new db state
    if result_type == "similarity_scores":
        log.info("Deleting intermediate similarity score files from object store")
        mc.remove_objects(Config.MINIO_BUCKET, files)
        log.debug("Removing clk filters from redis cache")
        remove_from_cache(dp_ids[0])
        remove_from_cache(dp_ids[1])

        # Complete the run
        log.info("Marking run as complete")
        mark_run_complete.delay(run_id, aggregate_comparisons.get_serialized_span())
    else:
        solver_task.delay(result_filename, project_id, run_id, lenf1, lenf2, aggregate_comparisons.get_serialized_span())
def prerun_check(project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Sanity check that we need to compute run")

    # being very defensive here checking if the run state is already in the redis cache
    if not is_run_missing(run_id):
        log.warning(
            "unexpectedly the run state is present in redis before starting")
        return

    with DBConn() as conn:
        if not check_project_exists(conn, project_id):
            log.debug("Project not found. Skipping")
            raise ProjectDeleted(project_id)

        res = get_run(conn, run_id)
        if res is None:
            log.debug(f"Run not found. Skipping")
            raise RunDeleted(run_id)

        try:
            db_state = get_run_state_for_update(conn, run_id)
        except psycopg2.OperationalError:
            log.warning("Run started in another task. Skipping this race.")
            return

        if db_state in {'running', 'completed', 'error'}:
            log.warning("Run already started. Skipping")
            return

        log.debug("Setting run state in db as 'running'")
        update_run_set_started(conn, run_id)

        log.debug("Updating redis cache for run")
        set_run_state_active(run_id)

    create_comparison_jobs.apply_async(
        kwargs={
            'project_id': project_id,
            'run_id': run_id,
            'parent_span': prerun_check.get_serialized_span()
        },
        link_error=run_failed_handler.s())
    log.info("CLK similarity computation scheduled")
Example #22
def upload_json_clk_data(dp_id, clk_json, parent_span):
    """
    Convert user provided encodings from json array of base64 data into
    a newline separated file of base64 data.

    Note this implementation is non-streaming.
    """
    if 'clks' not in clk_json or len(clk_json['clks']) < 1:
        safe_fail_request(400, message="Missing CLKs information")

    receipt_token = generate_code()

    filename = Config.RAW_FILENAME_FMT.format(receipt_token)
    logger.info("Storing user {} supplied clks from json".format(dp_id))

    with opentracing.tracer.start_span('clk-splitting',
                                       child_of=parent_span) as span:
        count = len(clk_json['clks'])
        span.set_tag("clks", count)
        data = b''.join(''.join(clk.split('\n')).encode() + b'\n'
                        for clk in clk_json['clks'])

        num_bytes = len(data)
        span.set_tag("num_bytes", num_bytes)
        buffer = BytesIO(data)

    logger.info(
        f"Received {count} encodings. Uploading {fmt_bytes(num_bytes)} to object store"
    )
    with opentracing.tracer.start_span('save-to-quarantine',
                                       child_of=parent_span) as span:
        span.set_tag('filename', filename)
        mc = connect_to_object_store()
        mc.put_object(Config.MINIO_BUCKET,
                      filename,
                      data=buffer,
                      length=num_bytes)

    with opentracing.tracer.start_span('update-db',
                                       child_of=parent_span) as span:
        with DBConn() as conn:
            db.insert_encoding_metadata(conn, filename, dp_id, receipt_token,
                                        count)

    return receipt_token, filename
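The join expression strips any embedded newlines from each base64 string and terminates every encoding with a single newline, so the quarantined object holds one encoding per line. A small illustration with made-up base64 values:

clk_json = {'clks': ['eJzrDPBz\nfGVmZQ==', 'YWJjZGVm']}
data = b''.join(''.join(clk.split('\n')).encode() + b'\n'
                for clk in clk_json['clks'])
assert data == b'eJzrDPBzfGVmZQ==\nYWJjZGVm\n'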
Example #23
def project_delete(project_id):
    log = logger.bind(pid=project_id)
    log.info('Request to delete project')
    # Check the resource exists and hasn't already been marked for deletion
    abort_if_project_doesnt_exist(project_id)

    # Check the caller has a valid results token. Yes it should be renamed.
    abort_if_invalid_results_token(project_id,
                                   request.headers.get('Authorization'))
    log.info("Marking project for deletion")

    with DBConn() as db_conn:
        db.mark_project_deleted(db_conn, project_id)

    log.info("Queuing authorized request to delete project resources")
    remove_project.delay(project_id)

    return '', 204
Example #24
def authorise_get_request(project_id):
    if request.headers is None or 'Authorization' not in request.headers:
        safe_fail_request(401, message="Authentication token required")
    auth_header = request.headers.get('Authorization')
    dp_id = None
    # Check the resource exists
    abort_if_project_doesnt_exist(project_id)
    with DBConn() as dbinstance:
        project_object = db.get_project(dbinstance, project_id)
    logger.info("Checking credentials")
    if project_object['result_type'] in ('mapping', 'similarity_scores'):
        # Check the caller has a valid results token if we are including results
        abort_if_invalid_results_token(project_id, auth_header)
    elif project_object['result_type'] == 'permutations':
        dp_id = get_authorization_token_type_or_abort(project_id, auth_header)
    else:
        safe_fail_request(500, "Unknown error")
    return dp_id, project_object
def check_dataproviders_encoding(project_id, encoding_size):
    """
    Ensure that the provided encoding size is valid for the given project.

    :raises InvalidEncodingError: if encoding_size is invalid.
    """
    if not config.MIN_ENCODING_SIZE <= encoding_size <= config.MAX_ENCODING_SIZE:
        raise InvalidEncodingError(
            dedent(f"""Encoding size out of bounds.
        Expected encoding size to be between {config.MIN_ENCODING_SIZE} and {config.MAX_ENCODING_SIZE}
        """))
    with DBConn() as db:
        project_encoding_size = get_project_schema_encoding_size(
            db, project_id)
    if project_encoding_size is not None and encoding_size != project_encoding_size:
        raise InvalidEncodingError(
            dedent(f"""User provided encodings were an invalid size
        Expected {project_encoding_size} but got {encoding_size}.
        """))
def save_and_permute(similarity_result, project_id, run_id, parent_span):
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Saving and possibly permuting data")
    groups = similarity_result['groups']

    # Note Postgres requires JSON object keys to be strings
    # Celery actually converts the json arguments in the same way

    with DBConn() as db:
        result_type = get_project_column(db, project_id, 'result_type')

        if result_type == "groups":
            # Save the raw groups
            log.debug("Saving the groups in the DB")
            result_id = insert_mapping_result(db, run_id, groups)
        else:
            # Turn groups into mapping and save that
            log.debug("Turning groups into mapping")
            mapping = groups_to_mapping(groups)
            log.debug("Saving mappuing in the DB")
            result_id = insert_mapping_result(db, run_id, mapping)

        dp_ids = get_dataprovider_ids(db, project_id)

    log.info("Result saved to db with result id {}".format(result_id))

    if result_type == "permutations":
        log.debug("Submitting job to permute mapping")
        dataset0_size, dataset1_size = similarity_result['datasetSizes']
        permute_mapping_data.apply_async(
            (project_id, run_id, dataset0_size, dataset1_size,
             save_and_permute.get_serialized_span()))
    else:
        log.debug("Mark job as complete")
        mark_run_complete.delay(run_id, save_and_permute.get_serialized_span())

    # Post similarity computation cleanup
    log.debug("Removing clk filters from redis cache")

    for dp_id in dp_ids:
        cache.remove_from_cache(dp_id)
    calculate_comparison_rate.delay()
Example #27
def remove_project(project_id, parent_span=None):
    """

    """
    log = logger.bind(pid=project_id)
    log.debug("Remove all project resources")

    with DBConn() as conn:
        run_objects = db.get_runs(conn, project_id)
        log.debug("Setting run status as 'deleted'")
        for run in run_objects:
            set_run_state_deleted(run_id=run['run_id'])
        log.debug("Deleting project resourced from database")
        db.delete_project_data(conn, project_id)
        log.debug(
            "Getting object store files associated with project from database")
        object_store_files = db.get_all_objects_for_project(conn, project_id)

    delete_minio_objects.delay(object_store_files, project_id, parent_span)
    log.info("Project resources removed")
Example #28
def project_get(project_id):
    """
    This endpoint describes a Project.
    """
    log = logger.bind(pid=project_id)
    log.info("Getting detail for a project")
    abort_if_project_doesnt_exist(project_id)
    authorise_get_request(project_id)
    with DBConn() as db_conn:
        project_object = db.get_project(db_conn, project_id)
        # Expose the number of data providers who have uploaded clks
        parties_contributed = db.get_number_parties_uploaded(
            db_conn, project_id)
        num_parties_with_error = db.get_encoding_error_count(
            db_conn, project_id)
    log.info(f"{parties_contributed} parties have contributed hashes")
    project_object['parties_contributed'] = parties_contributed

    if num_parties_with_error > 0:
        log.warning(
            f"There are {num_parties_with_error} parties in error state")
    project_object['error'] = num_parties_with_error > 0

    return ProjectDescription().dump(project_object)
Example #29
def project_clks_post(project_id):
    """
    Update a project to provide encoded PII data.
    """
    log = logger.bind(pid=project_id)
    headers = request.headers

    parent_span = g.flask_tracer.get_span()

    with opentracing.tracer.start_span('check-auth',
                                       child_of=parent_span) as span:
        abort_if_project_doesnt_exist(project_id)
        if headers is None or 'Authorization' not in headers:
            safe_fail_request(401, message="Authentication token required")

        token = headers['Authorization']

        # Check the caller has valid token -> otherwise 403
        abort_if_invalid_dataprovider_token(token)

    with DBConn() as conn:
        dp_id = db.get_dataprovider_id(conn, token)
        project_encoding_size = db.get_project_schema_encoding_size(
            conn, project_id)

    log = log.bind(dp_id=dp_id)
    log.info("Receiving CLK data.")
    receipt_token = None

    with opentracing.tracer.start_span('upload-data',
                                       child_of=parent_span) as span:
        span.set_tag("project_id", project_id)
        if headers['Content-Type'] == "application/json":
            span.set_tag("content-type", 'json')
            # TODO: Previously, we were accessing the CLKs in a streaming fashion to avoid parsing the json in one hit. This
            #       enables running the web frontend with less memory.
            #       However, as connexion is very, very strict about input validation when it comes to json, it will always
            #       consume the stream first to validate it against the spec. Thus the backflip to fully reading the CLKs as
            #       json into memory. -> issue #184

            receipt_token, raw_file = upload_json_clk_data(
                dp_id, get_json(), span)
            # Schedule a task to deserialize the hashes, and carry
            # out a pop count.
            handle_raw_upload.delay(project_id,
                                    dp_id,
                                    receipt_token,
                                    parent_span=serialize_span(span))
            log.info("Job scheduled to handle user uploaded hashes")
        elif headers['Content-Type'] == "application/octet-stream":
            span.set_tag("content-type", 'binary')
            log.info("Handling binary CLK upload")
            try:
                count, size = check_binary_upload_headers(headers)
                log.info(
                    f"Headers tell us to expect {count} encodings of {size} bytes"
                )
                span.log_kv({'count': count, 'size': size})
            except Exception:
                log.warning(
                    "Upload failed due to problem with headers in binary upload"
                )
                raise
            # Check against project level encoding size (if it has been set)
            if project_encoding_size is not None and size != project_encoding_size:
                # fail fast - we haven't stored the encoded data yet
                return safe_fail_request(
                    400, "Upload 'Hash-Size' doesn't match project settings")

            # TODO actually stream the upload data straight to Minio. Currently we can't because
            # connexion has already read the data before our handler is called!
            # https://github.com/zalando/connexion/issues/592
            # stream = get_stream()
            stream = BytesIO(request.data)
            log.debug(
                f"Stream size is {len(request.data)} B, and we expect {(6 + size)* count} B"
            )
            if len(request.data) != (6 + size) * count:
                safe_fail_request(
                    400,
                    "Uploaded data did not match the expected size. Check request headers are correct"
                )
            try:
                receipt_token = upload_clk_data_binary(project_id, dp_id,
                                                       stream, count, size)
            except ValueError:
                safe_fail_request(
                    400,
                    "Uploaded data did not match the expected size. Check request headers are correct."
                )
        else:
            safe_fail_request(400, "Content Type not supported")

    return {'message': 'Updated', 'receipt_token': receipt_token}, 201
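A hypothetical client-side sketch of the binary upload branch above. The route and the 'Hash-Count' header name are assumptions; 'Hash-Size' appears in the error message, and the payload must be exactly (6 + size) * count bytes:

import requests

def upload_binary_clks(base_url, project_id, update_token, payload, count, size=128):
    assert len(payload) == (6 + size) * count, "payload length must match the headers"
    resp = requests.post(
        f"{base_url}/projects/{project_id}/clks",   # assumed route for project_clks_post
        data=payload,
        headers={
            'Authorization': update_token,
            'Content-Type': 'application/octet-stream',
            'Hash-Count': str(count),   # assumed header name
            'Hash-Size': str(size),
        })
    resp.raise_for_status()
    return resp.json()['receipt_token']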
def handle_invalid_encoding_data(project_id, dp_id):
    with DBConn() as conn:
        filename, _ = get_filter_metadata(conn, dp_id)
        update_encoding_metadata(conn, 'DELETED', dp_id, state='error')
    if filename is not None:
        delete_minio_objects.delay([filename], project_id)