Example 1
def get_similarity_scores(filename):
    """
    Read a CSV file from the object store containing the similarity scores and return
    a response that will stream the similarity scores.

    :param filename: name of the CSV file, obtained from the `similarity_scores` table
    :return: the similarity scores in a streaming JSON response.
    """

    mc = connect_to_object_store()

    details = mc.stat_object(config.MINIO_BUCKET, filename)
    logger.info("Starting download stream of similarity scores.",
                filename=filename,
                filesize=details.size)

    try:
        sims_data_stream = mc.get_object(config.MINIO_BUCKET, filename)
        # TODO: Below is an Anonlink 'private' API. It should be made
        # public.
        sims_iter, *_ = anonlink.serialization._load_to_iterable(
            sims_data_stream)

        return Response(generate_scores(sims_iter),
                        mimetype='application/json')

    except urllib3.exceptions.ResponseError:
        logger.warning(
            "Attempt to read the similarity scores file failed with an error response.",
            filename=filename)
        safe_fail_request(500, "Failed to retrieve similarity scores")
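# `generate_scores` is referenced above but not shown. A minimal sketch of what
# such a streaming helper could look like (hypothetical implementation; it
# assumes `sims_iter` yields (index_a, index_b, score) tuples, which may differ
# from the real record layout):
import json


def generate_scores_sketch(sims_iter):
    """Yield a JSON document in chunks so Flask can stream the response."""
    yield '{"similarity_scores": ['
    first = True
    for index_a, index_b, score in sims_iter:
        prefix = '' if first else ', '
        first = False
        yield prefix + json.dumps([index_a, index_b, score])
    yield ']}'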
Example 2
def get(project_id, run_id):
    log = logger.bind(pid=project_id, rid=run_id)
    log.info("Checking for results of run.")
    parent_span = g.flask_tracer.get_span()
    with opentracing.tracer.start_span('check-auth',
                                       child_of=parent_span) as span:
        # Check the project and run resources exist
        abort_if_run_doesnt_exist(project_id, run_id)
        # Check the caller has a valid results token.
        token = request.headers.get('Authorization')
        log.info("request to access run result authorized")

    with opentracing.tracer.start_span('get-run-state',
                                       child_of=parent_span) as span:
        dbinstance = db.get_db()
        state = db.get_run_state(dbinstance, run_id)
        log.info("run state is '{}'".format(state))

    # Only a completed run has results; an errored run gives a 500, anything else a 404.
    if state == 'completed':
        with opentracing.tracer.start_span('get-run-result',
                                           child_of=parent_span) as span:
            return get_result(dbinstance, project_id, run_id, token)
    elif state == 'error':
        safe_fail_request(500, message='Error during computation of run')
    else:
        safe_fail_request(404, message='run is not complete')
def status_get():
    """Displays the latest mapping statistics"""

    status = service_status.get_status()

    if status is None:
        # We ensure we can connect to the database during the status check
        with db.DBConn() as conn:
            try:
                number_of_mappings = db.query_db(conn, '''
                            SELECT COUNT(*) FROM projects
                            ''', one=True)['count']
            except psycopg2.errors.UndefinedTable:
                safe_fail_request(500, "DB uninitialized")

            current_rate = db.get_latest_rate(conn)

        status = {
            'status': 'ok',
            'project_count': number_of_mappings,
            'rate': current_rate
        }

        service_status.set_status(status)
    return status
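# `service_status` is used above but not shown. A minimal sketch of the cache it
# might wrap (assumptions: a Redis backend; the key name and TTL below are
# hypothetical):
import json

import redis

_CACHE_KEY = 'entityservice-status'   # hypothetical key name
_CACHE_TTL_SECONDS = 30               # hypothetical expiry

_redis = redis.StrictRedis(host='localhost', port=6379, db=0)


def get_status():
    """Return the cached status dict, or None when the cache is cold."""
    raw = _redis.get(_CACHE_KEY)
    return json.loads(raw) if raw is not None else None


def set_status(status):
    """Cache the status dict so later requests can skip the database queries."""
    _redis.setex(_CACHE_KEY, _CACHE_TTL_SECONDS, json.dumps(status))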
def abort_if_invalid_dataprovider_token(update_token):
    logger.debug("checking authorization token to upload data")
    with DBConn() as conn:
        resource_exists = db.check_update_auth(conn, update_token)
    if not resource_exists:
        logger.debug("authorization token invalid")
        safe_fail_request(403, message=INVALID_ACCESS_MSG)
Example 5
def get_result(dbinstance, project_id, run_id, token):
    result_type = db.get_project_column(dbinstance, project_id, 'result_type')
    auth_token_type = get_authorization_token_type_or_abort(project_id, token)

    if result_type == 'mapping':
        logger.info("Mapping result being returned")
        result = db.get_run_result(dbinstance, run_id)
        return {"mapping": result}

    elif result_type == 'groups':
        logger.info("Groups result being returned")
        result = db.get_run_result(dbinstance, run_id)
        return {"groups": result}

    elif result_type == 'similarity_scores':
        logger.info("Similarity result being returned")
        return get_similarity_score_result(dbinstance, run_id)

    elif result_type == 'permutations':
        logger.info("Permutation result being returned")
        return get_permutations_result(project_id, run_id, dbinstance, token,
                                       auth_token_type)
    else:
        logger.warning("Unimplemented result type")
        safe_fail_request(500, message='Project has unknown result type')
def get_authorization_token_type_or_abort(project_id, token):
    """
    For a permutation result with an unencrypted mask, both the result token and the receipt tokens are used.
    The result token reveals the mask, while the receipt tokens let the data providers fetch their permutations.
    However, we do not know which type of token we have been given until we check it.
    """
    logger.debug("checking if provided authorization is a results_token")
    # If the token is not a valid result token, it should be a receipt token.
    if not is_results_token_valid(project_id, token):
        logger.debug("checking if provided authorization is receipt_token")
        # If the token is not a valid receipt token, we abort.
        if not is_receipt_token_valid(project_id, token):
            safe_fail_request(403, message=INVALID_ACCESS_MSG)
        token_type = 'receipt_token'
    else:
        token_type = 'result_token'

    # Note that at this stage we have EITHER a receipt or result token, and depending on the result_type
    # that might mean the caller is not authorized.
    with DBConn() as conn:
        result_type = get_project_column(conn, project_id, 'result_type')
    if (result_type in {'groups', 'similarity_scores'}
            and token_type == 'receipt_token'):
        logger.info("Caller provided receipt token to get results")
        safe_fail_request(403, message=INVALID_ACCESS_MSG)
    return token_type
def abort_if_run_doesnt_exist(project_id, run_id):
    with DBConn() as conn:
        resource_exists = db.check_run_exists(conn, project_id, run_id)
    if not resource_exists:
        logger.info(
            "Requested project or run resource with invalid identifier token")
        safe_fail_request(403, message=INVALID_ACCESS_MSG)
Example 8
def get_similarity_scores(filename):
    """
    Read a CSV file from the object store containing the similarity scores and return
    a response that will stream the similarity scores.

    :param filename: name of the CSV file, obtained from the `similarity_scores` table
    :return: the similarity scores in a streaming JSON response.
    """

    mc = connect_to_object_store()

    details = mc.stat_object(config.MINIO_BUCKET, filename)
    logger.info("Starting download stream of similarity scores.",
                filename=filename,
                filesize=details.size)

    try:
        candidate_pair_binary_stream = mc.get_object(config.MINIO_BUCKET,
                                                     filename)

        return Response(generate_scores(candidate_pair_binary_stream),
                        mimetype='application/json')

    except urllib3.exceptions.ResponseError:
        logger.warning(
            "Attempt to read the similarity scores file failed with an error response.",
            filename=filename)
        safe_fail_request(500, "Failed to retrieve similarity scores")
Example 9
def projects_post(project):
    """Create a new project

    There are multiple result types; see the documentation for how these affect information leakage
    and the resulting data.
    """
    logger.debug("Processing request to add a new project", project=project)
    try:
        project_model = models.Project.from_json(project)
    except models.InvalidProjectParametersException as e:
        logger.info(f"Denied request to add a new project - {e.msg}",
                    project=project)
        safe_fail_request(400, message=e.msg)

    # Persist the new project
    log = logger.bind(pid=project_model.project_id)
    log.info("Adding new project to database")
    try:
        with DBConn() as conn:
            project_model.save(conn)
    except Exception as e:
        log.warning(e)
        safe_fail_request(500, 'Problem creating new project')

    return NewProjectResponse().dump(project_model), 201
def dataprovider_id_if_authorize(resource_id, receipt_token):
    logger.debug("checking authorization token to fetch mask data")
    if not is_receipt_token_valid(resource_id, receipt_token):
        safe_fail_request(403, message=INVALID_ACCESS_MSG)

    dp_id = db.select_dataprovider_id(get_db(), resource_id, receipt_token)
    return dp_id
Example 11
def get_similarity_score_result(dbinstance, run_id):
    logger.info("Similarity score result being returned")
    try:
        filename = db.get_similarity_scores_filename(dbinstance, run_id)
        return get_similarity_scores(filename)

    except TypeError:
        logger.exception("Couldn't find the similarity score file for the runId %s", run_id)
        safe_fail_request(500, "Failed to retrieve similarity scores")
Example 12
def authorize_run_listing(project_id):
    logger.info("Looking up project")
    # Check the project resource exists
    abort_if_project_doesnt_exist(project_id)
    if request.headers is None or 'Authorization' not in request.headers:
        safe_fail_request(401, message="Authentication token required")
    auth_header = request.headers.get('Authorization')
    logger.info("Checking credentials to list project runs")
    # Check the caller has a valid results token (analyst token)
    abort_if_invalid_results_token(project_id, auth_header)
    logger.info("Caller is allowed to list project runs")
def authorize_external_upload(project_id):
    if not config.UPLOAD_OBJECT_STORE_ENABLED:
        safe_fail_request(
            500,
            message="Retrieving temporary object store credentials feature disabled",
            title="Feature Disabled")

    headers = request.headers

    log, parent_span = bind_log_and_span(project_id)

    log.debug("Authorizing external upload")
    token = precheck_upload_token(project_id, headers, parent_span)
    log.debug("Update token is valid")
    with db.DBConn() as conn:
        dp_id = db.get_dataprovider_id(conn, token)
        log = log.bind(dpid=dp_id)

    with opentracing.tracer.start_span('assume-role-request',
                                       child_of=parent_span):
        client = connect_to_upload_object_store()
        client.set_app_info("anonlink", "development version")

        bucket_name = config.UPLOAD_OBJECT_STORE_BUCKET
        path = object_store_upload_path(project_id, dp_id)
        log.info(
            f"Retrieving temporary object store credentials for path: '{bucket_name}/{path}'"
        )

        credentials_provider = AssumeRoleProvider(
            client,
            Policy=_get_upload_policy(bucket_name, path=path),
            DurationSeconds=config.UPLOAD_OBJECT_STORE_STS_DURATION)
        credential_values = Credentials(provider=credentials_provider).get()
        expiry = credentials_provider._expiry._expiration

        log.info("Retrieved temporary credentials")

    credentials_json = ObjectStoreCredentials().dump(credential_values)
    log.debug("Temp credentials", **credentials_json)

    # Convert datetime to ISO 8601 string
    credentials_json["Expiration"] = expiry.strftime('%Y-%m-%dT%H:%M:%S.%f%z')

    return {
        "credentials": credentials_json,
        "upload": {
            "endpoint": config.UPLOAD_OBJECT_STORE_SERVER,
            "secure": config.UPLOAD_OBJECT_STORE_SECURE,
            "bucket": bucket_name,
            "path": path
        }
    }, 201
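# A caller could use the temporary credentials returned above roughly as
# follows. This is a sketch only: it assumes the `minio` Python client and
# AWS-STS-style key names ('AccessKeyId', 'SecretAccessKey', 'SessionToken'),
# matching the "Expiration" key handled above; the object name is a placeholder.
from minio import Minio


def upload_with_temporary_credentials(resp, file_path):
    """Upload a local file using the credentials JSON returned by the endpoint."""
    creds = resp['credentials']
    upload = resp['upload']

    client = Minio(
        upload['endpoint'],
        access_key=creds['AccessKeyId'],
        secret_key=creds['SecretAccessKey'],
        session_token=creds['SessionToken'],
        secure=upload['secure'],
    )

    object_name = f"{upload['path']}/encodings.bin"  # placeholder object name
    client.fput_object(upload['bucket'], object_name, file_path)
    return object_name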
Example 14
def precheck_upload_token(project_id, headers, parent_span):
    """
    Raise a `ProblemException` if the project doesn't exist or the
    authentication token passed in the headers isn't valid.
    """
    with opentracing.tracer.start_span('check-auth',
                                       child_of=parent_span) as span:
        abort_if_project_doesnt_exist(project_id)
        if headers is None or 'Authorization' not in headers:
            safe_fail_request(401, message="Authentication token required")

        token = headers['Authorization']

        # Check the caller has valid token -> otherwise 403
        abort_if_invalid_dataprovider_token(token)
    return token
Example 15
def upload_json_clk_data(dp_id, clk_json, parent_span):
    """
    Convert user-provided encodings from a JSON array of base64 data into
    a newline-separated file of base64 data.

    Note this implementation is non-streaming.
    """
    if 'clks' not in clk_json or len(clk_json['clks']) < 1:
        safe_fail_request(400, message="Missing CLKs information")

    receipt_token = generate_code()

    filename = Config.RAW_FILENAME_FMT.format(receipt_token)
    logger.info("Storing user {} supplied clks from json".format(dp_id))

    with opentracing.tracer.start_span('clk-splitting',
                                       child_of=parent_span) as span:
        count = len(clk_json['clks'])
        span.set_tag("clks", count)
        data = b''.join(''.join(clk.split('\n')).encode() + b'\n'
                        for clk in clk_json['clks'])

        num_bytes = len(data)
        span.set_tag("num_bytes", num_bytes)
        buffer = BytesIO(data)

    logger.info(
        f"Received {count} encodings. Uploading {fmt_bytes(num_bytes)} to object store"
    )
    with opentracing.tracer.start_span('save-to-quarantine',
                                       child_of=parent_span) as span:
        span.set_tag('filename', filename)
        mc = connect_to_object_store()
        mc.put_object(Config.MINIO_BUCKET,
                      filename,
                      data=buffer,
                      length=num_bytes)

    with opentracing.tracer.start_span('update-db',
                                       child_of=parent_span) as span:
        with DBConn() as conn:
            db.insert_encoding_metadata(conn, filename, dp_id, receipt_token,
                                        count)

    return receipt_token, filename
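# For reference, the newline-splitting step above can be reproduced in
# isolation. A tiny illustration with made-up base64 values:
from io import BytesIO

clk_json = {'clks': ['UG9vcA==', 'QmFyIQ==']}   # example payload shape

# Strip any embedded newlines, then write one base64 encoding per line.
data = b''.join(''.join(clk.split('\n')).encode() + b'\n'
                for clk in clk_json['clks'])
assert data == b'UG9vcA==\nQmFyIQ==\n'
buffer = BytesIO(data)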
Example 16
def authorise_get_request(project_id):
    if request.headers is None or 'Authorization' not in request.headers:
        safe_fail_request(401, message="Authentication token required")
    auth_header = request.headers.get('Authorization')
    dp_id = None
    # Check the resource exists
    abort_if_project_doesnt_exist(project_id)
    with DBConn() as dbinstance:
        project_object = db.get_project(dbinstance, project_id)
    logger.info("Checking credentials")
    if project_object['result_type'] in ('mapping', 'similarity_scores'):
        # Check the caller has a valid results token if we are including results
        abort_if_invalid_results_token(project_id, auth_header)
    elif project_object['result_type'] == 'permutations':
        dp_id = get_authorization_token_type_or_abort(project_id, auth_header)
    else:
        safe_fail_request(500, "Unknown error")
    return dp_id, project_object
Example 17
def get_permutations_result(project_id, run_id, dbinstance, token,
                            auth_token_type):
    logger.info("Permutations and mask result type being returned")
    if auth_token_type == 'receipt_token':
        logger.debug("auth type receipt_token")
        dp_id = db.select_dataprovider_id(dbinstance, project_id, token)
        perm = db.get_permutation_result(dbinstance, dp_id, run_id)
        rows = db.get_smaller_dataset_size_for_project(dbinstance, project_id)
        result = {'permutation': perm, 'rows': rows}
    elif auth_token_type == "result_token":
        logger.debug("auth type result_token")
        logger.info("Returning unencrypted mask to coordinator")
        # The mask is a json blob of an
        # array of 0/1 ints
        mask = db.get_permutation_unencrypted_mask(dbinstance, project_id,
                                                   run_id)
        result = {"mask": mask}
    else:
        logger.warning("Didn't recognize the auth token type of {}".format(
            auth_token_type))
        safe_fail_request(500,
                          "Unknown error. Please report to the developers")
    return result
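# To illustrate how the two halves of this result fit together: each data
# provider reorders its own records with the permutation it receives, and the
# coordinator's mask marks which aligned positions are matches. A made-up
# sketch (not the service's API; the permutation convention is assumed to map
# old position -> new position and may differ in practice):
records_a = ['a0', 'a1', 'a2', 'a3']
perm_a = [2, 0, 3, 1]                      # 'permutation' from a receipt-token response
permuted_a = [None] * len(records_a)
for old_pos, new_pos in enumerate(perm_a):
    permuted_a[new_pos] = records_a[old_pos]

mask = [1, 0, 1, 0]                        # 'mask' from the result-token response
matching_rows = [i for i, bit in enumerate(mask) if bit == 1]
# permuted_a == ['a1', 'a3', 'a0', 'a2']; rows 0 and 2 of the aligned datasets match.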
Example 18
def check_binary_upload_headers(headers):
    if not all(extra_header in headers
               for extra_header in {'Hash-Count', 'Hash-Size'}):
        safe_fail_request(
            400, "Binary upload requires 'Hash-Count' and 'Hash-Size' headers")

    def get_header_int(header, min=None, max=None):
        INVALID_HEADER_NUMBER = "Invalid value for {} header".format(header)
        try:
            value = int(headers[header])
            if min is not None and value < min:
                safe_fail_request(400, INVALID_HEADER_NUMBER)
            if max is not None and value > max:
                safe_fail_request(400, INVALID_HEADER_NUMBER)
            return value
        except ValueError:
            safe_fail_request(400, INVALID_HEADER_NUMBER)

    size = get_header_int('Hash-Size',
                          min=Config.MIN_ENCODING_SIZE,
                          max=Config.MAX_ENCODING_SIZE)
    count = get_header_int('Hash-Count', min=1)

    return count, size
Example 19
# Nested helper from check_binary_upload_headers above; it closes over the
# enclosing function's `headers` mapping.
def get_header_int(header, min=None, max=None):
    INVALID_HEADER_NUMBER = "Invalid value for {} header".format(header)
    try:
        value = int(headers[header])
        if min is not None and value < min:
            safe_fail_request(400, INVALID_HEADER_NUMBER)
        if max is not None and value > max:
            safe_fail_request(400, INVALID_HEADER_NUMBER)
        return value
    except ValueError:
        safe_fail_request(400, INVALID_HEADER_NUMBER)
Example 20
def project_binaryclks_post(project_id):
    """
    Update a project to provide encoded PII data.
    """
    log, parent_span = bind_log_and_span(project_id)
    headers = request.headers
    token = precheck_upload_token(project_id, headers, parent_span)

    with DBConn() as conn:
        dp_id = db.get_dataprovider_id(conn, token)
        project_encoding_size = db.get_project_schema_encoding_size(
            conn, project_id)
        upload_state_updated = db.is_dataprovider_allowed_to_upload_and_lock(
            conn, dp_id)

    if not upload_state_updated:
        return safe_fail_request(
            403, "This token has already been used to upload clks.")

    log = log.bind(dp_id=dp_id)
    log.info("Receiving CLK data.")
    receipt_token = generate_code()

    with opentracing.tracer.start_span('upload-clk-data',
                                       child_of=parent_span) as span:
        span.set_tag("project_id", project_id)
        try:
            if headers['Content-Type'] == "application/octet-stream":
                span.set_tag("content-type", 'binary')
                log.info("Handling binary CLK upload")
                try:
                    count, size = check_binary_upload_headers(headers)
                    log.info(
                        f"Headers tell us to expect {count} encodings of {size} bytes"
                    )
                    span.log_kv({'count': count, 'size': size})
                except Exception:
                    log.warning(
                        "Upload failed due to problem with headers in binary upload"
                    )
                    raise
                # Check against project level encoding size (if it has been set)
                if project_encoding_size is not None and size != project_encoding_size:
                    # fail fast - we haven't stored the encoded data yet
                    return safe_fail_request(
                        400,
                        "Upload 'Hash-Size' doesn't match project settings")

                # TODO actually stream the upload data straight to Minio. Currently we can't because
                # connexion has already read the data before our handler is called!
                # https://github.com/zalando/connexion/issues/592
                # stream = get_stream()
                stream = BytesIO(request.data)

                converted_stream = include_encoding_id_in_binary_stream(
                    stream, size, count)

                expected_bytes = size * count
                log.debug(
                    f"Stream size is {len(request.data)} B, and we expect {expected_bytes} B"
                )
                if len(request.data) != expected_bytes:
                    safe_fail_request(
                        400,
                        "Uploaded data did not match the expected size. Check request headers are correct"
                    )
                try:
                    upload_clk_data_binary(project_id, dp_id, converted_stream,
                                           receipt_token, count, size)
                except ValueError:
                    safe_fail_request(
                        400,
                        "Uploaded data did not match the expected size. Check request headers are correct."
                    )
            else:
                safe_fail_request(400, "Content Type not supported")
        except Exception:
            log.warning(
                "The data provider was not able to upload their CLKs;"
                " re-enable the corresponding upload token so it can be used again.")

            with DBConn() as conn:
                db.set_dataprovider_upload_state(conn, dp_id, state='error')
            raise
    with DBConn() as conn:
        db.set_dataprovider_upload_state(conn, dp_id, state='done')

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id):
        logger.info("All parties data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id,
                                        serialize_span(parent_span))

    return {'message': 'Updated', 'receipt_token': receipt_token}, 201
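# `include_encoding_id_in_binary_stream` is not shown here. Its name (and the
# fixed per-record overhead checked in the other upload handlers) suggests it
# prefixes each fixed-size encoding with a sequential identifier. A purely
# hypothetical sketch; the real prefix width and layout may differ:
import struct


def include_encoding_id_in_binary_stream_sketch(stream, size, count):
    """Yield each raw encoding prefixed with a sequential 32-bit id."""
    for encoding_id in range(count):
        raw = stream.read(size)
        if len(raw) != size:
            raise ValueError("Stream ended before the expected number of encodings")
        yield struct.pack('!I', encoding_id) + raw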
Example 21
def project_clks_post(project_id):
    """
    Update a project to provide encoded PII data.
    """
    log = logger.bind(pid=project_id)
    headers = request.headers

    parent_span = g.flask_tracer.get_span()

    with opentracing.tracer.start_span('check-auth',
                                       child_of=parent_span) as span:
        abort_if_project_doesnt_exist(project_id)
        if headers is None or 'Authorization' not in headers:
            safe_fail_request(401, message="Authentication token required")

        token = headers['Authorization']

        # Check the caller has valid token -> otherwise 403
        abort_if_invalid_dataprovider_token(token)

    with DBConn() as conn:
        dp_id = db.get_dataprovider_id(conn, token)
        project_encoding_size = db.get_project_schema_encoding_size(
            conn, project_id)

    log = log.bind(dp_id=dp_id)
    log.info("Receiving CLK data.")
    receipt_token = None

    with opentracing.tracer.start_span('upload-data',
                                       child_of=parent_span) as span:
        span.set_tag("project_id", project_id)
        if headers['Content-Type'] == "application/json":
            span.set_tag("content-type", 'json')
            # TODO: Previously, we were accessing the CLKs in a streaming fashion to avoid parsing the json in one hit. This
            #       enables running the web frontend with less memory.
            #       However, as connexion is very, very strict about input validation when it comes to json, it will always
            #       consume the stream first to validate it against the spec. Thus the backflip to fully reading the CLKs as
            #       json into memory. -> issue #184

            receipt_token, raw_file = upload_json_clk_data(
                dp_id, get_json(), span)
            # Schedule a task to deserialize the hashes, and carry
            # out a pop count.
            handle_raw_upload.delay(project_id,
                                    dp_id,
                                    receipt_token,
                                    parent_span=serialize_span(span))
            log.info("Job scheduled to handle user uploaded hashes")
        elif headers['Content-Type'] == "application/octet-stream":
            span.set_tag("content-type", 'binary')
            log.info("Handling binary CLK upload")
            try:
                count, size = check_binary_upload_headers(headers)
                log.info(
                    f"Headers tell us to expect {count} encodings of {size} bytes"
                )
                span.log_kv({'count': count, 'size': size})
            except Exception:
                log.warning(
                    "Upload failed due to problem with headers in binary upload"
                )
                raise
            # Check against project level encoding size (if it has been set)
            if project_encoding_size is not None and size != project_encoding_size:
                # fail fast - we haven't stored the encoded data yet
                return safe_fail_request(
                    400, "Upload 'Hash-Size' doesn't match project settings")

            # TODO actually stream the upload data straight to Minio. Currently we can't because
            # connexion has already read the data before our handler is called!
            # https://github.com/zalando/connexion/issues/592
            # stream = get_stream()
            stream = BytesIO(request.data)
            log.debug(
                f"Stream size is {len(request.data)} B, and we expect {(6 + size)* count} B"
            )
            if len(request.data) != (6 + size) * count:
                safe_fail_request(
                    400,
                    "Uploaded data did not match the expected size. Check request headers are correct"
                )
            try:
                receipt_token = upload_clk_data_binary(project_id, dp_id,
                                                       stream, count, size)
            except ValueError:
                safe_fail_request(
                    400,
                    "Uploaded data did not match the expected size. Check request headers are correct."
                )
        else:
            safe_fail_request(400, "Content Type not supported")

    return {'message': 'Updated', 'receipt_token': receipt_token}, 201
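# A quick worked example of the size check above, assuming the headers were
# 'Hash-Count: 1000' and 'Hash-Size: 128': each record carries a 6-byte prefix
# plus the 128-byte encoding, so the request body must be exactly
# (6 + 128) * 1000 = 134,000 bytes.
count, size = 1000, 128
expected_bytes = (6 + size) * count
assert expected_bytes == 134_000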
Example 22
def project_clks_post(project_id):
    """
    Update a project to provide encoded PII data.
    """

    headers = request.headers

    log, parent_span = bind_log_and_span(project_id)
    log.debug("Starting data upload request")
    token = precheck_upload_token(project_id, headers, parent_span)
    receipt_token = generate_code()
    with DBConn() as conn:
        dp_id = db.get_dataprovider_id(conn, token)
        project_encoding_size = db.get_project_schema_encoding_size(
            conn, project_id)
        upload_state_updated = db.is_dataprovider_allowed_to_upload_and_lock(
            conn, dp_id)
        # Get the 'uses_blocking' flag from the projects table
        uses_blocking = get_project_column(conn, project_id, 'uses_blocking')

    if not upload_state_updated:
        return safe_fail_request(
            403, "This token has already been used to upload clks.")

    log = log.bind(dp_id=dp_id)
    log.info("Receiving CLK data.")

    with opentracing.tracer.start_span('upload-clk-data',
                                       child_of=parent_span) as span:
        span.set_tag("project_id", project_id)
        try:
            if headers['Content-Type'] == "application/json":
                span.set_tag("content-type", 'json')
                # TODO: Previously, we were accessing the CLKs in a streaming fashion to avoid parsing the json in one hit. This
                #       enables running the web frontend with less memory.
                #       However, as connexion is very, very strict about input validation when it comes to json, it will always
                #       consume the stream first to validate it against the spec. Thus the backflip to fully reading the CLKs as
                #       json into memory. -> issue #184
                handle_encoding_upload_json(project_id,
                                            dp_id,
                                            get_json(),
                                            receipt_token,
                                            uses_blocking,
                                            parent_span=span)

                log.info("Job scheduled to handle users upload")
            elif headers['Content-Type'] == "application/octet-stream":
                span.set_tag("content-type", 'binary')
                log.info("Handling binary CLK upload")
                try:
                    count, size = check_binary_upload_headers(headers)
                    log.info(
                        f"Headers tell us to expect {count} encodings of {size} bytes"
                    )
                    span.log_kv({'count': count, 'size': size})
                except Exception:
                    log.warning(
                        "Upload failed due to problem with headers in binary upload"
                    )
                    raise
                # Check against project level encoding size (if it has been set)
                if project_encoding_size is not None and size != project_encoding_size:
                    # fail fast - we haven't stored the encoded data yet
                    return safe_fail_request(
                        400,
                        "Upload 'Hash-Size' doesn't match project settings")

                # TODO actually stream the upload data straight to Minio. Currently we can't because
                # connexion has already read the data before our handler is called!
                # https://github.com/zalando/connexion/issues/592
                # stream = get_stream()
                stream = BytesIO(request.data)
                expected_bytes = binary_format(size).size * count
                log.debug(
                    f"Stream size is {len(request.data)} B, and we expect {expected_bytes} B"
                )
                if len(request.data) != expected_bytes:
                    safe_fail_request(
                        400,
                        "Uploaded data did not match the expected size. Check request headers are correct"
                    )
                try:
                    upload_clk_data_binary(project_id, dp_id, stream,
                                           receipt_token, count, size)
                except ValueError:
                    safe_fail_request(
                        400,
                        "Uploaded data did not match the expected size. Check request headers are correct."
                    )
            else:
                safe_fail_request(400, "Content Type not supported")
        except ProblemException as e:
            # Have an exception that is safe for the user. We reset the upload state to
            # allow the user to try upload again.
            log.info(
                f"Problem occurred, returning status={e.status} - {e.detail}")
            with DBConn() as conn:
                db.set_dataprovider_upload_state(conn,
                                                 dp_id,
                                                 state='not_started')
            raise
        except Exception as e:
            log.warning("Unhandled error occurred during data upload")
            log.exception(e)
            with DBConn() as conn:
                db.set_dataprovider_upload_state(conn, dp_id, state='error')
            safe_fail_request(
                500, "Sorry, the server couldn't handle that request")

    with DBConn() as conn:
        db.set_dataprovider_upload_state(conn, dp_id, state='done')

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id):
        logger.info("All parties data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id,
                                        serialize_span(parent_span))

    return {'message': 'Updated', 'receipt_token': receipt_token}, 201
Example 23
def handle_encoding_upload_json(project_id, dp_id, clk_json, receipt_token,
                                uses_blocking, parent_span):
    """
    Take user-provided upload information, accepting multiple formats, and
    eventually ingest it into the database.

    Encodings uploaded directly in the JSON are first quarantined in the object store,
    and a background task deserializes them.

    Encodings that are in an object store are streamed directly into the database by
    a background task.
    """
    log = logger.bind(pid=project_id)
    log.info("Checking json is consistent")
    try:
        abort_if_inconsistent_upload(uses_blocking, clk_json)
    except ValueError as e:
        safe_fail_request(403, e.args[0])

    if "encodings" in clk_json and 'file' in clk_json['encodings']:
        # external encodings
        log.info("External encodings uploaded")
        encoding_object_info = clk_json['encodings']['file']
        object_name = encoding_object_info['path']
        _check_object_path_allowed(project_id, dp_id, object_name, log)

        encoding_credentials = clk_json['encodings'].get('credentials')
        # Schedule a background task to pull the encodings from the object store
        # This background task updates the database with encoding metadata assuming
        # that there are no blocks.
        if 'blocks' not in clk_json:
            log.info("scheduling task to pull encodings from object store")
            pull_external_data_encodings_only.delay(
                project_id,
                dp_id,
                encoding_object_info,
                encoding_credentials,
                receipt_token,
                parent_span=serialize_span(parent_span))
        else:
            # Need to deal with both encodings and blocks
            if 'file' in clk_json['blocks']:
                object_name = clk_json['blocks']['file']['path']
                _check_object_path_allowed(project_id, dp_id, object_name, log)
                # Blocks are in an external file
                blocks_object_info = clk_json['blocks']['file']
                blocks_credentials = clk_json['blocks'].get('credentials')
                log.info(
                    "scheduling task to pull both encodings and blocking data from object store"
                )
                pull_external_data.delay(
                    project_id,
                    dp_id,
                    encoding_object_info,
                    encoding_credentials,
                    blocks_object_info,
                    blocks_credentials,
                    receipt_token,
                    parent_span=serialize_span(parent_span))
            else:
                raise NotImplementedError(
                    "Don't currently handle combination of external encodings and blocks"
                )

        return

    # Convert uploaded JSON to common schema.
    #
    # The original JSON API simply accepted "clks", then came a combined encoding and
    # blocking API expecting the top level element "clknblocks". Finally an API that
    # specifies both "encodings" and "blocks" independently at the top level.
    #
    # We rewrite all into the "clknblocks" format.
    if "encodings" in clk_json:
        logger.debug(
            "converting from 'encodings' & 'blocks' format to 'clknblocks'")
        clk_json = convert_encoding_upload_to_clknblock(clk_json)

    is_valid_clks = not uses_blocking and 'clks' in clk_json
    element = 'clks' if is_valid_clks else 'clknblocks'

    if len(clk_json[element]) < 1:
        safe_fail_request(400, message="Missing CLKs information")

    filename = Config.RAW_FILENAME_FMT.format(receipt_token)
    logger.info("Storing user {} supplied {} from json".format(dp_id, element))

    with opentracing.tracer.start_span('splitting-json-clks',
                                       child_of=parent_span) as span:
        encoding_count = len(clk_json[element])
        span.set_tag(element, encoding_count)
        logger.debug(f"Received {encoding_count} {element}")

    if element == 'clks':
        logger.info("Rewriting provided json into clknsblocks format")
        clk_json = convert_clks_to_clknblocks(clk_json)
        element = 'clknblocks'

    logger.info("Counting block sizes and number of blocks")
    # {'clknblocks': [['UG9vcA==', '001', '211'], [...]]}
    block_sizes = {}
    for _, *elements_blocks in clk_json[element]:
        for el_block in elements_blocks:
            block_sizes[el_block] = block_sizes.setdefault(el_block, 0) + 1
    block_count = len(block_sizes)

    logger.info(f"Received {encoding_count} encodings in {block_count} blocks")
    for block in block_sizes:
        logger.info(f"Block {block} has {block_sizes[block]} elements")

    # write clk_json into a temp file
    tmp = tempfile.NamedTemporaryFile(mode='w')
    json.dump(clk_json, tmp)
    tmp.flush()
    with opentracing.tracer.start_span('save-clk-file-to-quarantine',
                                       child_of=parent_span) as span:
        span.set_tag('filename', filename)
        mc = connect_to_object_store()
        mc.fput_object(Config.MINIO_BUCKET,
                       filename,
                       tmp.name,
                       content_type='application/json')
    logger.info('Saved uploaded {} JSON to file {} in object store.'.format(
        element.upper(), filename))

    with opentracing.tracer.start_span('update-encoding-metadata',
                                       child_of=parent_span):
        with DBConn() as conn:
            db.insert_encoding_metadata(conn, filename, dp_id, receipt_token,
                                        encoding_count, block_count)
            db.insert_blocking_metadata(conn, dp_id, block_sizes)

    # Schedule a task to deserialize the encodings
    handle_raw_upload.delay(project_id,
                            dp_id,
                            receipt_token,
                            parent_span=serialize_span(parent_span))
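# `convert_clks_to_clknblocks` is not shown here. Given the
# {'clknblocks': [['UG9vcA==', '001', '211'], ...]} layout noted above, a
# plausible (hypothetical) rewrite places every encoding into a single default
# block; the real helper may choose a different default block id:
def convert_clks_to_clknblocks_sketch(clk_json):
    """Rewrite {'clks': [...]} into the 'clknblocks' layout with one default block."""
    return {'clknblocks': [[clk, '1'] for clk in clk_json['clks']]}

# {'clks': ['UG9vcA==', 'QmFyIQ==']} -> {'clknblocks': [['UG9vcA==', '1'], ['QmFyIQ==', '1']]}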
Example 24
def _check_object_path_allowed(project_id, dp_id, object_name, log):
    if not object_name.startswith(object_store_upload_path(project_id, dp_id)):
        log.warning(f"Attempt to upload to illegal path: {object_name}")
        safe_fail_request(403, "Provided object store path is not allowed")
def abort_if_project_in_error_state(project_id):
    conn = get_db()
    num_parties_with_error = db.get_encoding_error_count(conn, project_id)
    if num_parties_with_error > 0:
        safe_fail_request(500, message="Can't post run as project has errors")
def abort_if_project_doesnt_exist(project_id):
    conn = get_db()
    resource_exists = db.check_project_exists(conn, project_id)
    if not resource_exists:
        logger.info("Requested project resource with invalid identifier token")
        safe_fail_request(403, message=INVALID_ACCESS_MSG)
def abort_if_invalid_dataprovider_token(update_token):
    logger.debug("checking authorization token to update data")
    resource_exists = db.check_update_auth(get_db(), update_token)
    if not resource_exists:
        safe_fail_request(403, message=INVALID_ACCESS_MSG)
def abort_if_invalid_results_token(resource_id, results_token):
    logger.debug("checking authorization of 'result_token'")
    if not is_results_token_valid(resource_id, results_token):
        logger.debug("Authorization denied")
        safe_fail_request(403, message=INVALID_ACCESS_MSG)