Example 1
def encoding_iterator(encoding_stream):
    binary_formatter = binary_format(size)
    for encoding_id in range(count):
        yield (
            str(encoding_id),
            binary_formatter.pack(encoding_id, encoding_stream.read(size)),
            encoding_to_block_map[str(encoding_id)]
        )


def ijson_encoding_iterator(encoding_stream):
    binary_formatter = binary_format(size)
    for encoding_id, encoding in zip(range(count), encoding_stream):
        yield (
            str(encoding_id),
            binary_formatter.pack(encoding_id, deserialize_bytes(encoding)),
            encoding_to_block_map[str(encoding_id)]
        )


def test_binary_pack_filters(self):
    encoding_size = 128
    filters = [(random.randint(0, 2 ** 32 - 1), generate_bytes(encoding_size)) for _ in range(10)]
    packed_filters = binary_pack_filters(filters, encoding_size)
    bin_format = binary_format(encoding_size)
    for filter, packed_filter in zip(filters, packed_filters):
        assert len(packed_filter) == encoding_size + 4
        unpacked = binary_unpack_one(packed_filter, bin_format)
        assert filter == unpacked
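
These fragments are excerpts: size, count and encoding_to_block_map come from enclosing upload handlers, and everything revolves around binary_format(size). Its definition is not shown on this page; the following is a minimal sketch of a compatible implementation, assuming each packed record is a 4-byte unsigned encoding id followed by the raw encoding bytes, which matches the encoding_size + 4 assertion in the test above.

import struct


def binary_format(encoding_size):
    # Assumed layout (sketch): network byte order, a 4-byte unsigned int
    # encoding id followed by `encoding_size` raw bytes, so one packed
    # record occupies encoding_size + 4 bytes.
    return struct.Struct(f'!I{encoding_size}s')


def binary_unpack_one(packed_bytes, bin_format):
    # Inverse of bin_format.pack: returns (encoding_id, encoding_bytes).
    return bin_format.unpack(packed_bytes)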
Example 4
def upload_clk_data_binary(project_id, dp_id, raw_stream, count, size=128):
    """
    Save the user provided raw CLK data.

    """
    receipt_token = generate_code()
    filename = Config.BIN_FILENAME_FMT.format(receipt_token)
    # Set the state to 'pending' in the bloomingdata table
    with DBConn() as conn:
        db.insert_encoding_metadata(conn, filename, dp_id, receipt_token,
                                    count)
        db.update_encoding_metadata_set_encoding_size(conn, dp_id, size)
    logger.info(
        f"Storing supplied binary clks of individual size {size} in file: {filename}"
    )

    num_bytes = binary_format(size).size * count

    logger.debug(
        "Directly storing binary file with index, base64 encoded CLK, popcount"
    )

    # Upload to object store
    logger.info(
        f"Uploading {count} binary encodings to object store. Total size: {fmt_bytes(num_bytes)}"
    )
    parent_span = g.flask_tracer.get_span()

    with opentracing.tracer.start_span('save-to-minio',
                                       child_of=parent_span) as span:
        mc = connect_to_object_store()
        try:
            mc.put_object(Config.MINIO_BUCKET,
                          filename,
                          data=raw_stream,
                          length=num_bytes)
        except (minio.error.InvalidSizeError, minio.error.InvalidArgumentError,
                minio.error.ResponseError):
            logger.info(
                "Mismatch between expected stream length and header info")
            raise ValueError(
                "Mismatch between expected stream length and header info")

    with opentracing.tracer.start_span('update-database',
                                       child_of=parent_span) as span:
        with DBConn() as conn:
            db.update_encoding_metadata(conn, filename, dp_id, 'ready')
            db.set_dataprovider_upload_state(conn, dp_id, True)

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id):
        logger.info("All parties' data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id,
                                        serialize_span(parent_span))

    return receipt_token
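
num_bytes above is just the per-record struct size multiplied by the number of encodings. Under the struct-based binary_format sketched after Example 1 (an assumption, not the project's published layout), the arithmetic looks like this:

# A 128-byte encoding packs to 132 bytes because of the 4-byte id prefix,
# so 1000 encodings occupy 132,000 bytes in the object store.
assert binary_format(128).size == 128 + 4
assert binary_format(128).size * 1000 == 132_000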
Example 5
def get_encoding_chunks(conn, package, encoding_size=128):
    """enrich the chunks in the package with the encodings and entity_ids"""
    chunks_per_dp = defaultdict(list)
    for chunk_info_1, chunk_info_2 in package:
        chunks_per_dp[chunk_info_1['dataproviderId']].append(chunk_info_1)
        chunks_per_dp[chunk_info_2['dataproviderId']].append(chunk_info_2)

    encodings_with_ids = {}
    for dp_id in chunks_per_dp:
        # Get all encodings for this dp, grouped into a dict keyed by block id.
        chunks = sorted(chunks_per_dp[dp_id],
                        key=lambda chunk: chunk['block_id'])
        block_ids = [chunk['block_id'] for chunk in chunks]
        values = get_encodings_of_multiple_blocks(conn, dp_id, block_ids)
        i = 0

        bit_packing_struct = binary_format(encoding_size)

        def block_values_iter(values):
            block_id, entity_id, encoding = next(values)
            entity_ids = [entity_id]
            encodings = [binary_unpack_one(encoding, bit_packing_struct)[1]]
            while True:
                try:
                    new_id, n_entity_id, n_encoding = next(values)
                    if new_id == block_id:
                        entity_ids.append(n_entity_id)
                        encodings.append(
                            binary_unpack_one(n_encoding,
                                              bit_packing_struct)[1])
                    else:
                        yield block_id, entity_ids, encodings
                        block_id = new_id
                        entity_ids = [n_entity_id]
                        encodings = [
                            binary_unpack_one(n_encoding,
                                              bit_packing_struct)[1]
                        ]
                except StopIteration:
                    yield block_id, entity_ids, encodings
                    break

        for block_id, entity_ids, encodings in block_values_iter(values):
            encodings_with_ids[(dp_id, block_id)] = (entity_ids, encodings)

    for chunk_info_1, chunk_info_2 in package:
        entity_ids, encodings = encodings_with_ids[(
            chunk_info_1['dataproviderId'], chunk_info_1['block_id'])]
        chunk_info_1['encodings'] = encodings
        chunk_info_1['entity_ids'] = entity_ids
        entity_ids, encodings = encodings_with_ids[(
            chunk_info_2['dataproviderId'], chunk_info_2['block_id'])]
        chunk_info_2['encodings'] = encodings
        chunk_info_2['entity_ids'] = entity_ids

    return package
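
block_values_iter folds rows that arrive sorted by block_id into (block_id, entity_ids, encodings) triples. The same grouping can be sketched with itertools.groupby; this is an illustrative alternative, assuming values yields (block_id, entity_id, packed_encoding) tuples already ordered by block_id, not code from the project itself.

from itertools import groupby
from operator import itemgetter


def block_values_iter_grouped(values, bit_packing_struct):
    # values: iterable of (block_id, entity_id, packed_encoding), sorted by block_id.
    for block_id, rows in groupby(values, key=itemgetter(0)):
        entity_ids, encodings = [], []
        for _, entity_id, packed_encoding in rows:
            entity_ids.append(entity_id)
            encodings.append(binary_unpack_one(packed_encoding, bit_packing_struct)[1])
        yield block_id, entity_ids, encodings

Besides being shorter, this avoids calling next() inside a generator body, which under PEP 479 turns an unexpected StopIteration (for example, an empty values iterator) into a RuntimeError.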
Example 6
def include_encoding_id_in_binary_stream(stream, size, count):
    """
    Inject an encoding_id and default block into a binary stream of encodings.
    """

    binary_formatter = binary_format(size)

    def encoding_iterator(filter_stream):
        # Assumes encoding id and block info not provided (yet)
        for entity_id in range(count):
            yield str(entity_id), binary_formatter.pack(
                entity_id, filter_stream.read(size)), [DEFAULT_BLOCK_ID]

    return encoding_iterator(stream)
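
A minimal usage sketch, assuming three fake 128-byte encodings in an in-memory BytesIO and the struct-based binary_format sketched after Example 1; DEFAULT_BLOCK_ID is whatever constant the project uses for the implicit single block.

from io import BytesIO

raw_stream = BytesIO(b''.join(bytes([i]) * 128 for i in range(3)))

for entity_id, packed, blocks in include_encoding_id_in_binary_stream(raw_stream, size=128, count=3):
    assert len(packed) == 128 + 4        # 4-byte encoding id prefix
    assert blocks == [DEFAULT_BLOCK_ID]  # every record lands in the default block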
Example 7
def include_encoding_id_in_json_stream(stream, size, count):
    """
    Inject an encoding_id and default block into an ijson stream of encodings.

    :param stream: ijson object
    :param size: size of each encoding in bytes
    :param count: number of encodings to read from the stream
    :return: generator
    """
    binary_formatter = binary_format(size)

    def encoding_iterator(filter_stream):
        # Assumes encoding id and block info not provided (yet)
        for entity_id, encoding in zip(range(count), filter_stream):
            yield str(entity_id), binary_formatter.pack(
                entity_id, deserialize_bytes(encoding)), [DEFAULT_BLOCK_ID]

    return encoding_iterator(stream)
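
The only difference from the binary variant in Example 6 is deserialize_bytes, which turns each element of the ijson stream back into raw bytes. Its implementation is not shown here; a plausible sketch, assuming the JSON stream carries base64-encoded strings:

import base64


def deserialize_bytes(encoding):
    # Assumed behaviour: the uploaded JSON carries base64 text, so decode it
    # back to raw encoding bytes before binary packing.
    return base64.b64decode(encoding)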
Example 8
    def test_insert_many_clks(self):
        num_entities = 10_000
        encoding_size = 2048  # non default encoding size
        binary_formatter = binary_format(encoding_size)

        raw_data = [generate_bytes(encoding_size) for i in range(100)]
        encodings = [
            binary_formatter.pack(i, raw_data[i % 100])
            for i in range(num_entities)
        ]
        blocks = [['1'] for _ in range(num_entities)]

        project_id, project_auth_token, dp_id, dp_auth_token = self._create_project_and_dp(
        )
        conn, cur = _get_conn_and_cursor()

        insert_encodings_into_blocks(conn,
                                     dp_id,
                                     block_ids=blocks,
                                     encoding_ids=list(range(num_entities)),
                                     encodings=encodings)
        conn.commit()

        stored_encoding_ids = list(get_encodingblock_ids(conn, dp_id, '1'))

        assert len(stored_encoding_ids) == num_entities
        for stored_encoding_id, original_id in zip(stored_encoding_ids,
                                                   range(num_entities)):
            assert stored_encoding_id == original_id

        stored_encodings = list(
            get_chunk_of_encodings(conn,
                                   dp_id,
                                   stored_encoding_ids,
                                   stored_binary_size=(encoding_size + 4)))

        assert len(stored_encodings) == num_entities
        for stored_encoding, original_encoding in zip(stored_encodings,
                                                      encodings):
            assert bytes(stored_encoding) == original_encoding

        block_names, block_sizes = zip(*list(get_block_metadata(conn, dp_id)))

        assert len(block_names) == 1
        assert len(block_sizes) == 1
        assert block_names[0] == '1'
        assert block_sizes[0] == 10_000
Example 9
def upload_clk_data_binary(project_id,
                           dp_id,
                           encoding_iter,
                           receipt_token,
                           count,
                           size=128):
    """
    Save the user provided binary-packed CLK data.

    """
    filename = None
    # Set the state to 'pending' in the uploads table
    with DBConn() as conn:
        db.insert_encoding_metadata(conn,
                                    filename,
                                    dp_id,
                                    receipt_token,
                                    encoding_count=count,
                                    block_count=1)
        db.update_encoding_metadata_set_encoding_size(conn, dp_id, size)
    num_bytes = binary_format(size).size * count

    logger.debug(
        "Directly storing binary file with index, base64 encoded CLK, popcount"
    )

    # Upload to database
    logger.info(
        f"Uploading {count} binary encodings to database. Total size: {fmt_bytes(num_bytes)}"
    )
    parent_span = g.flask_tracer.get_span()

    with DBConn() as conn:
        db.update_encoding_metadata_set_encoding_size(conn, dp_id, size)

        with opentracing.tracer.start_span('create-default-block-in-db',
                                           child_of=parent_span):
            db.insert_blocking_metadata(conn, dp_id, {DEFAULT_BLOCK_ID: count})

        with opentracing.tracer.start_span('upload-encodings-to-db',
                                           child_of=parent_span):
            store_encodings_in_db(conn, dp_id, encoding_iter, size)

        with opentracing.tracer.start_span('update-encoding-metadata',
                                           child_of=parent_span):
            db.update_encoding_metadata(conn, filename, dp_id, 'ready')
Example 10
def convert_encodings_from_base64_to_binary(
        encodings: Iterator[Tuple[str, str, List[str]]]):
    """
    :param encodings: iterable of tuples of (entity_id, base64 encoding, list of blocks)
    :return: a tuple of (encoding_size, generator), where the generator yields
         tuples of (entity_id, binary packed encoding, list of blocks)
    """
    # Peek at the first element to extract the encoding size
    i, encoding_data, blocks = next(encodings)
    encoding_size = len(encoding_data)
    bit_packing_struct = binary_format(encoding_size)

    def generator(first_i, first_encoding_data, first_blocks):
        binary_packed_encoding = bit_packing_struct.pack(
            first_i, first_encoding_data)
        yield first_i, binary_packed_encoding, first_blocks
        for i, encoding_data, blocks in encodings:
            binary_packed_encoding = bit_packing_struct.pack(i, encoding_data)
            yield i, binary_packed_encoding, blocks

    return encoding_size, generator(i, encoding_data, blocks)
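
A usage sketch. The type hint advertises string entity ids and base64 strings, but the struct packing inside needs an integer id and raw bytes, so the sketch assumes the ids are ints and the encodings have already been deserialized.

encodings = iter([(i, bytes(128), ['1']) for i in range(3)])

encoding_size, packed_stream = convert_encodings_from_base64_to_binary(encodings)
assert encoding_size == 128

for entity_id, packed, blocks in packed_stream:
    assert len(packed) == encoding_size + 4  # id prefix added by binary_format
    assert blocks == ['1']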
Example 11
def handle_raw_upload(project_id, dp_id, receipt_token, parent_span=None):
    # User has uploaded base64 encodings as JSON
    log = logger.bind(pid=project_id, dp_id=dp_id)
    log.info("Handling user provided base64 encodings")

    with DBConn() as db:
        if not check_project_exists(db, project_id):
            log.info("Project deleted, stopping immediately")
            return
        expected_count = get_number_of_hashes(db, dp_id)

    log.info(f"Expecting to handle {expected_count} encodings")
    mc = connect_to_object_store()

    # Input file is line separated base64 record encodings.
    raw_file = Config.RAW_FILENAME_FMT.format(receipt_token)
    raw_data_response = mc.get_object(Config.MINIO_BUCKET, raw_file)

    # Set up streaming processing pipeline
    buffered_stream = iterable_to_stream(raw_data_response.stream())
    text_stream = io.TextIOWrapper(buffered_stream, newline='\n')

    clkcounts = []

    def filter_generator():
        log.debug("Deserializing json filters")
        first_encoding_size = None
        for i, line in enumerate(text_stream):
            ba = deserialize_bitarray(line)
            yield (ba, i, ba.count())
            clkcounts.append(ba.count())
            encsize = len(ba)
            if i == 0:
                first_encoding_size = encsize
            if encsize != first_encoding_size:
                raise ValueError("Encodings were not all the same size")

        log.info(f"Processed {len(clkcounts)} hashes")

    # We peek at the first element as we need the encoding size
    # for the rest of our processing pipeline
    python_filters = more_itertools.peekable(filter_generator())
    # Note: len() of a bitarray is in bits, but the encoding size is stored in
    # bytes, so divide by 8 (encodings are assumed to be a multiple of 8 bits).
    uploaded_encoding_size = len(python_filters.peek()[0]) // 8

    # This is the first time we've seen the encoding size from this data provider
    try:
        check_dataproviders_encoding(project_id, uploaded_encoding_size)
    except InvalidEncodingError as e:
        log.warning(e.args[0])
        handle_invalid_encoding_data(project_id, dp_id)

    with DBConn() as db:
        # Save the encoding size as metadata
        update_encoding_metadata_set_encoding_size(db, dp_id,
                                                   uploaded_encoding_size)

    # Output file is our custom binary packed file
    filename = Config.BIN_FILENAME_FMT.format(receipt_token)
    bit_packed_element_size = binary_format(uploaded_encoding_size).size
    num_bytes = expected_count * bit_packed_element_size

    # If small enough preload the data into our redis cache
    if expected_count < Config.ENTITY_CACHE_THRESHOLD:
        log.info("Caching pickled clk data")
        python_filters = list(python_filters)
        cache.set_deserialized_filter(dp_id, python_filters)
    else:
        log.info("Not caching clk data as it is too large")

    packed_filters = binary_pack_filters(python_filters,
                                         uploaded_encoding_size)
    packed_filter_stream = iterable_to_stream(packed_filters)

    # Upload to object store
    log.info(
        f"Uploading {expected_count} encodings of size {uploaded_encoding_size} "
        + f"to object store. Total Size: {fmt_bytes(num_bytes)}")
    mc.put_object(Config.MINIO_BUCKET,
                  filename,
                  data=packed_filter_stream,
                  length=num_bytes)

    with DBConn() as conn:
        update_encoding_metadata(conn, filename, dp_id, 'ready')

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id, check_data_ready=True):
        log.info("All parties' data present. Scheduling any queued runs")
        check_for_executable_runs.delay(
            project_id, handle_raw_upload.get_serialized_span())
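
The pipeline above leans on more_itertools.peekable to read the first encoding's size without consuming the stream. A small illustration of that pattern in isolation:

import more_itertools

squares = (n * n for n in range(4))
peekable_squares = more_itertools.peekable(squares)

first = peekable_squares.peek()                # inspect the first element...
assert first == 0
assert list(peekable_squares) == [0, 1, 4, 9]  # ...it is still in the stream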
Example 12
def project_clks_post(project_id):
    """
    Update a project to provide encoded PII data.
    """
    log = logger.bind(pid=project_id)
    headers = request.headers

    parent_span = g.flask_tracer.get_span()

    with opentracing.tracer.start_span('check-auth',
                                       child_of=parent_span) as span:
        abort_if_project_doesnt_exist(project_id)
        if headers is None or 'Authorization' not in headers:
            safe_fail_request(401, message="Authentication token required")

        token = headers['Authorization']

        # Check the caller has valid token -> otherwise 403
        abort_if_invalid_dataprovider_token(token)

    with DBConn() as conn:
        dp_id = db.get_dataprovider_id(conn, token)
        project_encoding_size = db.get_project_schema_encoding_size(
            conn, project_id)

    log = log.bind(dp_id=dp_id)
    log.info("Receiving CLK data.")
    receipt_token = None

    with opentracing.tracer.start_span('upload-data',
                                       child_of=parent_span) as span:
        span.set_tag("project_id", project_id)
        if headers['Content-Type'] == "application/json":
            span.set_tag("content-type", 'json')
            # TODO: Previously, we were accessing the CLKs in a streaming fashion to avoid parsing the json in one hit. This
            #       enables running the web frontend with less memory.
            #       However, as connexion is very, very strict about input validation when it comes to json, it will always
            #       consume the stream first to validate it against the spec. Thus the backflip to fully reading the CLKs as
            #       json into memory. -> issue #184

            receipt_token, raw_file = upload_json_clk_data(
                dp_id, get_json(), span)
            # Schedule a task to deserialize the hashes, and carry
            # out a pop count.
            handle_raw_upload.delay(project_id,
                                    dp_id,
                                    receipt_token,
                                    parent_span=serialize_span(span))
            log.info("Job scheduled to handle user uploaded hashes")
        elif headers['Content-Type'] == "application/octet-stream":
            span.set_tag("content-type", 'binary')
            log.info("Handling binary CLK upload")
            try:
                count, size = check_binary_upload_headers(headers)
                log.info(
                    f"Headers tell us to expect {count} encodings of {size} bytes"
                )
                span.log_kv({'count': count, 'size': size})
            except Exception:
                log.warning(
                    "Upload failed due to problem with headers in binary upload"
                )
                raise
            # Check against project level encoding size (if it has been set)
            if project_encoding_size is not None and size != project_encoding_size:
                # fail fast - we haven't stored the encoded data yet
                return safe_fail_request(
                    400, "Upload 'Hash-Size' doesn't match project settings")

            # TODO actually stream the upload data straight to Minio. Currently we can't because
            # connexion has already read the data before our handler is called!
            # https://github.com/zalando/connexion/issues/592
            # stream = get_stream()
            stream = BytesIO(request.data)
            expected_bytes = binary_format(size).size * count
            log.debug(
                f"Stream size is {len(request.data)} B, and we expect {expected_bytes} B"
            )
            if len(request.data) != expected_bytes:
                safe_fail_request(
                    400,
                    "Uploaded data did not match the expected size. Check request headers are correct"
                )
            try:
                receipt_token = upload_clk_data_binary(project_id, dp_id,
                                                       stream, count, size)
            except ValueError:
                safe_fail_request(
                    400,
                    "Uploaded data did not match the expected size. Check request headers are correct."
                )
        else:
            safe_fail_request(400, "Content Type not supported")

    return {'message': 'Updated', 'receipt_token': receipt_token}, 201
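
check_binary_upload_headers is not among the examples on this page. The 'Hash-Size' name appears in the error message above; the sketch below is a hypothetical reconstruction that additionally assumes a matching 'Hash-Count' header, not the project's actual implementation.

def check_binary_upload_headers(headers):
    # Hypothetical: both headers are required and must be positive integers
    # giving the number of encodings and the byte size of each encoding.
    if 'Hash-Count' not in headers or 'Hash-Size' not in headers:
        raise ValueError("Binary uploads require 'Hash-Count' and 'Hash-Size' headers")
    count = int(headers['Hash-Count'])
    size = int(headers['Hash-Size'])
    if count < 1 or size < 1:
        raise ValueError("'Hash-Count' and 'Hash-Size' must be positive integers")
    return count, size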
Example 13
def project_clks_post(project_id):
    """
    Update a project to provide encoded PII data.
    """

    headers = request.headers

    log, parent_span = bind_log_and_span(project_id)
    log.debug("Starting data upload request")
    token = precheck_upload_token(project_id, headers, parent_span)
    receipt_token = generate_code()
    with DBConn() as conn:
        dp_id = db.get_dataprovider_id(conn, token)
        project_encoding_size = db.get_project_schema_encoding_size(
            conn, project_id)
        upload_state_updated = db.is_dataprovider_allowed_to_upload_and_lock(
            conn, dp_id)
        # get flag use_blocking from table projects
        uses_blocking = get_project_column(conn, project_id, 'uses_blocking')

    if not upload_state_updated:
        return safe_fail_request(
            403, "This token has already been used to upload clks.")

    log = log.bind(dp_id=dp_id)
    log.info("Receiving CLK data.")

    with opentracing.tracer.start_span('upload-clk-data',
                                       child_of=parent_span) as span:
        span.set_tag("project_id", project_id)
        try:
            if headers['Content-Type'] == "application/json":
                span.set_tag("content-type", 'json')
                # TODO: Previously, we were accessing the CLKs in a streaming fashion to avoid parsing the json in one hit. This
                #       enables running the web frontend with less memory.
                #       However, as connexion is very, very strict about input validation when it comes to json, it will always
                #       consume the stream first to validate it against the spec. Thus the backflip to fully reading the CLKs as
                #       json into memory. -> issue #184
                handle_encoding_upload_json(project_id,
                                            dp_id,
                                            get_json(),
                                            receipt_token,
                                            uses_blocking,
                                            parent_span=span)

                log.info("Job scheduled to handle users upload")
            elif headers['Content-Type'] == "application/octet-stream":
                span.set_tag("content-type", 'binary')
                log.info("Handling binary CLK upload")
                try:
                    count, size = check_binary_upload_headers(headers)
                    log.info(
                        f"Headers tell us to expect {count} encodings of {size} bytes"
                    )
                    span.log_kv({'count': count, 'size': size})
                except Exception:
                    log.warning(
                        "Upload failed due to problem with headers in binary upload"
                    )
                    raise
                # Check against project level encoding size (if it has been set)
                if project_encoding_size is not None and size != project_encoding_size:
                    # fail fast - we haven't stored the encoded data yet
                    return safe_fail_request(
                        400,
                        "Upload 'Hash-Size' doesn't match project settings")

                # TODO actually stream the upload data straight to Minio. Currently we can't because
                # connexion has already read the data before our handler is called!
                # https://github.com/zalando/connexion/issues/592
                # stream = get_stream()
                stream = BytesIO(request.data)
                expected_bytes = binary_format(size).size * count
                log.debug(
                    f"Stream size is {len(request.data)} B, and we expect {expected_bytes} B"
                )
                if len(request.data) != expected_bytes:
                    safe_fail_request(
                        400,
                        "Uploaded data did not match the expected size. Check request headers are correct"
                    )
                try:
                    upload_clk_data_binary(project_id, dp_id, stream,
                                           receipt_token, count, size)
                except ValueError:
                    safe_fail_request(
                        400,
                        "Uploaded data did not match the expected size. Check request headers are correct."
                    )
            else:
                safe_fail_request(400, "Content Type not supported")
        except ProblemException as e:
            # The exception is safe to show to the user. We reset the upload state
            # so they can try the upload again.
            log.info(
                f"Problem occurred, returning status={e.status} - {e.detail}")
            with DBConn() as conn:
                db.set_dataprovider_upload_state(conn,
                                                 dp_id,
                                                 state='not_started')
            raise
        except Exception as e:
            log.warning("Unhandled error occurred during data upload")
            log.exception(e)
            with DBConn() as conn:
                db.set_dataprovider_upload_state(conn, dp_id, state='error')
            safe_fail_request(
                500, "Sorry, the server couldn't handle that request")

    with DBConn() as conn:
        db.set_dataprovider_upload_state(conn, dp_id, state='done')

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id):
        logger.info("All parties' data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id,
                                        serialize_span(parent_span))

    return {'message': 'Updated', 'receipt_token': receipt_token}, 201
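
For completeness, here is how a client might exercise the binary branch of this endpoint. The URL path and the 'Hash-Count' header name are assumptions made for illustration (only 'Hash-Size', the Authorization token and the octet-stream content type are visible in the handler itself), so treat this as a sketch rather than the service's documented API.

import requests


def upload_binary_clks(base_url, project_id, upload_token, packed_bytes, count, size=128):
    # packed_bytes: the concatenated binary records, (size + 4) * count bytes long.
    return requests.post(
        f"{base_url}/projects/{project_id}/clks",   # hypothetical path
        headers={
            'Authorization': upload_token,
            'Content-Type': 'application/octet-stream',
            'Hash-Count': str(count),               # assumed header name
            'Hash-Size': str(size),
        },
        data=packed_bytes,
    )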