def encoding_iterator(encoding_stream):
    # `size`, `count` and `encoding_to_block_map` are free variables captured
    # from the enclosing scope.
    binary_formatter = binary_format(size)
    for encoding_id in range(count):
        yield (str(encoding_id),
               binary_formatter.pack(encoding_id, encoding_stream.read(size)),
               encoding_to_block_map[str(encoding_id)])
def ijson_encoding_iterator(encoding_stream):
    # As above, `size`, `count` and `encoding_to_block_map` come from the
    # enclosing scope; each streamed encoding is base64 and must be decoded.
    binary_formatter = binary_format(size)
    for encoding_id, encoding in zip(range(count), encoding_stream):
        yield (str(encoding_id),
               binary_formatter.pack(encoding_id, deserialize_bytes(encoding)),
               encoding_to_block_map[str(encoding_id)])
def test_binary_pack_filters(self):
    encoding_size = 128
    filters = [(random.randint(0, 2 ** 32 - 1), generate_bytes(encoding_size))
               for _ in range(10)]
    packed_filters = binary_pack_filters(filters, encoding_size)
    bin_format = binary_format(encoding_size)
    for filter_tuple, packed_filter in zip(filters, packed_filters):
        assert len(packed_filter) == encoding_size + 4
        unpacked = binary_unpack_one(packed_filter, bin_format)
        assert filter_tuple == unpacked
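# A minimal sketch (not necessarily the project's implementation) of the layout
# the test above implies for binary_format: a 4-byte unsigned encoding id
# followed by the raw encoding bytes, so a packed element is encoding_size + 4
# bytes. The network byte order ('!') is an assumption for illustration.
import struct

def sketch_binary_format(encoding_size):
    return struct.Struct(f'!I{encoding_size}s')

fmt = sketch_binary_format(128)
assert fmt.size == 128 + 4
packed = fmt.pack(42, b'\x00' * 128)
encoding_id, encoding = fmt.unpack(packed)
assert encoding_id == 42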
def upload_clk_data_binary(project_id, dp_id, raw_stream, count, size=128):
    """
    Save the user provided raw CLK data.
    """
    receipt_token = generate_code()
    filename = Config.BIN_FILENAME_FMT.format(receipt_token)
    # Set the state to 'pending' in the bloomingdata table
    with DBConn() as conn:
        db.insert_encoding_metadata(conn, filename, dp_id, receipt_token, count)
        db.update_encoding_metadata_set_encoding_size(conn, dp_id, size)
    logger.info(
        f"Storing supplied binary clks of individual size {size} in file: {filename}")

    num_bytes = binary_format(size).size * count

    logger.debug("Directly storing binary file with index, base64 encoded CLK, popcount")

    # Upload to object store
    logger.info(
        f"Uploading {count} binary encodings to object store. Total size: {fmt_bytes(num_bytes)}")
    parent_span = g.flask_tracer.get_span()

    with opentracing.tracer.start_span('save-to-minio', child_of=parent_span):
        mc = connect_to_object_store()
        try:
            mc.put_object(Config.MINIO_BUCKET, filename, data=raw_stream, length=num_bytes)
        except (minio.error.InvalidSizeError, minio.error.InvalidArgumentError, minio.error.ResponseError):
            logger.info("Mismatch between expected stream length and header info")
            raise ValueError("Mismatch between expected stream length and header info")

    with opentracing.tracer.start_span('update-database', child_of=parent_span):
        with DBConn() as conn:
            db.update_encoding_metadata(conn, filename, dp_id, 'ready')
            db.set_dataprovider_upload_state(conn, dp_id, True)

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id):
        logger.info("All parties data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id, serialize_span(parent_span))

    return receipt_token
def get_encoding_chunks(conn, package, encoding_size=128):
    """Enrich the chunks in the package with the encodings and entity_ids."""
    chunks_per_dp = defaultdict(list)
    for chunk_info_1, chunk_info_2 in package:
        chunks_per_dp[chunk_info_1['dataproviderId']].append(chunk_info_1)
        chunks_per_dp[chunk_info_2['dataproviderId']].append(chunk_info_2)

    encodings_with_ids = {}
    for dp_id in chunks_per_dp:
        # Get all encodings for that dp, saved in a dict keyed by block id.
        chunks = sorted(chunks_per_dp[dp_id], key=lambda chunk: chunk['block_id'])
        block_ids = [chunk['block_id'] for chunk in chunks]
        values = get_encodings_of_multiple_blocks(conn, dp_id, block_ids)
        bit_packing_struct = binary_format(encoding_size)

        def block_values_iter(values):
            # `values` yields (block_id, entity_id, encoding) rows sorted by
            # block id; collect consecutive rows of the same block together.
            block_id, entity_id, encoding = next(values)
            entity_ids = [entity_id]
            encodings = [binary_unpack_one(encoding, bit_packing_struct)[1]]
            while True:
                try:
                    new_id, n_entity_id, n_encoding = next(values)
                    if new_id == block_id:
                        entity_ids.append(n_entity_id)
                        encodings.append(binary_unpack_one(n_encoding, bit_packing_struct)[1])
                    else:
                        yield block_id, entity_ids, encodings
                        block_id = new_id
                        entity_ids = [n_entity_id]
                        encodings = [binary_unpack_one(n_encoding, bit_packing_struct)[1]]
                except StopIteration:
                    yield block_id, entity_ids, encodings
                    break

        for block_id, entity_ids, encodings in block_values_iter(values):
            encodings_with_ids[(dp_id, block_id)] = (entity_ids, encodings)

    for chunk_info_1, chunk_info_2 in package:
        entity_ids, encodings = encodings_with_ids[
            (chunk_info_1['dataproviderId'], chunk_info_1['block_id'])]
        chunk_info_1['encodings'] = encodings
        chunk_info_1['entity_ids'] = entity_ids
        entity_ids, encodings = encodings_with_ids[
            (chunk_info_2['dataproviderId'], chunk_info_2['block_id'])]
        chunk_info_2['encodings'] = encodings
        chunk_info_2['entity_ids'] = entity_ids
    return package
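# block_values_iter above groups rows that arrive sorted by block id. A minimal
# equivalent sketch using itertools.groupby (`unpack` is a stand-in for
# binary_unpack_one with the packing struct applied); unlike the hand-rolled
# loop, this also handles an empty `values` iterator without raising:
from itertools import groupby
from operator import itemgetter

def grouped_block_values(values, unpack):
    for block_id, rows in groupby(values, key=itemgetter(0)):
        entity_ids, encodings = [], []
        for _, entity_id, encoding in rows:
            entity_ids.append(entity_id)
            encodings.append(unpack(encoding))
        yield block_id, entity_ids, encodings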
def include_encoding_id_in_binary_stream(stream, size, count):
    """ Inject an encoding_id and default block into a binary stream of encodings. """
    binary_formatter = binary_format(size)

    def encoding_iterator(filter_stream):
        # Assumes encoding id and block info are not provided (yet)
        for entity_id in range(count):
            yield (str(entity_id),
                   binary_formatter.pack(entity_id, filter_stream.read(size)),
                   [DEFAULT_BLOCK_ID])

    return encoding_iterator(stream)
def include_encoding_id_in_json_stream(stream, size, count):
    """ Inject an encoding_id and default block into an ijson stream of encodings.

    :param stream: ijson object
    :param size: encoding size in bytes
    :param count: integer
    :return: generator
    """
    binary_formatter = binary_format(size)

    def encoding_iterator(filter_stream):
        # Assumes encoding id and block info are not provided (yet)
        for entity_id, encoding in zip(range(count), filter_stream):
            yield (str(entity_id),
                   binary_formatter.pack(entity_id, deserialize_bytes(encoding)),
                   [DEFAULT_BLOCK_ID])

    return encoding_iterator(stream)
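# Usage sketch for the two injectors above (module context assumed:
# binary_format and DEFAULT_BLOCK_ID in scope). Three zeroed 128-byte
# encodings are fed through the binary variant; the size + 4 check assumes
# the 4-byte-id-prefix layout sketched earlier.
import io

size, count = 128, 3
stream = io.BytesIO(b'\x00' * (size * count))
for entity_id, packed, blocks in include_encoding_id_in_binary_stream(stream, size, count):
    assert isinstance(entity_id, str)
    assert len(packed) == size + 4
    assert blocks == [DEFAULT_BLOCK_ID]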
def test_insert_many_clks(self):
    num_entities = 10_000
    encoding_size = 2048  # non-default encoding size
    binary_formatter = binary_format(encoding_size)
    raw_data = [generate_bytes(encoding_size) for _ in range(100)]
    encodings = [binary_formatter.pack(i, raw_data[i % 100])
                 for i in range(num_entities)]
    blocks = [['1'] for _ in range(num_entities)]

    project_id, project_auth_token, dp_id, dp_auth_token = self._create_project_and_dp()
    conn, cur = _get_conn_and_cursor()

    insert_encodings_into_blocks(conn, dp_id,
                                 block_ids=blocks,
                                 encoding_ids=list(range(num_entities)),
                                 encodings=encodings)
    conn.commit()

    stored_encoding_ids = list(get_encodingblock_ids(conn, dp_id, '1'))
    assert len(stored_encoding_ids) == num_entities
    for stored_encoding_id, original_id in zip(stored_encoding_ids, range(num_entities)):
        assert stored_encoding_id == original_id

    stored_encodings = list(
        get_chunk_of_encodings(conn, dp_id, stored_encoding_ids,
                               stored_binary_size=(encoding_size + 4)))
    assert len(stored_encodings) == num_entities
    for stored_encoding, original_encoding in zip(stored_encodings, encodings):
        assert bytes(stored_encoding) == original_encoding

    block_names, block_sizes = zip(*list(get_block_metadata(conn, dp_id)))
    assert len(block_names) == 1
    assert len(block_sizes) == 1
    assert block_names[0] == '1'
    assert block_sizes[0] == 10_000
def upload_clk_data_binary(project_id, dp_id, encoding_iter, receipt_token, count, size=128):
    """
    Save the user provided binary-packed CLK data.
    """
    filename = None
    # Set the state to 'pending' in the uploads table
    with DBConn() as conn:
        db.insert_encoding_metadata(conn, filename, dp_id, receipt_token,
                                    encoding_count=count, block_count=1)
        db.update_encoding_metadata_set_encoding_size(conn, dp_id, size)

    num_bytes = binary_format(size).size * count

    logger.debug("Directly storing binary file with index, base64 encoded CLK, popcount")

    # Upload to database
    logger.info(
        f"Uploading {count} binary encodings to database. Total size: {fmt_bytes(num_bytes)}")
    parent_span = g.flask_tracer.get_span()

    with DBConn() as conn:
        db.update_encoding_metadata_set_encoding_size(conn, dp_id, size)
        with opentracing.tracer.start_span('create-default-block-in-db', child_of=parent_span):
            db.insert_blocking_metadata(conn, dp_id, {DEFAULT_BLOCK_ID: count})

        with opentracing.tracer.start_span('upload-encodings-to-db', child_of=parent_span):
            store_encodings_in_db(conn, dp_id, encoding_iter, size)

        with opentracing.tracer.start_span('update-encoding-metadata', child_of=parent_span):
            db.update_encoding_metadata(conn, filename, dp_id, 'ready')
def convert_encodings_from_base64_to_binary(
        encodings: Iterator[Tuple[str, str, List[str]]]):
    """
    :param encodings: Iterable object containing tuples of
        (entity_id, base64 encoding, list of blocks)
    :return: a tuple comprising:
        (encoding size, generator of (entity_id, binary encoding, list of blocks))
    """
    # Peek at the first element to extract the encoding size. Note the encoding
    # data is expected to be raw bytes by this point (decoded upstream);
    # struct packing would reject a base64 string.
    i, encoding_data, blocks = next(encodings)
    encoding_size = len(encoding_data)
    bit_packing_struct = binary_format(encoding_size)

    def generator(first_i, first_encoding_data, first_blocks):
        binary_packed_encoding = bit_packing_struct.pack(first_i, first_encoding_data)
        yield first_i, binary_packed_encoding, first_blocks
        for i, encoding_data, blocks in encodings:
            binary_packed_encoding = bit_packing_struct.pack(i, encoding_data)
            yield i, binary_packed_encoding, blocks

    return encoding_size, generator(i, encoding_data, blocks)
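# Usage sketch for convert_encodings_from_base64_to_binary, with illustrative
# entity ids and already-decoded byte payloads (the size + 4 check again
# assumes the id-prefixed layout sketched earlier):
enc_iter = iter([
    (0, b'\x01' * 128, ['block-a']),
    (1, b'\x02' * 128, ['block-a', 'block-b']),
])
encoding_size, binary_iter = convert_encodings_from_base64_to_binary(enc_iter)
assert encoding_size == 128
for entity_id, packed, blocks in binary_iter:
    assert len(packed) == encoding_size + 4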
def handle_raw_upload(project_id, dp_id, receipt_token, parent_span=None):
    # User has uploaded base64 encodings as JSON
    log = logger.bind(pid=project_id, dp_id=dp_id)
    log.info("Handling user provided base64 encodings")

    with DBConn() as db:
        if not check_project_exists(db, project_id):
            log.info("Project deleted, stopping immediately")
            return
        expected_count = get_number_of_hashes(db, dp_id)

    log.info(f"Expecting to handle {expected_count} encodings")
    mc = connect_to_object_store()

    # Input file is line separated base64 record encodings.
    raw_file = Config.RAW_FILENAME_FMT.format(receipt_token)
    raw_data_response = mc.get_object(Config.MINIO_BUCKET, raw_file)

    # Set up streaming processing pipeline
    buffered_stream = iterable_to_stream(raw_data_response.stream())
    text_stream = io.TextIOWrapper(buffered_stream, newline='\n')

    clkcounts = []

    def filter_generator():
        log.debug("Deserializing json filters")
        first_encoding_size = None
        for i, line in enumerate(text_stream):
            ba = deserialize_bitarray(line)
            yield (ba, i, ba.count())
            clkcounts.append(ba.count())
            encsize = len(ba)
            if i == 0:
                first_encoding_size = encsize
            if encsize != first_encoding_size:
                raise ValueError("Encodings were not all the same size")
        log.info(f"Processed {len(clkcounts)} hashes")

    # We peek at the first element as we need the encoding size
    # for the rest of our processing pipeline.
    python_filters = more_itertools.peekable(filter_generator())
    # Note the len of a bitarray is returned in bits but we require
    # this to be a multiple of 8 so we use bytes.
    uploaded_encoding_size = len(python_filters.peek()[0]) // 8

    # This is the first time we've seen the encoding size from this data provider
    try:
        check_dataproviders_encoding(project_id, uploaded_encoding_size)
    except InvalidEncodingError as e:
        log.warning(e.args[0])
        handle_invalid_encoding_data(project_id, dp_id)

    with DBConn() as db:
        # Save the encoding size as metadata
        update_encoding_metadata_set_encoding_size(db, dp_id, uploaded_encoding_size)

    # Output file is our custom binary packed file
    filename = Config.BIN_FILENAME_FMT.format(receipt_token)
    bit_packed_element_size = binary_format(uploaded_encoding_size).size
    num_bytes = expected_count * bit_packed_element_size

    # If small enough preload the data into our redis cache
    if expected_count < Config.ENTITY_CACHE_THRESHOLD:
        log.info("Caching pickled clk data")
        python_filters = list(python_filters)
        cache.set_deserialized_filter(dp_id, python_filters)
    else:
        log.info("Not caching clk data as it is too large")

    packed_filters = binary_pack_filters(python_filters, uploaded_encoding_size)
    packed_filter_stream = iterable_to_stream(packed_filters)

    # Upload to object store
    log.info(f"Uploading {expected_count} encodings of size {uploaded_encoding_size} "
             f"to object store. Total Size: {fmt_bytes(num_bytes)}")
    mc.put_object(Config.MINIO_BUCKET, filename, data=packed_filter_stream, length=num_bytes)

    with DBConn() as conn:
        update_encoding_metadata(conn, filename, dp_id, 'ready')

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id, check_data_ready=True):
        log.info("All parties' data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id, handle_raw_upload.get_serialized_span())
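# A minimal sketch of the iterable_to_stream adapter used above (assumed
# semantics: wrap an iterable of non-empty byte chunks as a readable file
# object so mc.put_object can stream it), based on the common io.RawIOBase
# pattern. The project's own helper may differ in detail.
import io

def sketch_iterable_to_stream(iterable, buffer_size=io.DEFAULT_BUFFER_SIZE):
    chunk_iter = iter(iterable)

    class IterStream(io.RawIOBase):
        def __init__(self):
            self.leftover = b''

        def readable(self):
            return True

        def readinto(self, b):
            try:
                chunk = self.leftover or next(chunk_iter)
            except StopIteration:
                return 0  # signal EOF
            # Copy as much as fits into the caller's buffer, keep the rest.
            output, self.leftover = chunk[:len(b)], chunk[len(b):]
            b[:len(output)] = output
            return len(output)

    return io.BufferedReader(IterStream(), buffer_size=buffer_size)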
def project_clks_post(project_id):
    """
    Update a project to provide encoded PII data.
    """
    log = logger.bind(pid=project_id)
    headers = request.headers
    parent_span = g.flask_tracer.get_span()

    with opentracing.tracer.start_span('check-auth', child_of=parent_span):
        abort_if_project_doesnt_exist(project_id)
        if headers is None or 'Authorization' not in headers:
            safe_fail_request(401, message="Authentication token required")
        token = headers['Authorization']

        # Check the caller has a valid token; otherwise 403
        abort_if_invalid_dataprovider_token(token)

    with DBConn() as conn:
        dp_id = db.get_dataprovider_id(conn, token)
        project_encoding_size = db.get_project_schema_encoding_size(get_db(), project_id)

    log = log.bind(dp_id=dp_id)
    log.info("Receiving CLK data.")
    receipt_token = None

    with opentracing.tracer.start_span('upload-data', child_of=parent_span) as span:
        span.set_tag("project_id", project_id)
        if headers['Content-Type'] == "application/json":
            span.set_tag("content-type", 'json')
            # TODO: Previously, we were accessing the CLKs in a streaming fashion
            # to avoid parsing the json in one hit. This enables running the web
            # frontend with less memory. However, as connexion is very, very strict
            # about input validation when it comes to json, it will always consume
            # the stream first to validate it against the spec. Thus the backflip
            # to fully reading the CLKs as json into memory. -> issue #184
            receipt_token, raw_file = upload_json_clk_data(dp_id, get_json(), span)
            # Schedule a task to deserialize the hashes, and carry out a pop count.
            handle_raw_upload.delay(project_id, dp_id, receipt_token,
                                    parent_span=serialize_span(span))
            log.info("Job scheduled to handle user uploaded hashes")
        elif headers['Content-Type'] == "application/octet-stream":
            span.set_tag("content-type", 'binary')
            log.info("Handling binary CLK upload")
            try:
                count, size = check_binary_upload_headers(headers)
                log.info(f"Headers tell us to expect {count} encodings of {size} bytes")
                span.log_kv({'count': count, 'size': size})
            except Exception:
                log.warning("Upload failed due to problem with headers in binary upload")
                raise
            # Check against the project level encoding size (if it has been set)
            if project_encoding_size is not None and size != project_encoding_size:
                # Fail fast - we haven't stored the encoded data yet
                return safe_fail_request(400, "Upload 'Hash-Size' doesn't match project settings")

            # TODO actually stream the upload data straight to Minio. Currently we
            # can't because connexion has already read the data before our handler
            # is called! https://github.com/zalando/connexion/issues/592
            # stream = get_stream()
            stream = BytesIO(request.data)
            expected_bytes = binary_format(size).size * count
            log.debug(f"Stream size is {len(request.data)} B, and we expect {expected_bytes} B")
            if len(request.data) != expected_bytes:
                safe_fail_request(400, "Uploaded data did not match the expected size. "
                                       "Check request headers are correct")
            try:
                receipt_token = upload_clk_data_binary(project_id, dp_id, stream, count, size)
            except ValueError:
                safe_fail_request(400, "Uploaded data did not match the expected size. "
                                       "Check request headers are correct.")
        else:
            safe_fail_request(400, "Content Type not supported")

    return {'message': 'Updated', 'receipt_token': receipt_token}, 201
def project_clks_post(project_id):
    """
    Update a project to provide encoded PII data.
    """
    headers = request.headers
    log, parent_span = bind_log_and_span(project_id)
    log.debug("Starting data upload request")
    token = precheck_upload_token(project_id, headers, parent_span)
    receipt_token = generate_code()
    with DBConn() as conn:
        dp_id = db.get_dataprovider_id(conn, token)
        project_encoding_size = db.get_project_schema_encoding_size(conn, project_id)
        upload_state_updated = db.is_dataprovider_allowed_to_upload_and_lock(conn, dp_id)
        # Get the flag use_blocking from the projects table
        uses_blocking = get_project_column(conn, project_id, 'uses_blocking')

    if not upload_state_updated:
        return safe_fail_request(403, "This token has already been used to upload clks.")

    log = log.bind(dp_id=dp_id)
    log.info("Receiving CLK data.")

    with opentracing.tracer.start_span('upload-clk-data', child_of=parent_span) as span:
        span.set_tag("project_id", project_id)
        try:
            if headers['Content-Type'] == "application/json":
                span.set_tag("content-type", 'json')
                # TODO: Previously, we were accessing the CLKs in a streaming fashion
                # to avoid parsing the json in one hit. This enables running the web
                # frontend with less memory. However, as connexion is very, very
                # strict about input validation when it comes to json, it will always
                # consume the stream first to validate it against the spec. Thus the
                # backflip to fully reading the CLKs as json into memory. -> issue #184
                handle_encoding_upload_json(project_id, dp_id, get_json(),
                                            receipt_token, uses_blocking,
                                            parent_span=span)
                log.info("Job scheduled to handle users upload")
            elif headers['Content-Type'] == "application/octet-stream":
                span.set_tag("content-type", 'binary')
                log.info("Handling binary CLK upload")
                try:
                    count, size = check_binary_upload_headers(headers)
                    log.info(f"Headers tell us to expect {count} encodings of {size} bytes")
                    span.log_kv({'count': count, 'size': size})
                except Exception:
                    log.warning("Upload failed due to problem with headers in binary upload")
                    raise
                # Check against the project level encoding size (if it has been set)
                if project_encoding_size is not None and size != project_encoding_size:
                    # Fail fast - we haven't stored the encoded data yet
                    return safe_fail_request(400, "Upload 'Hash-Size' doesn't match project settings")

                # TODO actually stream the upload data straight to Minio. Currently we
                # can't because connexion has already read the data before our handler
                # is called! https://github.com/zalando/connexion/issues/592
                # stream = get_stream()
                stream = BytesIO(request.data)
                expected_bytes = binary_format(size).size * count
                log.debug(f"Stream size is {len(request.data)} B, and we expect {expected_bytes} B")
                if len(request.data) != expected_bytes:
                    safe_fail_request(400, "Uploaded data did not match the expected size. "
                                           "Check request headers are correct")
                try:
                    upload_clk_data_binary(project_id, dp_id, stream, receipt_token, count, size)
                except ValueError:
                    safe_fail_request(400, "Uploaded data did not match the expected size. "
                                           "Check request headers are correct.")
            else:
                safe_fail_request(400, "Content Type not supported")
        except ProblemException as e:
            # Have an exception that is safe for the user. We reset the upload
            # state to allow the user to try to upload again.
            log.info(f"Problem occurred, returning status={e.status} - {e.detail}")
            with DBConn() as conn:
                db.set_dataprovider_upload_state(conn, dp_id, state='not_started')
            raise
        except Exception as e:
            log.warning("Unhandled error occurred during data upload")
            log.exception(e)
            with DBConn() as conn:
                db.set_dataprovider_upload_state(conn, dp_id, state='error')
            safe_fail_request(500, "Sorry, the server couldn't handle that request")

    with DBConn() as conn:
        db.set_dataprovider_upload_state(conn, dp_id, state='done')

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id):
        logger.info("All parties data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id, serialize_span(parent_span))

    return {'message': 'Updated', 'receipt_token': receipt_token}, 201
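# A sketch of the validation check_binary_upload_headers presumably performs.
# 'Hash-Size' appears in the error messages above; the 'Hash-Count' header
# name and the exact failure behaviour are assumptions for illustration.
def sketch_check_binary_upload_headers(headers):
    if 'Hash-Count' not in headers or 'Hash-Size' not in headers:
        raise ValueError("Binary uploads require 'Hash-Count' and 'Hash-Size' headers")
    count = int(headers['Hash-Count'])  # number of encodings in the body
    size = int(headers['Hash-Size'])    # bytes per encoding
    return count, size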