def __init__(self, result_type, schema, name, notes, parties, uses_blocking):
    logger.debug("Creating project codes")
    self.result_type = result_type
    self.schema = schema
    self.name = name
    self.notes = notes
    self.number_parties = parties
    self.uses_blocking = uses_blocking
    self.project_id = generate_code()
    logger.debug("Generated project code", pid=self.project_id)
    self.result_token = generate_code()

    # Order is important here
    self.update_tokens = [generate_code() for _ in range(parties)]
def upload_clk_data_binary(project_id, dp_id, raw_stream, count, size=128):
    """
    Save the user provided raw CLK data.
    """
    receipt_token = generate_code()
    filename = Config.BIN_FILENAME_FMT.format(receipt_token)
    # Set the state to 'pending' in the bloomingdata table
    with DBConn() as conn:
        db.insert_encoding_metadata(conn, filename, dp_id, receipt_token, count)
        db.update_encoding_metadata_set_encoding_size(conn, dp_id, size)
    logger.info(f"Storing supplied binary clks of individual size {size} in file: {filename}")

    num_bytes = count * (size + 6)

    logger.debug("Directly storing binary file with index, base64 encoded CLK, popcount")

    # Upload to object store
    logger.info(f"Uploading {count} binary encodings to object store. Total size: {fmt_bytes(num_bytes)}")
    parent_span = g.flask_tracer.get_span()

    with opentracing.tracer.start_span('save-to-minio', child_of=parent_span):
        mc = connect_to_object_store()
        try:
            mc.put_object(Config.MINIO_BUCKET, filename, data=raw_stream, length=num_bytes)
        except (minio.error.InvalidSizeError, minio.error.InvalidArgumentError, minio.error.ResponseError):
            logger.info("Mismatch between expected stream length and header info")
            raise ValueError("Mismatch between expected stream length and header info")

    with opentracing.tracer.start_span('update-database', child_of=parent_span):
        with DBConn() as conn:
            db.update_encoding_metadata(conn, filename, dp_id, 'ready')
            db.set_dataprovider_upload_state(conn, dp_id, True)

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id):
        logger.info("All parties data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id, serialize_span(parent_span))

    return receipt_token
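# --- Illustrative sketch (not part of the service) ---
# Where the `count * (size + 6)` accounting in upload_clk_data_binary comes from:
# each uploaded record carries a small fixed overhead on top of the `size` bytes
# of encoding. The exact record layout is defined by `binary_format` elsewhere in
# the codebase; the struct below is an assumed illustration (4-byte entity index,
# `size` bytes of encoding, 2-byte popcount), which is consistent with size + 6.
import struct

def expected_upload_bytes(count, size):
    record = struct.Struct(f'!I{size}sH')  # assumed layout: index + encoding + popcount
    assert record.size == size + 6
    return count * record.size

# e.g. 1000 encodings of 128 bytes -> 134000 bytes expected on the wire
assert expected_upload_bytes(1000, 128) == 134_000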
def __init__(self, result_type, schema, name, notes, parties):
    logger.debug("Creating project codes")
    self.result_type = result_type
    self.schema = schema
    self.name = name
    self.notes = notes
    self.number_parties = parties
    self.project_id = generate_code()
    logger.debug("Generated project code", pid=self.project_id)
    self.result_token = generate_code()

    # Order is important here
    self.update_tokens = [generate_code() for _ in range(parties)]

    # TODO DELETE?
    self.ready = False
    self.status = 'not ready'
    self.data = {}
    self.result = {}
def __init__(self, project_id, threshold, name, notes):
    self.project_id = project_id
    self.name = name
    self.notes = notes
    self.threshold = threshold
    self.run_id = generate_code()
    logger.info("Created run id", rid=self.run_id)

    self.type = 'no_mapping' \
        if db.get_project_column(db.get_db(), project_id, 'result_type') == 'similarity_scores' \
        else 'default'
def upload_json_clk_data(dp_id, clk_json, parent_span):
    """
    Convert user provided encodings from a json array of base64 data into a
    newline separated file of base64 data.

    Note this implementation is non-streaming.
    """
    if 'clks' not in clk_json or len(clk_json['clks']) < 1:
        safe_fail_request(400, message="Missing CLKs information")

    receipt_token = generate_code()

    filename = Config.RAW_FILENAME_FMT.format(receipt_token)
    logger.info("Storing user {} supplied clks from json".format(dp_id))

    with opentracing.tracer.start_span('clk-splitting', child_of=parent_span) as span:
        count = len(clk_json['clks'])
        span.set_tag("clks", count)
        data = b''.join(''.join(clk.split('\n')).encode() + b'\n' for clk in clk_json['clks'])

        num_bytes = len(data)
        span.set_tag("num_bytes", num_bytes)
        buffer = BytesIO(data)

    logger.info(f"Received {count} encodings. Uploading {fmt_bytes(num_bytes)} to object store")

    with opentracing.tracer.start_span('save-to-quarantine', child_of=parent_span) as span:
        span.set_tag('filename', filename)
        mc = connect_to_object_store()
        mc.put_object(Config.MINIO_BUCKET, filename, data=buffer, length=num_bytes)

    with opentracing.tracer.start_span('update-db', child_of=parent_span) as span:
        with DBConn() as conn:
            db.insert_encoding_metadata(conn, filename, dp_id, receipt_token, count)

    return receipt_token, filename
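# --- Illustrative sketch (not part of the service) ---
# The 'clk-splitting' span in upload_json_clk_data turns the JSON array of
# base64 CLKs into a newline separated byte blob: embedded newlines are stripped
# from each CLK and every CLK is written out as one newline-terminated line.
# The input below is made-up example data.
example_clk_json = {'clks': ['QUJD\nREVG', 'R0hJSktM']}
example_data = b''.join(
    ''.join(clk.split('\n')).encode() + b'\n'
    for clk in example_clk_json['clks'])
assert example_data == b'QUJDREVG\nR0hJSktM\n'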
def _put_placeholder_empty_file(mc, log):
    sims = array.array('d')
    dset_is0 = array.array('I')
    dset_is1 = array.array('I')
    rec_is0 = array.array('I')
    rec_is1 = array.array('I')
    candidate_pairs = sims, (dset_is0, dset_is1), (rec_is0, rec_is1)
    empty_file_iter, empty_file_size \
        = anonlink.serialization.dump_candidate_pairs_iter(candidate_pairs)
    empty_file_name = Config.SIMILARITY_SCORES_FILENAME_FMT.format(generate_code(12))
    empty_file_stream = iterable_to_stream(empty_file_iter)
    try:
        mc.put_object(Config.MINIO_BUCKET, empty_file_name,
                      empty_file_stream, empty_file_size)
    except minio.ResponseError:
        log.warning("Failed to store empty result in minio.")
        raise
    return 0, empty_file_size, empty_file_name
def _merge_files(mc, log, file0, file1):
    num0, filesize0, filename0 = file0
    num1, filesize1, filename1 = file1

    total_num = num0 + num1
    file0_stream = mc.get_object(Config.MINIO_BUCKET, filename0)
    file1_stream = mc.get_object(Config.MINIO_BUCKET, filename1)

    merged_file_iter, merged_file_size \
        = anonlink.serialization.merge_streams_iter(
            (file0_stream, file1_stream), sizes=(filesize0, filesize1))

    merged_file_name = Config.SIMILARITY_SCORES_FILENAME_FMT.format(generate_code(12))
    merged_file_stream = iterable_to_stream(merged_file_iter)
    try:
        mc.put_object(Config.MINIO_BUCKET, merged_file_name,
                      merged_file_stream, merged_file_size)
    except minio.ResponseError:
        log.warning("Failed to store merged result in minio.")
        raise

    for del_err in mc.remove_objects(Config.MINIO_BUCKET, (filename0, filename1)):
        log.warning(f"Failed to delete result file "
                    f"{del_err.object_name}. {del_err}")

    return total_num, merged_file_size, merged_file_name
def compute_filter_similarity(package, project_id, run_id, threshold, encoding_size, parent_span=None):
    """Compute filter similarity between a chunk of filters in dataprovider 1,
    and a chunk of filters in dataprovider 2.

    :param package: A list of (chunk_dp1, chunk_dp2) chunk pairs as produced by
        ``anonlink.concurrency.split_to_chunks``.
    :param project_id:
    :param run_id:
    :param threshold:
    :param encoding_size: The size in bytes of each encoded entry
    :param parent_span: A serialized opentracing span context.
    :returns: A 3-tuple: (num_results, result size in bytes, results_filename_in_object_store)
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    task_span = compute_filter_similarity.span

    def new_child_span(name, parent_scope=None):
        if parent_scope is None:
            parent_scope = compute_filter_similarity
        return compute_filter_similarity.tracer.start_active_span(
            name, child_of=parent_scope.span)

    log.debug(f"Computing similarities for {len(package)} chunks of filters")
    log.debug("Checking that the resource exists (in case of run being canceled/deleted)")
    assert_valid_run(project_id, run_id, log)

    # chunk_info_dp1, chunk_info_dp2 = chunk_info

    def reindex_using_encoding_ids(recordarray, encoding_id_list):
        # Map results from "index in chunk" to encoding id.
        return array.array('I', [encoding_id_list[i] for i in recordarray])

    num_results = 0
    num_comparisons = 0
    sim_results = []

    with DBConn() as conn:
        if len(package) > 1:  # multiple full blocks in one package
            with new_child_span(f'fetching-encodings of package of size {len(package)}'):
                package = get_encoding_chunks(conn, package, encoding_size=encoding_size)
        else:  # this chunk is all part of one block
            with new_child_span('fetching-encodings of package with 1 chunk'):
                chunk_info_dp1, chunk_info_dp2 = package[0]
                chunk_with_ids_dp1, chunk_dp1_size = get_encoding_chunk(
                    conn, chunk_info_dp1, encoding_size)
                entity_ids_dp1, chunk_dp1 = zip(*chunk_with_ids_dp1)
                chunk_info_dp1['encodings'] = chunk_dp1
                chunk_info_dp1['entity_ids'] = entity_ids_dp1
                chunk_with_ids_dp2, chunk_dp2_size = get_encoding_chunk(
                    conn, chunk_info_dp2, encoding_size)
                entity_ids_dp2, chunk_dp2 = zip(*chunk_with_ids_dp2)
                chunk_info_dp2['encodings'] = chunk_dp2
                chunk_info_dp2['entity_ids'] = entity_ids_dp2
    log.debug('All encodings for package are fetched and deserialized')

    log.debug("Calculating filter similarities for work package")
    with new_child_span('comparing-encodings') as parent_scope:
        for chunk_dp1, chunk_dp2 in package:
            enc_dp1 = chunk_dp1['encodings']
            enc_dp1_size = len(enc_dp1)
            enc_dp2 = chunk_dp2['encodings']
            enc_dp2_size = len(enc_dp2)
            assert enc_dp1_size > 0, "Zero sized chunk in dp1"
            assert enc_dp2_size > 0, "Zero sized chunk in dp2"
            try:
                sims, (rec_is0, rec_is1) = anonlink.similarities.dice_coefficient_accelerated(
                    datasets=(enc_dp1, enc_dp2),
                    threshold=threshold,
                    k=min(enc_dp1_size, enc_dp2_size))
            except NotImplementedError as e:
                log.warning(f"Encodings couldn't be compared using anonlink. {e}")
                return
            rec_is0 = reindex_using_encoding_ids(rec_is0, chunk_dp1['entity_ids'])
            rec_is1 = reindex_using_encoding_ids(rec_is1, chunk_dp2['entity_ids'])
            num_results += len(sims)
            num_comparisons += enc_dp1_size * enc_dp2_size
            sim_results.append((sims, (rec_is0, rec_is1),
                                chunk_dp1['datasetIndex'], chunk_dp2['datasetIndex']))
        log.debug(f'comparison is done. {num_comparisons} comparisons got {num_results} pairs above the threshold')

    # ##### progress reporting
    log.debug('Encoding similarities calculated')

    with new_child_span('update-comparison-progress') as scope:
        # Update the number of comparisons completed
        save_current_progress(num_comparisons, run_id)
        scope.span.log_kv({
            'comparisons': num_comparisons,
            'num_similar': num_results
        })
        log.debug("Comparisons: {}, Links above threshold: {}".format(
            num_comparisons, num_results))

    # ##### results into file into minio
    with new_child_span('save-comparison-results-to-minio'):
        file_iters = []
        file_sizes = []
        for sims, (rec_is0, rec_is1), dp1_ds_idx, dp2_ds_idx in sim_results:
            num_sims = len(sims)
            if num_sims:
                # Make index arrays for serialization
                index_1 = array.array('I', (dp1_ds_idx,)) * num_sims
                index_2 = array.array('I', (dp2_ds_idx,)) * num_sims
                chunk_results = sims, (index_1, index_2), (rec_is0, rec_is1),
                bytes_iter, file_size \
                    = anonlink.serialization.dump_candidate_pairs_iter(chunk_results)
                file_iters.append(iterable_to_stream(bytes_iter))
                file_sizes.append(file_size)

        if len(file_iters) > 1:
            # Merge the per-chunk result streams into one ordered stream first.
            merged_file_iter, merged_file_size \
                = anonlink.serialization.merge_streams_iter(file_iters, sizes=file_sizes)
            merged_file_iter = iterable_to_stream(merged_file_iter)
        elif len(file_iters) == 1:
            merged_file_iter = file_iters[0]
            merged_file_size = file_sizes[0]
        else:
            return 0, None, None

        result_filename = Config.SIMILARITY_SCORES_FILENAME_FMT.format(generate_code(12))
        task_span.log_kv({"edges": num_results})
        log.info("Writing {} intermediate results to file: {}".format(
            num_results, result_filename))

        mc = connect_to_object_store()
        try:
            mc.put_object(Config.MINIO_BUCKET, result_filename,
                          merged_file_iter, merged_file_size)
        except minio.ResponseError as err:
            log.warning("Failed to store result in minio: {}".format(err))
            raise

    return num_results, merged_file_size, result_filename
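# --- Illustrative sketch (not part of the service) ---
# The "index arrays for serialization" trick used above: repeating a one-element
# unsigned-int array produces a constant dataset-index column with one entry per
# similarity score, ready to sit alongside the record index arrays.
import array

dataset_index, num_sims = 3, 4  # made-up values
assert array.array('I', (dataset_index,)) * num_sims == array.array('I', [3, 3, 3, 3])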
def test_insert_dp_no_project_fails(self):
    conn, cur = _get_conn_and_cursor()
    project_id = generate_code()
    dp_auth = generate_code()
    with raises(psycopg2.errors.ForeignKeyViolation):
        insert_dataprovider(cur, auth_token=dp_auth, project_id=project_id)
def compute_filter_similarity(chunk_info, project_id, run_id, threshold, encoding_size, parent_span=None):
    """Compute filter similarity between a chunk of filters in dataprovider 1,
    and a chunk of filters in dataprovider 2.

    :param chunk_info: Chunk info returned by ``anonlink.concurrency.split_to_chunks``.
        Additionally, "storeFilename" is added to each dataset chunk.
    :param project_id:
    :param run_id:
    :param threshold:
    :param encoding_size: The size in bytes of each encoded entry
    :param parent_span: A serialized opentracing span context.
    :returns: A 3-tuple: (num_results, result file size in bytes, results_filename_in_object_store)
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Computing similarity for a chunk of filters")
    span = compute_filter_similarity.span
    log.debug("Checking that the resource exists (in case of job being canceled)")
    with DBConn() as db:
        if not check_project_exists(db, project_id) or not check_run_exists(db, project_id, run_id):
            log.info("Failing task as project or run not found in database.")
            raise DBResourceMissing("project or run not found in database")

    chunk_info_dp1, chunk_info_dp2 = chunk_info

    t0 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 1")
    chunk_dp1, chunk_dp1_size = get_chunk_from_object_store(chunk_info_dp1, encoding_size)

    t1 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 2")
    chunk_dp2, chunk_dp2_size = get_chunk_from_object_store(chunk_info_dp2, encoding_size)

    t2 = time.time()
    span.log_kv({'event': 'chunks are fetched and deserialized'})
    log.debug("Calculating filter similarity")
    span.log_kv({'size1': chunk_dp1_size, 'size2': chunk_dp2_size})
    chunk_results = anonlink.concurrency.process_chunk(
        chunk_info,
        (chunk_dp1, chunk_dp2),
        anonlink.similarities.dice_coefficient_accelerated,
        threshold,
        k=min(chunk_dp1_size, chunk_dp2_size))

    t3 = time.time()
    span.log_kv({'event': 'similarities calculated'})

    # Update the number of comparisons completed
    comparisons_computed = chunk_dp1_size * chunk_dp2_size
    save_current_progress(comparisons_computed, run_id)

    t4 = time.time()

    sims, _, _ = chunk_results
    num_results = len(sims)

    if num_results:
        result_filename = Config.SIMILARITY_SCORES_FILENAME_FMT.format(generate_code(12))
        log.info("Writing {} intermediate results to file: {}".format(num_results, result_filename))

        bytes_iter, file_size \
            = anonlink.serialization.dump_candidate_pairs_iter(chunk_results)
        iter_stream = iterable_to_stream(bytes_iter)

        mc = connect_to_object_store()
        try:
            mc.put_object(Config.MINIO_BUCKET, result_filename, iter_stream, file_size)
        except minio.ResponseError:
            log.warning("Failed to store result in minio")
            raise
    else:
        result_filename = None
        file_size = None
    t5 = time.time()

    log.info("run={} Comparisons: {}, Links above threshold: {}".format(
        run_id, comparisons_computed, num_results))
    log.info("Prep: {:.3f} + {:.3f}, Solve: {:.3f}, Progress: {:.3f}, Save: {:.3f}, Total: {:.3f}".format(
        t1 - t0, t2 - t1, t3 - t2, t4 - t3, t5 - t4, t5 - t0))
    return num_results, file_size, result_filename
def compute_filter_similarity(chunk_info, project_id, run_id, threshold, encoding_size, parent_span=None):
    """Compute filter similarity between a chunk of filters in dataprovider 1,
    and a chunk of filters in dataprovider 2.

    :param dict chunk_info: A chunk returned by ``anonlink.concurrency.split_to_chunks``.
    :param project_id:
    :param run_id:
    :param threshold:
    :param encoding_size: The size in bytes of each encoded entry
    :param parent_span: A serialized opentracing span context.
    :returns: A 3-tuple: (num_results, result size in bytes, results_filename_in_object_store)
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    task_span = compute_filter_similarity.span

    def new_child_span(name, parent_scope=None):
        if parent_scope is None:
            parent_scope = compute_filter_similarity
        return compute_filter_similarity.tracer.start_active_span(
            name, child_of=parent_scope.span)

    log.debug("Computing similarity for a chunk of filters")
    log.debug("Checking that the resource exists (in case of run being canceled/deleted)")
    assert_valid_run(project_id, run_id, log)

    chunk_info_dp1, chunk_info_dp2 = chunk_info

    with DBConn() as conn:
        with new_child_span('fetching-encodings') as parent_scope:
            with new_child_span('fetching-left-encodings', parent_scope):
                log.debug("Fetching and deserializing chunk of filters for dataprovider 1")
                chunk_with_ids_dp1, chunk_dp1_size = get_encoding_chunk(
                    conn, chunk_info_dp1, encoding_size)
                entity_ids_dp1, chunk_dp1 = zip(*chunk_with_ids_dp1)
            with new_child_span('fetching-right-encodings', parent_scope):
                log.debug("Fetching and deserializing chunk of filters for dataprovider 2")
                chunk_with_ids_dp2, chunk_dp2_size = get_encoding_chunk(
                    conn, chunk_info_dp2, encoding_size)
                entity_ids_dp2, chunk_dp2 = zip(*chunk_with_ids_dp2)

    log.debug('Both chunks are fetched and deserialized')
    task_span.log_kv({
        'size1': chunk_dp1_size,
        'size2': chunk_dp2_size,
        'chunk_info': chunk_info
    })

    assert chunk_dp1_size > 0, "Zero sized chunk in dp1"
    assert chunk_dp2_size > 0, "Zero sized chunk in dp2"

    with new_child_span('comparing-encodings') as parent_scope:
        log.debug("Calculating filter similarity")
        with new_child_span('dice-call', parent_scope):
            try:
                sims, (rec_is0, rec_is1) = anonlink.similarities.dice_coefficient_accelerated(
                    datasets=(chunk_dp1, chunk_dp2),
                    threshold=threshold,
                    k=min(chunk_dp1_size, chunk_dp2_size))
            except NotImplementedError:
                log.warning("Encodings couldn't be compared using anonlink.")
                return

        with new_child_span('reindex-call', parent_scope):
            def reindex_using_encoding_ids(recordarray, encoding_id_list):
                # Map results from "index in chunk" to encoding id.
                return array.array('I', [encoding_id_list[i] for i in recordarray])

            rec_is0 = reindex_using_encoding_ids(rec_is0, entity_ids_dp1)
            rec_is1 = reindex_using_encoding_ids(rec_is1, entity_ids_dp2)

    log.debug('Encoding similarities calculated')

    with new_child_span('update-comparison-progress') as scope:
        # Update the number of comparisons completed
        comparisons_computed = chunk_dp1_size * chunk_dp2_size
        save_current_progress(comparisons_computed, run_id)
        scope.span.log_kv({
            'comparisons': comparisons_computed,
            'num_similar': len(sims)
        })
        log.debug("Comparisons: {}, Links above threshold: {}".format(
            comparisons_computed, len(sims)))

    with new_child_span('save-comparison-results-to-minio'):
        num_results = len(sims)
        if num_results:
            result_filename = Config.SIMILARITY_SCORES_FILENAME_FMT.format(generate_code(12))
            task_span.log_kv({"edges": num_results})
            log.info("Writing {} intermediate results to file: {}".format(
                num_results, result_filename))

            # Make index arrays for serialization
            index_1 = array.array('I', (chunk_info_dp1["datasetIndex"],)) * num_results
            index_2 = array.array('I', (chunk_info_dp2["datasetIndex"],)) * num_results
            chunk_results = sims, (index_1, index_2), (rec_is0, rec_is1),

            bytes_iter, file_size \
                = anonlink.serialization.dump_candidate_pairs_iter(chunk_results)
            iter_stream = iterable_to_stream(bytes_iter)

            mc = connect_to_object_store()
            try:
                mc.put_object(Config.MINIO_BUCKET, result_filename, iter_stream, file_size)
            except minio.ResponseError:
                log.warning("Failed to store result in minio")
                raise
        else:
            result_filename = None
            file_size = None

    return num_results, file_size, result_filename
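# --- Illustrative sketch (not part of the service) ---
# What the reindex step above does: anonlink returns record indices relative to
# the fetched chunk, and reindex_using_encoding_ids maps them back to the stored
# encoding ids fetched alongside the chunk. Values below are made up.
import array

def reindex_using_encoding_ids(recordarray, encoding_id_list):
    return array.array('I', [encoding_id_list[i] for i in recordarray])

entity_ids = [103, 57, 208]                  # encoding ids, in fetch order
matches_in_chunk = array.array('I', [2, 0])  # indices returned by the dice call
assert reindex_using_encoding_ids(matches_in_chunk, entity_ids) == array.array('I', [208, 103])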
def compute_filter_similarity(chunk_info_dp1, chunk_info_dp2, project_id, run_id, threshold, encoding_size, parent_span=None):
    """Compute filter similarity between a chunk of filters in dataprovider 1,
    and a chunk of filters in dataprovider 2.

    :param chunk_info_dp1: A tuple containing:
        - object store filename
        - Chunk start index
        - Chunk stop index
    :param chunk_info_dp2:
    :param project_id:
    :param threshold:
    :param encoding_size: The size in bytes of each encoded entry
    :param parent_span: A serialized opentracing span context.
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Computing similarity for a chunk of filters")
    span = compute_filter_similarity.span
    log.debug("Checking that the resource exists (in case of job being canceled)")
    with DBConn() as db:
        if not check_project_exists(db, project_id) or not check_run_exists(db, project_id, run_id):
            log.info("Stopping as project or run not found in database.")
            return None

    t0 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 1")
    chunk_dp1, chunk_dp1_size = get_chunk_from_object_store(chunk_info_dp1, encoding_size)

    t1 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 2")
    chunk_dp2, chunk_dp2_size = get_chunk_from_object_store(chunk_info_dp2, encoding_size)

    t2 = time.time()
    span.log_kv({'event': 'chunks are fetched and deserialized'})
    log.debug("Calculating filter similarity")
    span.log_kv({'size1': chunk_dp1_size, 'size2': chunk_dp2_size})
    chunk_results = anonlink.entitymatch.calculate_filter_similarity(chunk_dp1, chunk_dp2,
                                                                     threshold=threshold,
                                                                     k=min(chunk_dp1_size, chunk_dp2_size),
                                                                     use_python=False)

    t3 = time.time()
    span.log_kv({'event': 'similarities calculated'})

    # Update the number of comparisons completed
    comparisons_computed = chunk_dp1_size * chunk_dp2_size
    save_current_progress(comparisons_computed, run_id)

    t4 = time.time()

    partial_sparse_result = []
    # offset by the chunk's start index
    offset_dp1 = chunk_info_dp1[1]
    offset_dp2 = chunk_info_dp2[1]
    log.debug("Offset DP1 by: {}, DP2 by: {}".format(offset_dp1, offset_dp2))
    for (ia, score, ib) in chunk_results:
        partial_sparse_result.append((ia + offset_dp1, ib + offset_dp2, score))

    t5 = time.time()

    num_results = len(partial_sparse_result)
    if num_results > 0:
        result_filename = 'chunk-res-{}.csv'.format(generate_code(12))
        log.info("Writing {} intermediate results to file: {}".format(num_results, result_filename))

        with open(result_filename, 'wt') as f:
            csvwriter = csv.writer(f)
            csvwriter.writerows(partial_sparse_result)

        # Now write these to the object store, and return the filename and summary.
        # Will write a csv file for now
        mc = connect_to_object_store()
        try:
            mc.fput_object(Config.MINIO_BUCKET, result_filename, result_filename)
        except minio.ResponseError:
            log.warning("Failed to store result in minio")
            raise

        # If we don't delete the file we *do* run out of space
        os.remove(result_filename)
    else:
        result_filename = None
    t6 = time.time()

    log.info("run={} Comparisons: {}, Links above threshold: {}".format(
        run_id, comparisons_computed, len(chunk_results)))
    log.info("Prep: {:.3f} + {:.3f}, Solve: {:.3f}, Progress: {:.3f}, Offset: {:.3f}, Save: {:.3f}, Total: {:.3f}".format(
        t1 - t0, t2 - t1, t3 - t2, t4 - t3, t5 - t4, t6 - t5, t6 - t0))
    return num_results, result_filename
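# --- Illustrative sketch (not part of the service) ---
# The offsetting step above: anonlink returns indices relative to each chunk, so
# the chunk start indices (element 1 of each chunk_info tuple) are added to get
# dataset-global coordinates for the sparse result. Values below are made up.
chunk_info_dp1 = ('clks-a.bin', 1000, 2000)    # (filename, start, stop)
chunk_info_dp2 = ('clks-b.bin', 5000, 6000)
chunk_results = [(3, 0.92, 7), (10, 0.87, 4)]  # (index_a, score, index_b) within chunks
offset_dp1, offset_dp2 = chunk_info_dp1[1], chunk_info_dp2[1]
partial_sparse_result = [(ia + offset_dp1, ib + offset_dp2, score)
                         for ia, score, ib in chunk_results]
assert partial_sparse_result == [(1003, 5007, 0.92), (1010, 5004, 0.87)]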
def project_clks_post(project_id):
    """
    Update a project to provide encoded PII data.
    """
    headers = request.headers
    log, parent_span = bind_log_and_span(project_id)
    log.debug("Starting data upload request")
    token = precheck_upload_token(project_id, headers, parent_span)
    receipt_token = generate_code()
    with DBConn() as conn:
        dp_id = db.get_dataprovider_id(conn, token)
        project_encoding_size = db.get_project_schema_encoding_size(conn, project_id)
        upload_state_updated = db.is_dataprovider_allowed_to_upload_and_lock(conn, dp_id)
        # get flag use_blocking from table projects
        uses_blocking = get_project_column(conn, project_id, 'uses_blocking')

    if not upload_state_updated:
        return safe_fail_request(403, "This token has already been used to upload clks.")

    log = log.bind(dp_id=dp_id)
    log.info("Receiving CLK data.")

    with opentracing.tracer.start_span('upload-clk-data', child_of=parent_span) as span:
        span.set_tag("project_id", project_id)
        try:
            if headers['Content-Type'] == "application/json":
                span.set_tag("content-type", 'json')
                # TODO: Previously, we were accessing the CLKs in a streaming fashion to avoid
                # parsing the json in one hit. This enables running the web frontend with less
                # memory. However, as connexion is very, very strict about input validation when
                # it comes to json, it will always consume the stream first to validate it against
                # the spec. Thus the backflip to fully reading the CLKs as json into memory.
                # -> issue #184
                handle_encoding_upload_json(project_id, dp_id, get_json(), receipt_token,
                                            uses_blocking, parent_span=span)
                log.info("Job scheduled to handle users upload")
            elif headers['Content-Type'] == "application/octet-stream":
                span.set_tag("content-type", 'binary')
                log.info("Handling binary CLK upload")
                try:
                    count, size = check_binary_upload_headers(headers)
                    log.info(f"Headers tell us to expect {count} encodings of {size} bytes")
                    span.log_kv({'count': count, 'size': size})
                except Exception:
                    log.warning("Upload failed due to problem with headers in binary upload")
                    raise
                # Check against project level encoding size (if it has been set)
                if project_encoding_size is not None and size != project_encoding_size:
                    # fail fast - we haven't stored the encoded data yet
                    return safe_fail_request(400, "Upload 'Hash-Size' doesn't match project settings")

                # TODO actually stream the upload data straight to Minio. Currently we can't because
                # connexion has already read the data before our handler is called!
                # https://github.com/zalando/connexion/issues/592
                # stream = get_stream()
                stream = BytesIO(request.data)
                expected_bytes = binary_format(size).size * count
                log.debug(f"Stream size is {len(request.data)} B, and we expect {expected_bytes} B")
                if len(request.data) != expected_bytes:
                    safe_fail_request(400, "Uploaded data did not match the expected size. Check request headers are correct")
                try:
                    upload_clk_data_binary(project_id, dp_id, stream, receipt_token, count, size)
                except ValueError:
                    safe_fail_request(400, "Uploaded data did not match the expected size. Check request headers are correct.")
            else:
                safe_fail_request(400, "Content Type not supported")
        except ProblemException as e:
            # Have an exception that is safe for the user. We reset the upload state to
            # allow the user to try the upload again.
            log.info(f"Problem occurred, returning status={e.status} - {e.detail}")
            with DBConn() as conn:
                db.set_dataprovider_upload_state(conn, dp_id, state='not_started')
            raise
        except Exception as e:
            log.warning("Unhandled error occurred during data upload")
            log.exception(e)
            with DBConn() as conn:
                db.set_dataprovider_upload_state(conn, dp_id, state='error')
            safe_fail_request(500, "Sorry, the server couldn't handle that request")

    with DBConn() as conn:
        db.set_dataprovider_upload_state(conn, dp_id, state='done')

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id):
        logger.info("All parties data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id, serialize_span(parent_span))

    return {'message': 'Updated', 'receipt_token': receipt_token}, 201
def project_binaryclks_post(project_id):
    """
    Update a project to provide encoded PII data.
    """
    log, parent_span = bind_log_and_span(project_id)
    headers = request.headers
    token = precheck_upload_token(project_id, headers, parent_span)
    with DBConn() as conn:
        dp_id = db.get_dataprovider_id(conn, token)
        project_encoding_size = db.get_project_schema_encoding_size(conn, project_id)
        upload_state_updated = db.is_dataprovider_allowed_to_upload_and_lock(conn, dp_id)

    if not upload_state_updated:
        return safe_fail_request(403, "This token has already been used to upload clks.")

    log = log.bind(dp_id=dp_id)
    log.info("Receiving CLK data.")
    receipt_token = generate_code()

    with opentracing.tracer.start_span('upload-clk-data', child_of=parent_span) as span:
        span.set_tag("project_id", project_id)
        try:
            if headers['Content-Type'] == "application/octet-stream":
                span.set_tag("content-type", 'binary')
                log.info("Handling binary CLK upload")
                try:
                    count, size = check_binary_upload_headers(headers)
                    log.info(f"Headers tell us to expect {count} encodings of {size} bytes")
                    span.log_kv({'count': count, 'size': size})
                except Exception:
                    log.warning("Upload failed due to problem with headers in binary upload")
                    raise
                # Check against project level encoding size (if it has been set)
                if project_encoding_size is not None and size != project_encoding_size:
                    # fail fast - we haven't stored the encoded data yet
                    return safe_fail_request(400, "Upload 'Hash-Size' doesn't match project settings")

                # TODO actually stream the upload data straight to Minio. Currently we can't because
                # connexion has already read the data before our handler is called!
                # https://github.com/zalando/connexion/issues/592
                # stream = get_stream()
                stream = BytesIO(request.data)
                converted_stream = include_encoding_id_in_binary_stream(stream, size, count)
                expected_bytes = size * count
                log.debug(f"Stream size is {len(request.data)} B, and we expect {expected_bytes} B")
                if len(request.data) != expected_bytes:
                    safe_fail_request(400, "Uploaded data did not match the expected size. Check request headers are correct")
                try:
                    upload_clk_data_binary(project_id, dp_id, converted_stream, receipt_token, count, size)
                except ValueError:
                    safe_fail_request(400, "Uploaded data did not match the expected size. Check request headers are correct.")
            else:
                safe_fail_request(400, "Content Type not supported")
        except Exception:
            log.warning("The dataprovider was not able to upload their clks,"
                        " re-enable the corresponding upload token to be used.")
            with DBConn() as conn:
                db.set_dataprovider_upload_state(conn, dp_id, state='error')
            raise

    with DBConn() as conn:
        db.set_dataprovider_upload_state(conn, dp_id, state='done')

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id):
        logger.info("All parties data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id, serialize_span(parent_span))

    return {'message': 'Updated', 'receipt_token': receipt_token}, 201