    def __init__(self, result_type, schema, name, notes, parties, uses_blocking):
        logger.debug("Creating project codes")
        self.result_type = result_type
        self.schema = schema
        self.name = name
        self.notes = notes
        self.number_parties = parties
        self.uses_blocking = uses_blocking

        self.project_id = generate_code()
        logger.debug("Generated project code", pid=self.project_id)
        self.result_token = generate_code()

        # Order is important here
        self.update_tokens = [generate_code() for _ in range(parties)]
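
For context, a minimal sketch of the token-generation pattern used above. The generate_code shown here is a stand-in for the service's helper, which is assumed to return a URL-safe random string:

import secrets

def generate_code(nbytes=16):
    # Stand-in for the service's generate_code helper (assumed behaviour).
    return secrets.token_urlsafe(nbytes)

# One update token per party; list position decides which party receives which
# token, which is why the constructor notes "Order is important here".
parties = 3
update_tokens = [generate_code() for _ in range(parties)]
for party_index, token in enumerate(update_tokens):
    print(f"party {party_index} -> {token}")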
Example #2
def upload_clk_data_binary(project_id, dp_id, raw_stream, count, size=128):
    """
    Save the user provided raw CLK data.

    """
    receipt_token = generate_code()
    filename = Config.BIN_FILENAME_FMT.format(receipt_token)
    # Set the state to 'pending' in the bloomingdata table
    with DBConn() as conn:
        db.insert_encoding_metadata(conn, filename, dp_id, receipt_token,
                                    count)
        db.update_encoding_metadata_set_encoding_size(conn, dp_id, size)
    logger.info(
        f"Storing supplied binary clks of individual size {size} in file: {filename}"
    )

    num_bytes = count * (size + 6)

    logger.debug(
        "Directly storing binary file with index, base64 encoded CLK, popcount"
    )

    # Upload to object store
    logger.info(
        f"Uploading {count} binary encodings to object store. Total size: {fmt_bytes(num_bytes)}"
    )
    parent_span = g.flask_tracer.get_span()

    with opentracing.tracer.start_span('save-to-minio',
                                       child_of=parent_span) as span:
        mc = connect_to_object_store()
        try:
            mc.put_object(Config.MINIO_BUCKET,
                          filename,
                          data=raw_stream,
                          length=num_bytes)
        except (minio.error.InvalidSizeError, minio.error.InvalidArgumentError,
                minio.error.ResponseError):
            logger.info(
                "Mismatch between expected stream length and header info")
            raise ValueError(
                "Mismatch between expected stream length and header info")

    with opentracing.tracer.start_span('update-database',
                                       child_of=parent_span) as span:
        with DBConn() as conn:
            db.update_encoding_metadata(conn, filename, dp_id, 'ready')
            db.set_dataprovider_upload_state(conn, dp_id, True)

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id):
        logger.info("All parties data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id,
                                        serialize_span(parent_span))

    return receipt_token
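
The expected object size follows directly from the declared header values: each stored record is the encoding itself plus six bytes of per-record overhead, exactly as num_bytes is computed above. A quick sanity check of that arithmetic:

# Mirrors num_bytes = count * (size + 6) from upload_clk_data_binary above.
count, size = 1000, 128
num_bytes = count * (size + 6)
assert num_bytes == 134_000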
Example #3
    def __init__(self, result_type, schema, name, notes, parties):
        logger.debug("Creating project codes")
        self.result_type = result_type
        self.schema = schema
        self.name = name
        self.notes = notes
        self.number_parties = parties

        self.project_id = generate_code()
        logger.debug("Generated project code", pid=self.project_id)
        self.result_token = generate_code()

        # Order is important here
        self.update_tokens = [generate_code() for _ in range(parties)]

        # TODO DELETE?
        self.ready = False
        self.status = 'not ready'
        self.data = {}
        self.result = {}
Example #4
    def __init__(self, project_id, threshold, name, notes):
        self.project_id = project_id
        self.name = name
        self.notes = notes
        self.threshold = threshold
        self.run_id = generate_code()
        logger.info("Created run id", rid=self.run_id)

        self.type = 'no_mapping' \
            if db.get_project_column(db.get_db(), project_id, 'result_type') == 'similarity_scores' \
            else 'default'
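
The run type is derived purely from the project's result_type; the same decision in isolation (result_type values other than 'similarity_scores' are illustrative):

def run_type_for(result_type):
    # Mirrors the conditional above: similarity-score projects skip the mapping step.
    return 'no_mapping' if result_type == 'similarity_scores' else 'default'

assert run_type_for('similarity_scores') == 'no_mapping'
assert run_type_for('some_other_result_type') == 'default'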
Example #5
def upload_json_clk_data(dp_id, clk_json, parent_span):
    """
    Convert user-provided encodings from a json array of base64 data into
    a newline-separated file of base64 data.

    Note this implementation is non-streaming.
    """
    if 'clks' not in clk_json or len(clk_json['clks']) < 1:
        safe_fail_request(400, message="Missing CLKs information")

    receipt_token = generate_code()

    filename = Config.RAW_FILENAME_FMT.format(receipt_token)
    logger.info("Storing user {} supplied clks from json".format(dp_id))

    with opentracing.tracer.start_span('clk-splitting',
                                       child_of=parent_span) as span:
        count = len(clk_json['clks'])
        span.set_tag("clks", count)
        data = b''.join(''.join(clk.split('\n')).encode() + b'\n'
                        for clk in clk_json['clks'])

        num_bytes = len(data)
        span.set_tag("num_bytes", num_bytes)
        buffer = BytesIO(data)

    logger.info(
        f"Received {count} encodings. Uploading {fmt_bytes(num_bytes)} to object store"
    )
    with opentracing.tracer.start_span('save-to-quarantine',
                                       child_of=parent_span) as span:
        span.set_tag('filename', filename)
        mc = connect_to_object_store()
        mc.put_object(Config.MINIO_BUCKET,
                      filename,
                      data=buffer,
                      length=num_bytes)

    with opentracing.tracer.start_span('update-db',
                                       child_of=parent_span) as span:
        with DBConn() as conn:
            db.insert_encoding_metadata(conn, filename, dp_id, receipt_token,
                                        count)

    return receipt_token, filename
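
The JSON-to-newline conversion is small enough to show in isolation; here is the same transformation on a toy input (the base64 strings are illustrative):

from io import BytesIO

clks = ["q83v", "AAEC\nAwQ="]    # base64 strings as a client might send them
data = b''.join(''.join(clk.split('\n')).encode() + b'\n' for clk in clks)
assert data == b"q83v\nAAECAwQ=\n"
buffer = BytesIO(data)           # this is what gets streamed to the object store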
Example #6
def _put_placeholder_empty_file(mc, log):
    sims = array.array('d')
    dset_is0 = array.array('I')
    dset_is1 = array.array('I')
    rec_is0 = array.array('I')
    rec_is1 = array.array('I')
    candidate_pairs = sims, (dset_is0, dset_is1), (rec_is0, rec_is1)
    empty_file_iter, empty_file_size \
        = anonlink.serialization.dump_candidate_pairs_iter(candidate_pairs)
    empty_file_name = Config.SIMILARITY_SCORES_FILENAME_FMT.format(
        generate_code(12))
    empty_file_stream = iterable_to_stream(empty_file_iter)
    try:
        mc.put_object(Config.MINIO_BUCKET, empty_file_name, empty_file_stream,
                      empty_file_size)
    except minio.ResponseError:
        log.warning("Failed to store empty result in minio.")
        raise
    return 0, empty_file_size, empty_file_name
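
For reference, the candidate-pairs structure serialized here is a triple of parallel arrays: similarity scores, a pair of dataset-index arrays, and a pair of record-index arrays. A minimal non-empty counterpart to the empty placeholder above:

import array

# One candidate pair: record 5 of dataset 0 matches record 9 of dataset 1 with score 0.87.
sims = array.array('d', [0.87])
dset_is0 = array.array('I', [0])
dset_is1 = array.array('I', [1])
rec_is0 = array.array('I', [5])
rec_is1 = array.array('I', [9])
candidate_pairs = sims, (dset_is0, dset_is1), (rec_is0, rec_is1)
# Serialized the same way as the placeholder (requires anonlink):
# file_iter, file_size = anonlink.serialization.dump_candidate_pairs_iter(candidate_pairs)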
Example #7
def _merge_files(mc, log, file0, file1):
    num0, filesize0, filename0 = file0
    num1, filesize1, filename1 = file1
    total_num = num0 + num1
    file0_stream = mc.get_object(Config.MINIO_BUCKET, filename0)
    file1_stream = mc.get_object(Config.MINIO_BUCKET, filename1)
    merged_file_iter, merged_file_size \
        = anonlink.serialization.merge_streams_iter(
            (file0_stream, file1_stream), sizes=(filesize0, filesize1))
    merged_file_name = Config.SIMILARITY_SCORES_FILENAME_FMT.format(
        generate_code(12))
    merged_file_stream = iterable_to_stream(merged_file_iter)
    try:
        mc.put_object(Config.MINIO_BUCKET, merged_file_name,
                      merged_file_stream, merged_file_size)
    except minio.ResponseError:
        log.warning("Failed to store merged result in minio.")
        raise
    for del_err in mc.remove_objects(Config.MINIO_BUCKET,
                                     (filename0, filename1)):
        log.warning(f"Failed to delete result file "
                    f"{del_err.object_name}. {del_err}")
    return total_num, merged_file_size, merged_file_name
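
Because _merge_files consumes and produces the same (num, filesize, filename) triple, a list of intermediate result files can be folded pairwise. A hedged sketch of such a reduction; merge_all is a hypothetical helper built on the two functions above, with mc and log as before:

from functools import reduce

def merge_all(mc, log, files):
    """Hypothetical helper: fold (num, filesize, filename) triples into one result file."""
    if not files:
        # No worker produced results; fall back to the empty placeholder above.
        return _put_placeholder_empty_file(mc, log)
    return reduce(lambda left, right: _merge_files(mc, log, left, right), files)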
Example #8
def compute_filter_similarity(package,
                              project_id,
                              run_id,
                              threshold,
                              encoding_size,
                              parent_span=None):
    """Compute filter similarity between a chunk of filters in dataprovider 1,
    and a chunk of filters in dataprovider 2.

    :param dict chunk_info:
        A chunk returned by ``anonlink.concurrency.split_to_chunks``.
    :param project_id:
    :param run_id:
    :param threshold:
    :param encoding_size: The size in bytes of each encoded entry
    :param parent_span: A serialized opentracing span context.
    :return: A 3-tuple: (num_results, result size in bytes, results_filename_in_object_store)
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    task_span = compute_filter_similarity.span

    def new_child_span(name, parent_scope=None):
        if parent_scope is None:
            parent_scope = compute_filter_similarity
        return compute_filter_similarity.tracer.start_active_span(
            name, child_of=parent_scope.span)

    log.debug(f"Computing similarities for {len(package)} chunks of filters")
    log.debug(
        "Checking that the resource exists (in case of run being canceled/deleted)"
    )
    assert_valid_run(project_id, run_id, log)

    def reindex_using_encoding_ids(recordarray, encoding_id_list):
        # Map results from "index in chunk" back to the stored encoding ids.
        return array.array('I', [encoding_id_list[i] for i in recordarray])

    num_results = 0
    num_comparisons = 0
    sim_results = []

    with DBConn() as conn:
        if len(package) > 1:  # multiple full blocks in one package
            with new_child_span(
                    f'fetching-encodings of package of size {len(package)}'):
                package = get_encoding_chunks(conn,
                                              package,
                                              encoding_size=encoding_size)
        else:  # this chunk is all part of one block
            with new_child_span('fetching-encodings of package with 1 chunk'):
                chunk_info_dp1, chunk_info_dp2 = package[0]
                chunk_with_ids_dp1, chunk_dp1_size = get_encoding_chunk(
                    conn, chunk_info_dp1, encoding_size)
                entity_ids_dp1, chunk_dp1 = zip(*chunk_with_ids_dp1)
                chunk_info_dp1['encodings'] = chunk_dp1
                chunk_info_dp1['entity_ids'] = entity_ids_dp1
                chunk_with_ids_dp2, chunk_dp2_size = get_encoding_chunk(
                    conn, chunk_info_dp2, encoding_size)
                entity_ids_dp2, chunk_dp2 = zip(*chunk_with_ids_dp2)
                chunk_info_dp2['encodings'] = chunk_dp2
                chunk_info_dp2['entity_ids'] = entity_ids_dp2
    log.debug('All encodings for package are fetched and deserialized')
    log.debug("Calculating filter similarities for work package")

    with new_child_span('comparing-encodings') as parent_scope:
        for chunk_dp1, chunk_dp2 in package:
            enc_dp1 = chunk_dp1['encodings']
            enc_dp1_size = len(enc_dp1)
            enc_dp2 = chunk_dp2['encodings']
            enc_dp2_size = len(enc_dp2)
            assert enc_dp1_size > 0, "Zero sized chunk in dp1"
            assert enc_dp2_size > 0, "Zero sized chunk in dp2"
            try:
                sims, (rec_is0, rec_is1
                       ) = anonlink.similarities.dice_coefficient_accelerated(
                           datasets=(enc_dp1, enc_dp2),
                           threshold=threshold,
                           k=min(enc_dp1_size, enc_dp2_size))
            except NotImplementedError as e:
                log.warning(
                    f"Encodings couldn't be compared using anonlink. {e}")
                return
            rec_is0 = reindex_using_encoding_ids(rec_is0,
                                                 chunk_dp1['entity_ids'])
            rec_is1 = reindex_using_encoding_ids(rec_is1,
                                                 chunk_dp2['entity_ids'])
            num_results += len(sims)
            num_comparisons += enc_dp1_size * enc_dp2_size
            sim_results.append(
                (sims, (rec_is0, rec_is1), chunk_dp1['datasetIndex'],
                 chunk_dp2['datasetIndex']))
        log.debug(
            f'Comparison done: {num_comparisons} comparisons produced {num_results} pairs above the threshold'
        )

    # Progress reporting
    log.debug('Encoding similarities calculated')

    with new_child_span('update-comparison-progress') as scope:
        # Update the number of comparisons completed
        save_current_progress(num_comparisons, run_id)
        scope.span.log_kv({
            'comparisons': num_comparisons,
            'num_similar': num_results
        })
        log.debug("Comparisons: {}, Links above threshold: {}".format(
            num_comparisons, num_results))


    # Write the results to a file in MinIO
    with new_child_span('save-comparison-results-to-minio'):

        file_iters = []
        file_sizes = []
        for sims, (rec_is0, rec_is1), dp1_ds_idx, dp2_ds_idx in sim_results:
            num_sims = len(sims)

            if num_sims:
                # Make index arrays for serialization
                index_1 = array.array('I', (dp1_ds_idx, )) * num_sims
                index_2 = array.array('I', (dp2_ds_idx, )) * num_sims
                chunk_results = sims, (index_1, index_2), (rec_is0, rec_is1),
                bytes_iter, file_size \
                    = anonlink.serialization.dump_candidate_pairs_iter(chunk_results)
                file_iters.append(iterable_to_stream(bytes_iter))
                file_sizes.append(file_size)

        if len(file_iters) > 1:
            # merge the per-chunk streams into a single ordered stream first
            merged_file_iter, merged_file_size \
                = anonlink.serialization.merge_streams_iter(file_iters, sizes=file_sizes)
            merged_file_iter = iterable_to_stream(merged_file_iter)
        elif len(file_iters) == 1:
            merged_file_iter = file_iters[0]
            merged_file_size = file_sizes[0]
        else:
            return 0, None, None

        result_filename = Config.SIMILARITY_SCORES_FILENAME_FMT.format(
            generate_code(12))
        task_span.log_kv({"edges": num_results})
        log.info("Writing {} intermediate results to file: {}".format(
            num_results, result_filename))

        mc = connect_to_object_store()
        try:
            mc.put_object(Config.MINIO_BUCKET, result_filename,
                          merged_file_iter, merged_file_size)
        except minio.ResponseError as err:
            log.warning("Failed to store result in minio: {}".format(err))
            raise

    return num_results, merged_file_size, result_filename
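
The reindexing step is easiest to see on a toy input: anonlink returns indices relative to the chunk, and reindex_using_encoding_ids maps them back to the stored encoding ids. A standalone illustration (the ids are made up):

import array

def reindex_using_encoding_ids(recordarray, encoding_id_list):
    # Map results from "index in chunk" back to the stored encoding ids (as above).
    return array.array('I', [encoding_id_list[i] for i in recordarray])

entity_ids = [1041, 1042, 1050]          # encoding ids stored for this chunk
rec_is = array.array('I', [0, 2, 2])     # chunk-local indices from anonlink
assert list(reindex_using_encoding_ids(rec_is, entity_ids)) == [1041, 1050, 1050]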
Example #9
    def test_insert_dp_no_project_fails(self):
        conn, cur = _get_conn_and_cursor()
        project_id = generate_code()
        dp_auth = generate_code()
        with raises(psycopg2.errors.ForeignKeyViolation):
            insert_dataprovider(cur, auth_token=dp_auth, project_id=project_id)
Example #10
def compute_filter_similarity(chunk_info,
                              project_id,
                              run_id,
                              threshold,
                              encoding_size,
                              parent_span=None):
    """Compute filter similarity between a chunk of filters in dataprovider 1,
    and a chunk of filters in dataprovider 2.

    :param chunk_info:
        Chunk info returned by ``anonlink.concurrency.split_to_chunks``.
        Additionally, "storeFilename" is added to each dataset chunk.
    :param project_id:
    :param threshold:
    :param encoding_size: The size in bytes of each encoded entry
    :param parent_span: A serialized opentracing span context.
    :return: A 3-tuple: (num_results, result file size in bytes, results_filename_in_object_store)
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Computing similarity for a chunk of filters")
    span = compute_filter_similarity.span
    log.debug(
        "Checking that the resource exists (in case of job being canceled)")
    with DBConn() as db:
        if not check_project_exists(db, project_id) or not check_run_exists(
                db, project_id, run_id):
            log.info("Failing task as project or run not found in database.")
            raise DBResourceMissing("project or run not found in database")

    chunk_info_dp1, chunk_info_dp2 = chunk_info

    t0 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 1")
    chunk_dp1, chunk_dp1_size = get_chunk_from_object_store(
        chunk_info_dp1, encoding_size)

    t1 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 2")
    chunk_dp2, chunk_dp2_size = get_chunk_from_object_store(
        chunk_info_dp2, encoding_size)
    t2 = time.time()
    span.log_kv({'event': 'chunks are fetched and deserialized'})
    log.debug("Calculating filter similarity")
    span.log_kv({'size1': chunk_dp1_size, 'size2': chunk_dp2_size})
    chunk_results = anonlink.concurrency.process_chunk(
        chunk_info, (chunk_dp1, chunk_dp2),
        anonlink.similarities.dice_coefficient_accelerated,
        threshold,
        k=min(chunk_dp1_size, chunk_dp2_size))
    t3 = time.time()
    span.log_kv({'event': 'similarities calculated'})

    # Update the number of comparisons completed
    comparisons_computed = chunk_dp1_size * chunk_dp2_size
    save_current_progress(comparisons_computed, run_id)

    t4 = time.time()

    sims, _, _ = chunk_results
    num_results = len(sims)

    if num_results:
        result_filename = Config.SIMILARITY_SCORES_FILENAME_FMT.format(
            generate_code(12))
        log.info("Writing {} intermediate results to file: {}".format(
            num_results, result_filename))

        bytes_iter, file_size \
            = anonlink.serialization.dump_candidate_pairs_iter(chunk_results)
        iter_stream = iterable_to_stream(bytes_iter)

        mc = connect_to_object_store()
        try:
            mc.put_object(Config.MINIO_BUCKET, result_filename, iter_stream,
                          file_size)
        except minio.ResponseError as err:
            log.warning("Failed to store result in minio")
            raise
    else:
        result_filename = None
        file_size = None
    t5 = time.time()

    log.info("run={} Comparisons: {}, Links above threshold: {}".format(
        run_id, comparisons_computed, len(chunk_results)))
    log.info(
        "Prep: {:.3f} + {:.3f}, Solve: {:.3f}, Progress: {:.3f}, Save: {:.3f}, Total: {:.3f}"
        .format(t1 - t0, t2 - t1, t3 - t2, t4 - t3, t5 - t4, t5 - t0))
    return num_results, file_size, result_filename
Example #11
def compute_filter_similarity(chunk_info,
                              project_id,
                              run_id,
                              threshold,
                              encoding_size,
                              parent_span=None):
    """Compute filter similarity between a chunk of filters in dataprovider 1,
    and a chunk of filters in dataprovider 2.

    :param dict chunk_info:
        A chunk returned by ``anonlink.concurrency.split_to_chunks``.
    :param project_id:
    :param run_id:
    :param threshold:
    :param encoding_size: The size in bytes of each encoded entry
    :param parent_span: A serialized opentracing span context.
    :return: A 3-tuple: (num_results, result size in bytes, results_filename_in_object_store)
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    task_span = compute_filter_similarity.span

    def new_child_span(name, parent_scope=None):
        if parent_scope is None:
            parent_scope = compute_filter_similarity
        return compute_filter_similarity.tracer.start_active_span(
            name, child_of=parent_scope.span)

    log.debug("Computing similarity for a chunk of filters")
    log.debug(
        "Checking that the resource exists (in case of run being canceled/deleted)"
    )
    assert_valid_run(project_id, run_id, log)

    chunk_info_dp1, chunk_info_dp2 = chunk_info

    with DBConn() as conn:
        with new_child_span('fetching-encodings') as parent_scope:
            with new_child_span('fetching-left-encodings', parent_scope):
                log.debug(
                    "Fetching and deserializing chunk of filters for dataprovider 1"
                )
                chunk_with_ids_dp1, chunk_dp1_size = get_encoding_chunk(
                    conn, chunk_info_dp1, encoding_size)
                entity_ids_dp1, chunk_dp1 = zip(*chunk_with_ids_dp1)

            with new_child_span('fetching-right-encodings', parent_scope):
                log.debug(
                    "Fetching and deserializing chunk of filters for dataprovider 2"
                )
                chunk_with_ids_dp2, chunk_dp2_size = get_encoding_chunk(
                    conn, chunk_info_dp2, encoding_size)
                entity_ids_dp2, chunk_dp2 = zip(*chunk_with_ids_dp2)

    log.debug('Both chunks are fetched and deserialized')
    task_span.log_kv({
        'size1': chunk_dp1_size,
        'size2': chunk_dp2_size,
        'chunk_info': chunk_info
    })

    assert chunk_dp1_size > 0, "Zero sized chunk in dp1"
    assert chunk_dp2_size > 0, "Zero sized chunk in dp2"

    with new_child_span('comparing-encodings') as parent_scope:

        log.debug("Calculating filter similarity")
        with new_child_span('dice-call', parent_scope):
            try:
                sims, (rec_is0, rec_is1
                       ) = anonlink.similarities.dice_coefficient_accelerated(
                           datasets=(chunk_dp1, chunk_dp2),
                           threshold=threshold,
                           k=min(chunk_dp1_size, chunk_dp2_size))
            except NotImplementedError as e:
                log.warning("Encodings couldn't be compared using anonlink.")
                return

        with new_child_span('reindex-call', parent_scope):

            def reindex_using_encoding_ids(recordarray, encoding_id_list):
                # Map results from "index in chunk" to encoding id.
                return array.array('I',
                                   [encoding_id_list[i] for i in recordarray])

            rec_is0 = reindex_using_encoding_ids(rec_is0, entity_ids_dp1)
            rec_is1 = reindex_using_encoding_ids(rec_is1, entity_ids_dp2)

    log.debug('Encoding similarities calculated')

    with new_child_span('update-comparison-progress') as scope:
        # Update the number of comparisons completed
        comparisons_computed = chunk_dp1_size * chunk_dp2_size
        save_current_progress(comparisons_computed, run_id)
        scope.span.log_kv({
            'comparisons': comparisons_computed,
            'num_similar': len(sims)
        })
        log.debug("Comparisons: {}, Links above threshold: {}".format(
            comparisons_computed, len(sims)))

    with new_child_span('save-comparison-results-to-minio'):
        num_results = len(sims)

        if num_results:
            result_filename = Config.SIMILARITY_SCORES_FILENAME_FMT.format(
                generate_code(12))
            task_span.log_kv({"edges": num_results})
            log.info("Writing {} intermediate results to file: {}".format(
                num_results, result_filename))

            # Make index arrays for serialization
            index_1 = array.array(
                'I', (chunk_info_dp1["datasetIndex"], )) * num_results
            index_2 = array.array(
                'I', (chunk_info_dp2["datasetIndex"], )) * num_results

            chunk_results = sims, (index_1, index_2), (rec_is0, rec_is1),

            bytes_iter, file_size \
                = anonlink.serialization.dump_candidate_pairs_iter(chunk_results)
            iter_stream = iterable_to_stream(bytes_iter)

            mc = connect_to_object_store()
            try:
                mc.put_object(Config.MINIO_BUCKET, result_filename,
                              iter_stream, file_size)
            except minio.ResponseError as err:
                log.warning("Failed to store result in minio")
                raise
        else:
            result_filename = None
            file_size = None

    return num_results, file_size, result_filename
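
The dataset-index arrays above are built by repeating a single index num_results times; multiplying an array.array by an integer does exactly that. A quick check of the idiom:

import array

num_results = 4
dataset_index = 3
index_1 = array.array('I', (dataset_index, )) * num_results
assert list(index_1) == [3, 3, 3, 3]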
Example #12
def compute_filter_similarity(chunk_info_dp1, chunk_info_dp2, project_id, run_id, threshold, encoding_size, parent_span=None):
    """Compute filter similarity between a chunk of filters in dataprovider 1,
    and a chunk of filters in dataprovider 2.

    :param chunk_info_dp1:
        A tuple containing:
            - object store filename
            - Chunk start index
            - Chunk stop index
    :param chunk_info_dp2:
        Same structure as ``chunk_info_dp1``, for dataprovider 2.
    :param project_id:
    :param threshold:
    :param encoding_size: The size in bytes of each encoded entry
    :param parent_span: A serialized opentracing span context.
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Computing similarity for a chunk of filters")
    span = compute_filter_similarity.span
    log.debug("Checking that the resource exists (in case of job being canceled)")
    with DBConn() as db:
        if not check_project_exists(db, project_id) or not check_run_exists(db, project_id, run_id):
            log.info("Stopping as project or run not found in database.")
            return None

    t0 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 1")
    chunk_dp1, chunk_dp1_size = get_chunk_from_object_store(chunk_info_dp1, encoding_size)

    t1 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 2")
    chunk_dp2, chunk_dp2_size = get_chunk_from_object_store(chunk_info_dp2, encoding_size)
    t2 = time.time()
    span.log_kv({'event': 'chunks are fetched and deserialized'})
    log.debug("Calculating filter similarity")
    span.log_kv({'size1': chunk_dp1_size, 'size2': chunk_dp2_size})
    chunk_results = anonlink.entitymatch.calculate_filter_similarity(chunk_dp1, chunk_dp2,
                                                                     threshold=threshold,
                                                                     k=min(chunk_dp1_size, chunk_dp2_size),
                                                                     use_python=False)
    t3 = time.time()
    span.log_kv({'event': 'similarities calculated'})

    # Update the number of comparisons completed
    comparisons_computed = chunk_dp1_size * chunk_dp2_size
    save_current_progress(comparisons_computed, run_id)

    t4 = time.time()

    partial_sparse_result = []
    # offset chunk's index
    offset_dp1 = chunk_info_dp1[1]
    offset_dp2 = chunk_info_dp2[1]

    log.debug("Offset DP1 by: {}, DP2 by: {}".format(offset_dp1, offset_dp2))
    for (ia, score, ib) in chunk_results:
        partial_sparse_result.append((ia + offset_dp1, ib + offset_dp2, score))

    t5 = time.time()

    num_results = len(partial_sparse_result)
    if num_results > 0:
        result_filename = 'chunk-res-{}.csv'.format(generate_code(12))
        log.info("Writing {} intermediate results to file: {}".format(num_results, result_filename))

        with open(result_filename, 'wt') as f:
            csvwriter = csv.writer(f)
            csvwriter.writerows(partial_sparse_result)

        # Now write these to the object store. and return the filename and summary
        # Will write a csv file for now
        mc = connect_to_object_store()
        try:
            mc.fput_object(Config.MINIO_BUCKET, result_filename, result_filename)
        except minio.ResponseError as err:
            log.warning("Failed to store result in minio")
            raise

        # If we don't delete the file we *do* run out of space
        os.remove(result_filename)
    else:
        result_filename = None
    t6 = time.time()

    log.info("run={} Comparisons: {}, Links above threshold: {}".format(run_id, comparisons_computed, len(chunk_results)))
    log.info("Prep: {:.3f} + {:.3f}, Solve: {:.3f}, Progress: {:.3f}, Offset: {:.3f}, Save: {:.3f}, Total: {:.3f}".format(
        t1 - t0,
        t2 - t1,
        t3 - t2,
        t4 - t3,
        t4 - t4,
        t6 - t5,
        t6 - t0)
    )
    return num_results, result_filename
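
The offset step simply shifts chunk-local indices to global positions using each chunk's start index (element [1] of the chunk info tuple). The same arithmetic in isolation, with illustrative values:

# chunk_info is (object store filename, chunk start index, chunk stop index),
# so element [1] is the chunk's offset within the full dataset.
offset_dp1, offset_dp2 = 1000, 500
chunk_results = [(0, 0.91, 3), (7, 0.82, 1)]   # (ia, score, ib) as returned above
partial_sparse_result = [(ia + offset_dp1, ib + offset_dp2, score)
                         for (ia, score, ib) in chunk_results]
assert partial_sparse_result == [(1000, 503, 0.91), (1007, 501, 0.82)]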
Example #13
def project_clks_post(project_id):
    """
    Update a project to provide encoded PII data.
    """

    headers = request.headers

    log, parent_span = bind_log_and_span(project_id)
    log.debug("Starting data upload request")
    token = precheck_upload_token(project_id, headers, parent_span)
    receipt_token = generate_code()
    with DBConn() as conn:
        dp_id = db.get_dataprovider_id(conn, token)
        project_encoding_size = db.get_project_schema_encoding_size(
            conn, project_id)
        upload_state_updated = db.is_dataprovider_allowed_to_upload_and_lock(
            conn, dp_id)
        # get flag use_blocking from table projects
        uses_blocking = get_project_column(conn, project_id, 'uses_blocking')

    if not upload_state_updated:
        return safe_fail_request(
            403, "This token has already been used to upload clks.")

    log = log.bind(dp_id=dp_id)
    log.info("Receiving CLK data.")

    with opentracing.tracer.start_span('upload-clk-data',
                                       child_of=parent_span) as span:
        span.set_tag("project_id", project_id)
        try:
            if headers['Content-Type'] == "application/json":
                span.set_tag("content-type", 'json')
                # TODO: Previously, we accessed the CLKs in a streaming fashion to avoid parsing the
                #       json in one hit, which let the web frontend run with less memory.
                #       However, connexion is very strict about json input validation and will always
                #       consume the stream first to validate it against the spec. Thus the backflip
                #       to fully reading the CLKs as json into memory. -> issue #184
                handle_encoding_upload_json(project_id,
                                            dp_id,
                                            get_json(),
                                            receipt_token,
                                            uses_blocking,
                                            parent_span=span)

                log.info("Job scheduled to handle users upload")
            elif headers['Content-Type'] == "application/octet-stream":
                span.set_tag("content-type", 'binary')
                log.info("Handling binary CLK upload")
                try:
                    count, size = check_binary_upload_headers(headers)
                    log.info(
                        f"Headers tell us to expect {count} encodings of {size} bytes"
                    )
                    span.log_kv({'count': count, 'size': size})
                except Exception:
                    log.warning(
                        "Upload failed due to problem with headers in binary upload"
                    )
                    raise
                # Check against project level encoding size (if it has been set)
                if project_encoding_size is not None and size != project_encoding_size:
                    # fail fast - we haven't stored the encoded data yet
                    return safe_fail_request(
                        400,
                        "Upload 'Hash-Size' doesn't match project settings")

                # TODO actually stream the upload data straight to Minio. Currently we can't because
                # connexion has already read the data before our handler is called!
                # https://github.com/zalando/connexion/issues/592
                # stream = get_stream()
                stream = BytesIO(request.data)
                expected_bytes = binary_format(size).size * count
                log.debug(
                    f"Stream size is {len(request.data)} B, and we expect {expected_bytes} B"
                )
                if len(request.data) != expected_bytes:
                    safe_fail_request(
                        400,
                        "Uploaded data did not match the expected size. Check request headers are correct"
                    )
                try:
                    upload_clk_data_binary(project_id, dp_id, stream,
                                           receipt_token, count, size)
                except ValueError:
                    safe_fail_request(
                        400,
                        "Uploaded data did not match the expected size. Check request headers are correct."
                    )
            else:
                safe_fail_request(400, "Content Type not supported")
        except ProblemException as e:
            # Have an exception that is safe for the user. We reset the upload state to
            # allow the user to try upload again.
            log.info(
                f"Problem occurred, returning status={e.status} - {e.detail}")
            with DBConn() as conn:
                db.set_dataprovider_upload_state(conn,
                                                 dp_id,
                                                 state='not_started')
            raise
        except Exception as e:
            log.warning("Unhandled error occurred during data upload")
            log.exception(e)
            with DBConn() as conn:
                db.set_dataprovider_upload_state(conn, dp_id, state='error')
            safe_fail_request(
                500, "Sorry, the server couldn't handle that request")

    with DBConn() as conn:
        db.set_dataprovider_upload_state(conn, dp_id, state='done')

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id):
        logger.info("All parties data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id,
                                        serialize_span(parent_span))

    return {'message': 'Updated', 'receipt_token': receipt_token}, 201
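
From the client side, the binary path expects an application/octet-stream body whose length matches the declared count and size headers. A hedged sketch using requests; the URL path, the Authorization usage, and the 'Hash-Count' header name are assumptions (only 'Hash-Size' is confirmed by the error message above):

import requests

count, size = 1000, 128
with open("encodings.bin", "rb") as f:   # pre-packed records in the service's binary layout
    payload = f.read()
# The server rejects the upload unless len(payload) matches binary_format(size).size * count.

resp = requests.post(
    "https://anonlink.example.org/api/v1/projects/<project_id>/clks",  # path assumed
    data=payload,
    headers={
        "Authorization": "<upload token>",      # assumed header usage
        "Content-Type": "application/octet-stream",
        "Hash-Count": str(count),               # assumed header name
        "Hash-Size": str(size),                 # name taken from the error message above
    },
)
print(resp.status_code, resp.json().get("receipt_token"))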
Example #14
def project_binaryclks_post(project_id):
    """
    Update a project to provide encoded PII data.
    """
    log, parent_span = bind_log_and_span(project_id)
    headers = request.headers
    token = precheck_upload_token(project_id, headers, parent_span)

    with DBConn() as conn:
        dp_id = db.get_dataprovider_id(conn, token)
        project_encoding_size = db.get_project_schema_encoding_size(
            conn, project_id)
        upload_state_updated = db.is_dataprovider_allowed_to_upload_and_lock(
            conn, dp_id)

    if not upload_state_updated:
        return safe_fail_request(
            403, "This token has already been used to upload clks.")

    log = log.bind(dp_id=dp_id)
    log.info("Receiving CLK data.")
    receipt_token = generate_code()

    with opentracing.tracer.start_span('upload-clk-data',
                                       child_of=parent_span) as span:
        span.set_tag("project_id", project_id)
        try:
            if headers['Content-Type'] == "application/octet-stream":
                span.set_tag("content-type", 'binary')
                log.info("Handling binary CLK upload")
                try:
                    count, size = check_binary_upload_headers(headers)
                    log.info(
                        f"Headers tell us to expect {count} encodings of {size} bytes"
                    )
                    span.log_kv({'count': count, 'size': size})
                except Exception:
                    log.warning(
                        "Upload failed due to problem with headers in binary upload"
                    )
                    raise
                # Check against project level encoding size (if it has been set)
                if project_encoding_size is not None and size != project_encoding_size:
                    # fail fast - we haven't stored the encoded data yet
                    return safe_fail_request(
                        400,
                        "Upload 'Hash-Size' doesn't match project settings")

                # TODO actually stream the upload data straight to Minio. Currently we can't because
                # connexion has already read the data before our handler is called!
                # https://github.com/zalando/connexion/issues/592
                # stream = get_stream()
                stream = BytesIO(request.data)

                converted_stream = include_encoding_id_in_binary_stream(
                    stream, size, count)

                expected_bytes = size * count
                log.debug(
                    f"Stream size is {len(request.data)} B, and we expect {expected_bytes} B"
                )
                if len(request.data) != expected_bytes:
                    safe_fail_request(
                        400,
                        "Uploaded data did not match the expected size. Check request headers are correct"
                    )
                try:
                    upload_clk_data_binary(project_id, dp_id, converted_stream,
                                           receipt_token, count, size)
                except ValueError:
                    safe_fail_request(
                        400,
                        "Uploaded data did not match the expected size. Check request headers are correct."
                    )
            else:
                safe_fail_request(400, "Content Type not supported")
        except Exception:
            log.warning(
                "The data provider was not able to upload their CLKs;"
                " the corresponding upload token will need to be re-enabled before it can be used again.")

            with DBConn() as conn:
                db.set_dataprovider_upload_state(conn, dp_id, state='error')
            raise
    with DBConn() as conn:
        db.set_dataprovider_upload_state(conn, dp_id, state='done')

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id):
        logger.info("All parties data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id,
                                        serialize_span(parent_span))

    return {'message': 'Updated', 'receipt_token': receipt_token}, 201
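
Unlike project_clks_post above, the raw body here is bare encodings and include_encoding_id_in_binary_stream injects the encoding ids before storage, so the length check is simply size * count. A quick check of that arithmetic:

count, size = 1000, 128
expected_bytes = size * count   # bare encodings only; ids are added server-side
assert expected_bytes == 128_000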