Example 1
def upload_clk_data_binary(project_id, dp_id, raw_stream, count, size=128):
    """
    Save the user provided raw CLK data.

    """
    receipt_token = generate_code()
    filename = Config.BIN_FILENAME_FMT.format(receipt_token)
    # Set the state to 'pending' in the bloomingdata table
    with DBConn() as conn:
        db.insert_encoding_metadata(conn, filename, dp_id, receipt_token,
                                    count)
        db.update_encoding_metadata_set_encoding_size(conn, dp_id, size)
    logger.info(
        f"Storing supplied binary clks of individual size {size} in file: {filename}"
    )

    # Each record is the encoding itself plus 6 bytes of per-record header,
    # so the total stream length is known up front.
    num_bytes = count * (size + 6)

    logger.debug(
        "Directly storing binary file with index, CLK, and popcount")

    # Upload to object store
    logger.info(
        f"Uploading {count} binary encodings to object store. Total size: {fmt_bytes(num_bytes)}"
    )
    parent_span = g.flask_tracer.get_span()

    with opentracing.tracer.start_span('save-to-minio',
                                       child_of=parent_span) as span:
        mc = connect_to_object_store()
        try:
            mc.put_object(Config.MINIO_BUCKET,
                          filename,
                          data=raw_stream,
                          length=num_bytes)
        except (minio.error.InvalidSizeError, minio.error.InvalidArgumentError,
                minio.error.ResponseError):
            logger.info(
                "Mismatch between expected stream length and header info")
            raise ValueError(
                "Mismatch between expected stream length and header info")

    with opentracing.tracer.start_span('update-database',
                                       child_of=parent_span) as span:
        with DBConn() as conn:
            db.update_encoding_metadata(conn, filename, dp_id, 'ready')
            db.set_dataprovider_upload_state(conn, dp_id, True)

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id):
        logger.info("All parties data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id,
                                        serialize_span(parent_span))

    return receipt_token
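
A minimal usage sketch for Example 1, assuming the raw bytes arrive in memory and
the caller already knows the encoding count and size (for instance from an upload
header). The handler name and its signature are illustrative assumptions, not part
of the service API.

from io import BytesIO

def handle_binary_upload(project_id, dp_id, body, count, size=128):
    # put_object expects a stream, so wrap the raw bytes in a file-like object.
    stream = BytesIO(body)
    return upload_clk_data_binary(project_id, dp_id, stream, count, size)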
Example 2
def upload_clk_data_binary(project_id,
                           dp_id,
                           encoding_iter,
                           receipt_token,
                           count,
                           size=128):
    """
    Save the user provided binary-packed CLK data.

    """
    filename = None
    # Set the state to 'pending' in the uploads table
    with DBConn() as conn:
        db.insert_encoding_metadata(conn,
                                    filename,
                                    dp_id,
                                    receipt_token,
                                    encoding_count=count,
                                    block_count=1)
        db.update_encoding_metadata_set_encoding_size(conn, dp_id, size)
    num_bytes = binary_format(size).size * count

    logger.debug(
        "Directly storing binary encodings in the database")

    # Upload to database
    logger.info(
        f"Uploading {count} binary encodings to database. Total size: {fmt_bytes(num_bytes)}"
    )
    parent_span = g.flask_tracer.get_span()

    with DBConn() as conn:
        db.update_encoding_metadata_set_encoding_size(conn, dp_id, size)

        with opentracing.tracer.start_span('create-default-block-in-db',
                                           child_of=parent_span):
            db.insert_blocking_metadata(conn, dp_id, {DEFAULT_BLOCK_ID: count})

        with opentracing.tracer.start_span('upload-encodings-to-db',
                                           child_of=parent_span):
            store_encodings_in_db(conn, dp_id, encoding_iter, size)

        with opentracing.tracer.start_span('update-encoding-metadata',
                                           child_of=parent_span):
            db.update_encoding_metadata(conn, filename, dp_id, 'ready')
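
Example 2 derives its length arithmetic from binary_format(size).size, while
Example 1 hardcodes count * (size + 6). The sketch below shows one record layout
consistent with that 6 bytes of per-record overhead; it is an illustrative
assumption, not the service's actual wire format.

import struct

def binary_format_sketch(encoding_size):
    # '!'    -> network byte order, no padding
    # 'I'    -> 4-byte unsigned entity index
    # 'H'    -> 2-byte unsigned popcount (enough for encodings up to ~8 KiB)
    # '{n}s' -> the packed encoding bytes themselves
    return struct.Struct(f'!IH{encoding_size}s')

assert binary_format_sketch(128).size == 128 + 6  # matches Example 1's arithmetic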
Example 3
def upload_json_clk_data(dp_id, clk_json, parent_span):
    """
    Convert user provided encodings from json array of base64 data into
    a newline separated file of base64 data.

    Note this implementation is non-streaming.
    """
    if 'clks' not in clk_json or len(clk_json['clks']) < 1:
        safe_fail_request(400, message="Missing CLKs information")

    receipt_token = generate_code()

    filename = Config.RAW_FILENAME_FMT.format(receipt_token)
    logger.info("Storing user {} supplied clks from json".format(dp_id))

    with opentracing.tracer.start_span('clk-splitting',
                                       child_of=parent_span) as span:
        count = len(clk_json['clks'])
        span.set_tag("clks", count)
        # Strip any embedded newlines from each base64 string so the output
        # contains exactly one encoding per line.
        data = b''.join(''.join(clk.split('\n')).encode() + b'\n'
                        for clk in clk_json['clks'])

        num_bytes = len(data)
        span.set_tag("num_bytes", num_bytes)
        buffer = BytesIO(data)

    logger.info(
        f"Received {count} encodings. Uploading {fmt_bytes(num_bytes)} to object store"
    )
    with opentracing.tracer.start_span('save-to-quarantine',
                                       child_of=parent_span) as span:
        span.set_tag('filename', filename)
        mc = connect_to_object_store()
        mc.put_object(Config.MINIO_BUCKET,
                      filename,
                      data=buffer,
                      length=num_bytes)

    with opentracing.tracer.start_span('update-db',
                                       child_of=parent_span) as span:
        with DBConn() as conn:
            db.insert_encoding_metadata(conn, filename, dp_id, receipt_token,
                                        count)

    return receipt_token, filename
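
The join expression in Example 3 is easiest to see on concrete data. The sketch
below replays it on two made-up base64 strings, one of which contains a stray
embedded newline.

clk_json = {'clks': ['UG9vcA==', 'QU\nJD']}  # toy input, second value split by '\n'
data = b''.join(''.join(clk.split('\n')).encode() + b'\n'
                for clk in clk_json['clks'])
assert data == b'UG9vcA==\nQUJD\n'  # one encoding per line, embedded newline removed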
Example 4
def handle_encoding_upload_json(project_id, dp_id, clk_json, receipt_token,
                                uses_blocking, parent_span):
    """
    Take user provided upload information - accepting multiple formats - and
    eventually ingest it into the database.

    Encodings uploaded directly in the JSON are first quarantined in the object store,
    and a background task deserializes them.

    Encodings that are in an object store are streamed directly into the database by
    a background task.
    """
    log = logger.bind(pid=project_id)
    log.info("Checking json is consistent")
    try:
        abort_if_inconsistent_upload(uses_blocking, clk_json)
    except ValueError as e:
        safe_fail_request(403, e.args[0])

    if "encodings" in clk_json and 'file' in clk_json['encodings']:
        # external encodings
        log.info("External encodings uploaded")
        encoding_object_info = clk_json['encodings']['file']
        object_name = encoding_object_info['path']
        _check_object_path_allowed(project_id, dp_id, object_name, log)

        encoding_credentials = clk_json['encodings'].get('credentials')
        # Schedule a background task to pull the encodings from the object store
        # This background task updates the database with encoding metadata assuming
        # that there are no blocks.
        if 'blocks' not in clk_json:
            log.info("scheduling task to pull encodings from object store")
            pull_external_data_encodings_only.delay(
                project_id,
                dp_id,
                encoding_object_info,
                encoding_credentials,
                receipt_token,
                parent_span=serialize_span(parent_span))
        else:
            # Need to deal with both encodings and blocks
            if 'file' in clk_json['blocks']:
                object_name = clk_json['blocks']['file']['path']
                _check_object_path_allowed(project_id, dp_id, object_name, log)
                # Blocks are in an external file
                blocks_object_info = clk_json['blocks']['file']
                blocks_credentials = clk_json['blocks'].get('credentials')
                log.info(
                    "scheduling task to pull both encodings and blocking data from object store"
                )
                pull_external_data.delay(
                    project_id,
                    dp_id,
                    encoding_object_info,
                    encoding_credentials,
                    blocks_object_info,
                    blocks_credentials,
                    receipt_token,
                    parent_span=serialize_span(parent_span))
            else:
                raise NotImplementedError(
                    "Don't currently handle combination of external encodings and blocks"
                )

        return

    # Convert uploaded JSON to common schema.
    #
    # The original JSON API simply accepted "clks", then came a combined encoding
    # and blocking API expecting the top level element "clknblocks". Finally came
    # an API that specifies both "encodings" and "blocks" independently at the
    # top level.
    #
    # We rewrite all into the "clknblocks" format.
    if "encodings" in clk_json:
        logger.debug(
            "converting from 'encodings' & 'blocks' format to 'clknblocks'")
        clk_json = convert_encoding_upload_to_clknblock(clk_json)

    is_valid_clks = not uses_blocking and 'clks' in clk_json
    element = 'clks' if is_valid_clks else 'clknblocks'

    if len(clk_json[element]) < 1:
        safe_fail_request(400, message="Missing CLKs information")

    filename = Config.RAW_FILENAME_FMT.format(receipt_token)
    logger.info("Storing user {} supplied {} from json".format(dp_id, element))

    with opentracing.tracer.start_span('splitting-json-clks',
                                       child_of=parent_span) as span:
        encoding_count = len(clk_json[element])
        span.set_tag(element, encoding_count)
        logger.debug(f"Received {encoding_count} {element}")

    if element == 'clks':
        logger.info("Rewriting provided json into clknsblocks format")
        clk_json = convert_clks_to_clknblocks(clk_json)
        element = 'clknblocks'

    logger.info("Counting block sizes and number of blocks")
    # {'clknblocks': [['UG9vcA==', '001', '211'], [...]]}
    block_sizes = {}
    for _, *blocks in clk_json[element]:
        for block_id in blocks:
            block_sizes[block_id] = block_sizes.get(block_id, 0) + 1
    block_count = len(block_sizes)

    logger.info(f"Received {encoding_count} encodings in {block_count} blocks")
    for block in block_sizes:
        logger.info(f"Block {block} has {block_sizes[block]} elements")

    # write clk_json into a temp file
    tmp = tempfile.NamedTemporaryFile(mode='w')
    json.dump(clk_json, tmp)
    tmp.flush()
    with opentracing.tracer.start_span('save-clk-file-to-quarantine',
                                       child_of=parent_span) as span:
        span.set_tag('filename', filename)
        mc = connect_to_object_store()
        mc.fput_object(Config.MINIO_BUCKET,
                       filename,
                       tmp.name,
                       content_type='application/json')
    logger.info('Saved uploaded {} JSON to file {} in object store.'.format(
        element.upper(), filename))

    with opentracing.tracer.start_span('update-encoding-metadata',
                                       child_of=parent_span):
        with DBConn() as conn:
            db.insert_encoding_metadata(conn, filename, dp_id, receipt_token,
                                        encoding_count, block_count)
            db.insert_blocking_metadata(conn, dp_id, block_sizes)

    # Schedule a task to deserialize the encodings
    handle_raw_upload.delay(project_id,
                            dp_id,
                            receipt_token,
                            parent_span=serialize_span(parent_span))
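
Example 4 rewrites plain 'clks' uploads into 'clknblocks' via
convert_clks_to_clknblocks before counting block sizes. That helper is not shown
here; the sketch below is one plausible implementation consistent with how the
result is consumed, assuming every encoding falls into a single default block.

def convert_clks_to_clknblocks_sketch(clk_json, default_block_id='1'):
    # Each row becomes [encoding, block_id, ...]; with no blocking information
    # every encoding is assigned to the one default block. The block id '1' is
    # an assumption standing in for DEFAULT_BLOCK_ID.
    return {'clknblocks': [[clk, default_block_id] for clk in clk_json['clks']]}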