Example #1
def test_from_type():
    pbmc = Template.from_type("pbmc")

    assert "Shipment" in pbmc.worksheets
    assert "Samples" in pbmc.worksheets
    assert "WES" in Template.from_type("wes_fastq").worksheets

    with pytest.raises(Exception, match="unknown template type"):
        Template.from_type("foo")
Example #2
def test_template_arbitrary_data_section():
    schema = {
        "properties": {
            "worksheets": {
                "worksheet_1": {
                    "prism_data_object_pointer":
                    "/prism_data_object_pointer/-",
                    "prism_arbitrary_data_section": "extra_annotations",
                    "prism_arbitrary_data_merge_pointer":
                    "/extra_annotations_sub_object",
                    "data_columns": {
                        "section_1": {
                            "data_field_1": {
                                "merge_pointer": "/data_field",
                                "type": "number",
                            }
                        }
                    },
                }
            }
        }
    }

    template = Template(schema, type="adhoc_arbitrary_data_test_template")

    # doesn't throw on an expected field
    changes, _ = template.process_field_value("worksheet_1", "data_field_1",
                                              "123", {}, {})

    assert len(changes) == 1
    assert changes[0].pointer == "/data_field"
    assert changes[0].value == 123.0

    # process_field_value DOESN'T throw a ParsingException
    # on arbitrary, non-predefined fields
    changes, _ = template.process_field_value("worksheet_1",
                                              "unexpected_property", 321, {},
                                              {})

    assert len(changes) == 1
    assert changes[
        0].pointer == "/extra_annotations_sub_object/unexpected_property"
    assert changes[0].value == 321

    # Check sanitization of different kinds of keys
    # TODO - figure out and add more
    changes, _ = template.process_field_value("worksheet_1",
                                              "unexpected '\"property", 321,
                                              {}, {})

    assert changes[
        0].pointer == "/extra_annotations_sub_object/unexpected '\"property"
Example #3
def test_worksheet_processing():
    """Ensure that worksheet schemas are processed as expected"""
    worksheet = {
        "preamble_rows": {
            # should be converted to lowercase
            "aAa": {}
        },
        "data_columns": {
            # shouldn't be converted to lowercase
            "One": {
                # should be converted to lowercase
                "BbB": {}
            }
        }
    }

    target = {
        "preamble_rows": {
            "aaa": {}
        },
        "data_columns": {
            "One": {
                "bbb": {}
            }
        }
    }

    assert Template._process_worksheet(worksheet) == target
Example #4
def tiny_template():
    """A small, valid """

    test_property = {'$id': 'success', 'type': 'string'}
    test_date = {'type': 'string', 'format': 'date'}
    test_time = {'type': 'string', 'format': 'time'}
    test_fields = {
        'test_property': test_property,
        'test_date': test_date,
        'test_time': test_time
    }

    tiny_template_schema = {
        '$id': 'tiny_template',
        'title': 'Tiny Manifest',
        'properties': {
            'worksheets': {
                'TEST_SHEET': {
                    'preamble_rows': test_fields,
                    'data_columns': {
                        'first table': test_fields,
                        'another table': test_fields
                    }
                },
            }
        }
    }

    return Template(tiny_template_schema)
Example #5
def run(ts_path: str, mif_path: str, he_path: str, outdir: str):
    """Run and profile a typical metadata validation and merging workload."""
    set_prism_encrypt_key("foobar")

    with profiling("1_prismify_tissue_slide_shipping_manifest", outdir):
        ts_template = Template.from_type("tissue_slide")
        ts_spreadsheet, _ = XlTemplateReader.from_excel(ts_path)
        ts_metadata, _, _ = prismify(ts_spreadsheet, ts_template)
        ts_metadata["allowed_cohort_names"] = ["Not_reported"]
        ts_metadata["allowed_collection_event_names"] = ["Baseline"]

    with profiling("2_prismify_mif_assay_metadata_spreadsheet", outdir):
        mif_template = Template.from_type("mif")
        mif_spreadsheet, _ = XlTemplateReader.from_excel(mif_path)
        mif_metadata, files, _ = prismify(mif_spreadsheet, mif_template)

    with profiling("3_merge_mif_assay_artifacts_into_mif_metadata_patch", outdir):
        # tqdm gives us a stdout progress indicator as prism iterates through the array
        artifact_info = tqdm(
            [
                ArtifactInfo(
                    f.upload_placeholder,
                    f"object/url/{f.upload_placeholder}",
                    "",
                    0,
                    "",
                    "abcd",
                )
                for f in files
            ]
        )
        mif_metadata, _ = merge_artifacts(mif_metadata, artifact_info)

    with profiling("4_merge_mif_metadata_with_tissue_slide_metadata", outdir):
        combined_metadata, _ = merge_clinical_trial_metadata(mif_metadata, ts_metadata)

    # Don't profile this a second time, since we're only interested
    # in how long it takes to merge the shipping manifest data into
    # existing trial metadata
    he_template = Template.from_type("h_and_e")
    he_spreadsheet, _ = XlTemplateReader.from_excel(he_path)
    he_metadata, _, _ = prismify(he_spreadsheet, he_template)

    with profiling("5_merge_h_and_e_metadata_into_trial", outdir):
        merge_clinical_trial_metadata(he_metadata, combined_metadata)
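
A hedged sketch of wrapping the workload above in a command-line entry point; the argument names mirror the function signature, and the default output directory is an assumption:

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Profile prism validation and merging")
    parser.add_argument("ts_path", help="path to a tissue_slide shipping manifest .xlsx")
    parser.add_argument("mif_path", help="path to a mif assay metadata .xlsx")
    parser.add_argument("he_path", help="path to an h_and_e assay metadata .xlsx")
    parser.add_argument("--outdir", default="profiling_results", help="directory for profiling output")
    args = parser.parse_args()

    run(args.ts_path, args.mif_path, args.he_path, args.outdir)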
Example #6
def template_set():
    """
    Get the path to every template schema in the schemas/templates directory
    and their corresponding xlsx example file. 
    """
    # Collect template xlsx examples
    for templ_type in _TEMPLATE_PATH_MAP:
        xlsx_path = os.path.join(TEMPLATE_EXAMPLES_DIR,
                                 f"{templ_type}_template.xlsx")
        templ = Template.from_type(templ_type)
        yield (templ, xlsx_path)
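
A minimal usage sketch, assuming pytest-style parametrization over the (template, xlsx_path) pairs this generator yields; the test name and body are illustrative only, and `pytest`/`os` are assumed to be imported as in the other snippets:

@pytest.mark.parametrize("template, xlsx_path", template_set())
def test_every_template_has_an_example(template, xlsx_path):
    # every supported template type should ship with an example spreadsheet
    assert os.path.exists(xlsx_path)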
Example #7
def tiny_template():
    """A small, valid """

    test_property = {
        "$id": "test_property",
        "type": "string",
        "merge_pointer": "test_property",
    }
    test_date = {
        "type": "string",
        "format": "date",
        "merge_pointer": "test_date"
    }
    test_time = {
        "type": "string",
        "format": "time",
        "merge_pointer": "test_time"
    }
    test_enum = {
        "type": "string",
        "enum": ["enum_val_1", "enum_val_2"],
        "merge_pointer": "test_enum",
    }
    test_number = {"type": "number", "merge_pointer": "test_number"}
    test_fields = {
        "test_property": test_property,
        "test_date": test_date,
        "test_time": test_time,
        "test_number": test_number,
        "test_enum": test_enum,
    }

    tiny_template_schema = {
        "$id": "tiny_template",
        "title": "Tiny Manifest",
        "properties": {
            "worksheets": {
                "TEST_SHEET": {
                    "preamble_rows": test_fields,
                    "data_columns": {
                        "first table": test_fields,
                        "another table": test_fields,
                    },
                }
            }
        },
    }

    return Template(tiny_template_schema, "test_tiny")
Example #8
def test_template(template, template_example, template_example_xlsx_path,
                  tmpdir):
    """
    Ensure the template schema generates a spreadsheet that looks like the given example,
    and check that the template example is valid.
    """

    # write template to a temporary file
    p = tmpdir.join("test_output.xlsx")
    template.to_excel(p)
    generated_template, err = XlTemplateReader.from_excel(p)
    assert not err

    reference_template = template_example

    # Check that both templates have the same fields
    compare_templates(template.type, generated_template, reference_template)

    # Validate the Excel template
    assert reference_template.validate(template)

    # Ensure the example Excel template isn't valid as any other template
    for other_template_type in _TEMPLATE_PATH_MAP:
        if other_template_type == template.type:
            # don't check it against itself
            continue
        elif (other_template_type.startswith("cytof_")
              and other_template_type.endswith("_analysis")
              and template.type.startswith("cytof_")
              and template.type.endswith("_analysis")):
            # cytof_<trial>_analysis templates might cross-validate, which is fine
            continue

        other_template = Template.from_type(other_template_type)
        with pytest.raises(ValidationError):
            other_template.validate_excel(template_example_xlsx_path)

    # Ensure that the data dictionary tab in this template doesn't have empty columns
    generated_xlsx = openpyxl.load_workbook(p)
    data_dict_ws = generated_xlsx[XlTemplateWriter._data_dict_sheet_name]
    for col in data_dict_ws.iter_cols(min_col=2,
                                      max_col=50,
                                      max_row=10,
                                      values_only=True):
        [header, *values] = col
        if header is None:
            break
        assert any(val is not None for val in values)
Example #9
def test_process_field_value():

    schema = {
        "properties": {
            "worksheets": {
                "worksheet_1": {
                    "prism_preamble_object_pointer":
                    "/prism_preamble_object_pointer/0",
                    "preamble_rows": {
                        "preamble_field_1": {
                            "merge_pointer": "/preamble_field",
                            "type": "string",
                        }
                    },
                    "prism_data_object_pointer":
                    "/prism_data_object_pointer/-",
                    "data_columns": {
                        "section_1": {
                            "data_field_1": {
                                "merge_pointer": "/data_field",
                                "type": "number",
                            }
                        }
                    },
                }
            }
        }
    }

    template = Template(schema, type="adhoc_test_template")

    # process_field_value throws a ParsingException on properties missing from the key lookup dict
    with pytest.raises(ParsingException, match="Unexpected property"):
        template.process_field_value("worksheet_1", "unexpected_prop", "123",
                                     {}, {})

    with pytest.raises(ParsingException, match="Unexpected worksheet"):
        template.process_field_value("unexpected_worksheet", "whatever", "123",
                                     {}, {})

    # doesn't throw on an expected field
    template.process_field_value("worksheet_1", "data_field_1", "123", {}, {})
Example #10
def extract_schema_and_xlsx(allowed_types: List[str]) -> Tuple[Template, BinaryIO]:
    """
    Validate that a request has the required structure, then extract
    the schema id and template file from the request. The request must
    have a multipart/form body with one field "schema" referencing a valid schema id
    and another field "template" with an attached .xlsx file.

    Raises:
        BadRequest: if the above requirements aren't satisfied

    Returns:
        Tuple[Template, BinaryIO]: template, and the open xlsx file
    """
    # If there is no form attribute on the request object,
    # then either one wasn't supplied, or it was malformed
    if not request.form:
        raise BadRequest(
            "Expected form content in request body, or failed to parse form content"
        )

    # If we have a form, check that the expected template file exists
    if "template" not in request.files:
        raise BadRequest("Expected a template file in request body")

    # Check that the template file appears to be a .xlsx file
    xlsx_file = request.files["template"]
    if xlsx_file.filename and not is_xlsx(xlsx_file.filename):
        raise BadRequest("Expected a .xlsx file")

    # Check that a schema id was provided and that a corresponding schema exists
    schema_id = request.form.get("schema")
    if not schema_id:
        raise BadRequest("Expected a form entry for 'schema'")
    schema_id = schema_id.lower()

    # Check that the schema type is allowed
    if schema_id not in allowed_types:
        raise BadRequest(
            f"Schema type '{schema_id}' is not supported for this endpoint. Available options: {allowed_types}"
        )

    template = Template.from_type(schema_id)

    return template, xlsx_file
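
For reference, a hedged client-side sketch of the request shape this helper expects; the endpoint URL, schema id, and file path are placeholders, and only the "schema" form field and the "template" file part are implied by the code above:

import requests

with open("pbmc_template.xlsx", "rb") as xlsx:
    resp = requests.post(
        "https://api.example.com/ingestion/validate",  # hypothetical endpoint
        data={"schema": "pbmc"},  # schema id; lower-cased server-side
        files={"template": ("pbmc_template.xlsx", xlsx)},
    )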
Example #11
def test_template(schema_path, xlsx_path, tmpdir):
    """
    Ensure the template schema generates a spreadsheet that looks like the given example,
    and check that the template example is valid.
    """

    # Load the template and write it to a temporary file
    template = Template.from_json(schema_path, SCHEMA_DIR)
    p = tmpdir.join('test_output.xlsx')
    template.to_excel(p)
    generated_template = XlTemplateReader.from_excel(p)

    # Ensure the xlsx file actually exists
    assert os.path.exists(
        xlsx_path), f'No example Excel template provided for {schema_path}'
    reference_template = XlTemplateReader.from_excel(xlsx_path)

    # Check that both templates have the same fields
    compare_templates(schema_path, generated_template, reference_template)

    # Validate the Excel template
    assert reference_template.validate(template)
Example #12
def stage_assay_for_analysis(template_type):
    """
    Simulates an initial assay upload by prismifying the initial assay template object.
    """

    staging_map = {
        "cytof_analysis": "cytof",
        "tumor_normal_pairing": "wes_fastq",
    }

    if template_type not in staging_map:
        return {}

    prelim_assay = staging_map[template_type]

    preassay_xlsx_path = os.path.join(
        TEMPLATE_EXAMPLES_DIR, prelim_assay + "_template.xlsx"
    )
    preassay_xlsx, _ = XlTemplateReader.from_excel(preassay_xlsx_path)
    preassay_template = Template.from_type(prelim_assay)
    prism_res = core.prismify(preassay_xlsx, preassay_template)

    return prism_patch_stage_artifacts(prism_res, prelim_assay)
Example #13
def template(request):
    return Template.from_type(request.param)
Example #14
def test_template_schema_checks():
    schema = {
        "properties": {
            "worksheets": {
                "worksheet_1": {
                    "prism_preamble_object_pointer":
                    "/prism_preamble_object_pointer/0",
                    "preamble_rows": {
                        "preamble_field_1": {
                            "gcs_uri_format": "should not be here"
                        }
                    },
                    "prism_data_object_pointer":
                    "/prism_data_object_pointer/-",
                    "data_columns": {
                        "section_1": {
                            "data_field_1": {
                                "merge_pointer": "/data_field",
                                "type": "number",
                                "is_artifact": True,
                            }
                        }
                    },
                }
            }
        }
    }

    with pytest.raises(
            Exception,
            match=
            "Error in template 'adhoc_test_template'/'worksheet_1': Couldn't load mapping for 'preamble_field_1': Either \"type\".*should be present",
    ):
        template = Template(schema, type="adhoc_test_template")

    schema["properties"]["worksheets"]["worksheet_1"]["preamble_rows"][
        "preamble_field_1"]["type"] = "string"

    with pytest.raises(Exception,
                       match=r"missing.*required.*argument.*merge_pointer"):
        template = Template(schema, type="adhoc_test_template")

    schema["properties"]["worksheets"]["worksheet_1"]["preamble_rows"][
        "preamble_field_1"]["merge_pointer"] = "preamble_field"

    with pytest.raises(Exception,
                       match="gcs_uri_format defined for not is_artifact"):
        template = Template(schema, type="adhoc_test_template")

    del schema["properties"]["worksheets"]["worksheet_1"]["preamble_rows"][
        "preamble_field_1"]["gcs_uri_format"]

    with pytest.raises(Exception, match="Empty gcs_uri_format"):
        template = Template(schema, type="adhoc_test_template")

    schema["properties"]["worksheets"]["worksheet_1"]["data_columns"][
        "section_1"]["data_field_1"]["gcs_uri_format"] = 123

    with pytest.raises(Exception,
                       match=r"Bad gcs_uri_format.*should be dict or str"):
        template = Template(schema, type="adhoc_test_template")

    schema["properties"]["worksheets"]["worksheet_1"]["data_columns"][
        "section_1"]["data_field_1"]["gcs_uri_format"] = {
            "check_errors": "something"
        }

    with pytest.raises(Exception,
                       match="dict type gcs_uri_format should have 'format'"):
        template = Template(schema, type="adhoc_test_template")

    schema["properties"]["worksheets"]["worksheet_1"]["data_columns"][
        "section_1"]["data_field_1"]["gcs_uri_format"] = {
            "check_errors": "something",
            "format": "/some/{thing}"
        }

    template = Template(schema, type="adhoc_test_template")
Example #15
def pbmc_template(pbmc_schema_path):
    return Template.from_json(pbmc_schema_path, SCHEMA_DIR)
Example #16
def prismify(
    xlsx: XlTemplateReader,
    template: Template,
    schema_root: str = SCHEMA_DIR,
    debug: bool = False,
) -> (dict, List[LocalFileUploadEntry], List[Union[Exception, str]]):
    """
    Converts an Excel file to a JSON object. It also identifies local files
    which need to be uploaded to a Google bucket and provides some logic
    to help build the bucket URL.
    e.g. file list
    [
        {
            'local_path': '/path/to/fwd.fastq',
            'gs_key': '10021/CTTTPPPSS/wes_forward.fastq'
        }
    ]
    Args:
        xlsx: cidc_schemas.template_reader.XlTemplateReader instance
        template: cidc_schemas.template.Template instance
        schema_root: path to the target JSON schema, defaulting to CIDC schemas root
    Returns:
        (tuple):
            arg1: clinical trial object with data parsed from spreadsheet
            arg2: list of `LocalFileUploadEntry`s that describe each file identified:
                LocalFileUploadEntry(
                    local_path = "/local/path/to/a/data/file/parsed/from/template",
                    gs_key = "constructed/relative/to/clinical/trial/GCS/path",
                    upload_placeholder = "random_uuid-for-artifact-upload",
                    metadata_availability = boolean to indicate whether LocalFileUploadEntry should be extracted for metadata files
                )
            arg3: list of errors
    Process:
    * checks out `prism_preamble_object_pointer` which is a "standard"/absolute
    rfc6901 json-pointer from CT root object to a new assay location.
    E.g. for WES it is `/assays/wes/0`, in DeepDiff terms `ct["assays"]["wes"][0]`
    * creates such "parent/preamble" object.
    E.g. for WES an object that corresponds to a wes_assay will be created:
        {
          "assays": {
            "wes": [
              {
                ...    # we're here - this is "preamble" obj = "assay" obj
              }
            ]
          }
        }
    * then processes all "preamble_rows" properties from "..._template.json"
    to fill the object's properties. It uses "merge_pointer"s relative to this
    "parent/preamble" object to determine the exact location where a value is set.
    In most cases it's just "0/field_name", where "0" denotes that "field_name"
    is a field in the current object.
    There are exceptions like "3/protocol_identifier", which basically says
    "go 3 levels up in the hierarchy and take the protocol_identifier field of the root".
    E.g. WES:
        {
          "protocol_identifier": "4412" # from `3/protocol_identifier`
          "assays": {
            "wes": [
              {
                "assay_creator": "DFCI" # from `0/assay_creator`
              }
            ]
          }
        }
    * then it goes in a loop over all "record" rows in .xlsx, and creates
    an object within that "parent" object for each row. These "record-objects"
    are created at "prism_data_object_pointer" location relative to "preamble".

    E.g. for WES: `"prism_data_object_pointer" : "/records/-"`
        {
          "assays": {
            "wes": [
              {
                "assay_creator": "DFCI",
                "records": [
                  {
                    ...    # we're here - this is "record" obj = "assay entry" obj
                  }
                ]
              }
            ]
          }
        }
    NB: the minus sign at the end of "/records/-" is special relative-json-pointer
    notation that means a new object should be appended to the 'records' array.
    It's as if python's `l.append(v)` were spelled `l[-] = v`.
    * Prism now uses those "merge_pointer" relative to this "record" object,
    to populate field values of a "record" in the same way as with "preamble".
    E.g. for WES: `"prism_data_object_pointer" : "/records/-"`
        {
          "assays": {
            "wes": [
              {
                "assay_creator": "DFCI",
                "records": [
                  {
                    "cimac_id": ...                 # from "0/cimac_id",
                    "enrichment_vendor_lot": ...    # from "0/enrichment_vendor_lot",
                    "capture_date": ...             # from "0/capture_date",
                  }
                ]
              }
            ]
          }
        }
    * Finally, as many "records" objects were created/populated,
    Prism uses `prism_preamble_object_schema` to merge them all together,
    respecting the `mergeStrategy`es defined in that schema.
    """

    _check_encrypt_init()

    if template.type not in SUPPORTED_TEMPLATES:
        raise NotImplementedError(
            f"{template.type!r} is not supported, only {SUPPORTED_TEMPLATES} are."
        )

    errors_so_far = []

    # get the root CT schema
    root_ct_schema_name = (
        template.schema.get("prism_template_root_object_schema")
        or "clinical_trial.json"
    )
    root_ct_schema = load_and_validate_schema(root_ct_schema_name, schema_root)
    # create the result CT dictionary
    root_ct_obj = {}
    template_root_obj_pointer = template.schema.get(
        "prism_template_root_object_pointer", ""
    )
    if template_root_obj_pointer != "":
        template_root_obj = {}
        _set_val(template_root_obj_pointer, template_root_obj, root_ct_obj)
    else:
        template_root_obj = root_ct_obj

    # and merger for it
    root_ct_merger = Merger(root_ct_schema, strategies=PRISM_MERGE_STRATEGIES)
    # and where to collect all local file refs
    collected_files = []

    # loop over spreadsheet worksheets
    for ws_name, ws in xlsx.grouped_rows.items():
        logger.debug(f"next worksheet {ws_name!r}")

        # Here we take only the first two cells of each preamble row as key and value respectively,
        # lowercasing keys to match template schema definitions.
        preamble_context = dict(
            (r.values[0].lower(), r.values[1]) for r in ws.get(RowType.PREAMBLE, [])
        )
        # We need this full "preamble dict" (all key-value pairs) prior to processing
        # properties from data_columns or preamble wrt template schema definitions, because
        # there can be a 'gcs_uri_format' that needs to have access to all values.

        templ_ws = template.schema["properties"]["worksheets"].get(ws_name)
        if not templ_ws:
            if ws_name in template.ignored_worksheets:
                continue

            errors_so_far.append(f"Unexpected worksheet {ws_name!r}.")
            continue

        preamble_object_schema = load_and_validate_schema(
            templ_ws.get("prism_preamble_object_schema", root_ct_schema_name),
            schema_root,
        )
        preamble_merger = Merger(
            preamble_object_schema, strategies=PRISM_MERGE_STRATEGIES
        )
        preamble_object_pointer = templ_ws.get("prism_preamble_object_pointer", "")
        data_object_pointer = templ_ws["prism_data_object_pointer"]

        # creating preamble obj
        preamble_obj = {}

        # Processing data rows first
        data = ws[RowType.DATA]
        if data:
            # get the data
            headers = ws[RowType.HEADER][0]

            for row in data:

                logging.debug(f"  next data row {row!r}")

                # creating data obj
                data_obj = {}
                copy_of_preamble = {}
                _set_val(
                    data_object_pointer,
                    data_obj,
                    copy_of_preamble,
                    template_root_obj,
                    preamble_object_pointer,
                )

                # We create this "data record dict" (all key-value pairs) prior to processing
                # properties from data_columns wrt template schema definitions, because
                # there can be a 'gcs_uri_format' that needs to have access to all values.
                local_context = dict(
                    zip([h.lower() for h in headers.values], row.values)
                )

                # create dictionary per row
                for key, val in zip(headers.values, row.values):

                    combined_context = dict(local_context, **preamble_context)
                    try:
                        changes, new_files = template.process_field_value(
                            ws_name, key, val, combined_context, _encrypt
                        )
                    except ParsingException as e:
                        errors_so_far.append(e)
                    else:
                        _apply_changes(
                            changes, data_obj, copy_of_preamble, data_object_pointer
                        )
                        collected_files.extend(new_files)

                try:
                    preamble_obj = preamble_merger.merge(preamble_obj, copy_of_preamble)
                except MergeCollisionException as e:
                    # Reformatting the exception, because this mismatch happened within one template
                    # and not against previously saved metadata.
                    wrapped = e.with_context(row=row.row_num, worksheet=ws_name)
                    errors_so_far.append(wrapped)
                    logger.info(f"MergeCollisionException: {wrapped}")

        # Now processing preamble rows
        logger.debug(f"  preamble for {ws_name!r}")
        for row in ws[RowType.PREAMBLE]:
            k, v, *_ = row.values
            try:
                changes, new_files = template.process_field_value(
                    ws_name, k, v, preamble_context, _encrypt
                )
            except ParsingException as e:
                errors_so_far.append(e)
            else:
                # TODO we might want to use copy+preamble_merger here too,
                # for complex properties that require a mergeStrategy
                _apply_changes(
                    changes,
                    preamble_obj,
                    root_ct_obj,
                    template_root_obj_pointer + preamble_object_pointer,
                )
                collected_files.extend(new_files)

        # Now pushing it up / merging with the whole thing
        copy_of_templ_root = {}
        _set_val(preamble_object_pointer, preamble_obj, copy_of_templ_root)
        logger.debug("merging root objs")
        logger.debug(f" {template_root_obj}")
        logger.debug(f" {copy_of_templ_root}")
        template_root_obj = root_ct_merger.merge(template_root_obj, copy_of_templ_root)
        logger.debug(f"  merged - {template_root_obj}")

    if template_root_obj_pointer != "":
        _set_val(template_root_obj_pointer, template_root_obj, root_ct_obj)
    else:
        root_ct_obj = template_root_obj

    return root_ct_obj, collected_files, errors_so_far
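
A minimal sketch of calling prismify directly, using the reader and template classes named in the docstring above; the "pbmc" template type and spreadsheet path are placeholders, and an encryption key must be set first (see set_prism_encrypt_key in the profiling example earlier):

from cidc_schemas.template import Template
from cidc_schemas.template_reader import XlTemplateReader

template = Template.from_type("pbmc")  # placeholder template type
spreadsheet, reader_errors = XlTemplateReader.from_excel("pbmc_template.xlsx")  # placeholder path
assert not reader_errors

metadata_patch, local_files, errors = prismify(spreadsheet, template)
assert not errors
# metadata_patch is the clinical trial JSON patch; local_files lists the
# LocalFileUploadEntry objects for artifacts referenced in the spreadsheet.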
Example #17
def check_validation_error(schema, msg):
    with pytest.raises(AssertionError) as e:
        Template._validate_worksheet("", schema)
    assert msg in str(e.value)
Example #18
def pbmc_template():
    pbmc_template_path = os.path.join(SCHEMA_DIR, 'templates',
                                      'pbmc_template.json')
    return Template.from_json(pbmc_template_path, SCHEMA_DIR)