def test_from_type():
    """Template.from_type builds known template types and rejects unknown ones."""
    pbmc = Template.from_type("pbmc")
    for expected_sheet in ("Shipment", "Samples"):
        assert expected_sheet in pbmc.worksheets

    wes = Template.from_type("wes_fastq")
    assert "WES" in wes.worksheets

    # An unrecognized type should raise with a descriptive message
    with pytest.raises(Exception, match="unknown template type"):
        Template.from_type("foo")
def test_template_arbitrary_data_section():
    """Fields absent from the schema land under the arbitrary-data merge pointer."""
    worksheet = {
        "prism_data_object_pointer": "/prism_data_object_pointer/-",
        "prism_arbitrary_data_section": "extra_annotations",
        "prism_arbitrary_data_merge_pointer": "/extra_annotations_sub_object",
        "data_columns": {
            "section_1": {
                "data_field_1": {"merge_pointer": "/data_field", "type": "number"}
            }
        },
    }
    schema = {"properties": {"worksheets": {"worksheet_1": worksheet}}}
    template = Template(schema, type="adhoc_arbitrary_data_test_template")

    # not throwing on expected
    changes, _ = template.process_field_value(
        "worksheet_1", "data_field_1", "123", {}, {}
    )
    assert len(changes) == 1
    assert changes[0].pointer == "/data_field"
    assert changes[0].value == 123.0

    # process_field_value DOESN'T throw a ParsingException
    # on arbitrary, not predefined fields
    changes, _ = template.process_field_value(
        "worksheet_1", "unexpected_property", 321, {}, {}
    )
    assert len(changes) == 1
    assert changes[0].pointer == "/extra_annotations_sub_object/unexpected_property"
    assert changes[0].value == 321

    # Checking different keys sanitization
    # TODO - figure out and add more
    changes, _ = template.process_field_value(
        "worksheet_1", "unexpected '\"property", 321, {}, {}
    )
    assert changes[0].pointer == "/extra_annotations_sub_object/unexpected '\"property"
def test_worksheet_processing():
    """Ensure that worksheet schemas are processed as expected"""
    raw_worksheet = {
        # preamble row keys should be converted to lowercase
        "preamble_rows": {"aAa": {}},
        "data_columns": {
            # section names shouldn't be converted to lowercase,
            # but the column keys inside them should be
            "One": {"BbB": {}}
        },
    }
    expected = {
        "preamble_rows": {"aaa": {}},
        "data_columns": {"One": {"bbb": {}}},
    }
    assert Template._process_worksheet(raw_worksheet) == expected
def tiny_template():
    """A small, valid template schema with string, date, and time fields
    repeated across a preamble section and two data tables.

    Returns:
        Template: built from the in-memory schema.
    """
    test_property = {'$id': 'success', 'type': 'string'}
    test_date = {'type': 'string', 'format': 'date'}
    test_time = {'type': 'string', 'format': 'time'}
    test_fields = {
        'test_property': test_property,
        'test_date': test_date,
        'test_time': test_time
    }
    tiny_template_schema = {
        '$id': 'tiny_template',
        'title': 'Tiny Manifest',
        'properties': {
            'worksheets': {
                'TEST_SHEET': {
                    'preamble_rows': test_fields,
                    'data_columns': {
                        'first table': test_fields,
                        'another table': test_fields
                    }
                },
            }
        }
    }
    return Template(tiny_template_schema)
def run(ts_path: str, mif_path: str, he_path: str, outdir: str):
    """Run and profile a typical metadata validation and merging workload.

    Args:
        ts_path: path to a tissue_slide shipping manifest .xlsx file
        mif_path: path to a mIF assay metadata .xlsx file
        he_path: path to an H&E metadata .xlsx file
        outdir: directory where profiling output is written
    """
    set_prism_encrypt_key("foobar")

    with profiling("1_prismify_tissue_slide_shipping_manifest", outdir):
        ts_template = Template.from_type("tissue_slide")
        ts_spreadsheet, _ = XlTemplateReader.from_excel(ts_path)
        ts_metadata, _, _ = prismify(ts_spreadsheet, ts_template)
        # The trial must allow the cohort/collection-event names used by
        # the assay spreadsheets for the later merges to validate.
        ts_metadata["allowed_cohort_names"] = ["Not_reported"]
        ts_metadata["allowed_collection_event_names"] = ["Baseline"]

    with profiling("2_prismify_mif_assay_metadata_spreadsheet", outdir):
        mif_template = Template.from_type("mif")
        mif_spreadsheet, _ = XlTemplateReader.from_excel(mif_path)
        mif_metadata, files, _ = prismify(mif_spreadsheet, mif_template)

    with profiling("3_merge_mif_assay_artifacts_into_mif_metadata_patch", outdir):
        # tqdm gives us a stdout progress indicator as prism iterates through the array
        artifact_info = tqdm(
            [
                ArtifactInfo(
                    f.upload_placeholder,
                    f"object/url/{f.upload_placeholder}",
                    "",
                    0,
                    "",
                    "abcd",
                )
                # NOTE: the index was unused, so plain iteration replaces enumerate
                for f in files
            ]
        )
        mif_metadata, _ = merge_artifacts(mif_metadata, artifact_info)

    with profiling("4_merge_mif_metadata_with_tissue_slide_metadata", outdir):
        combined_metadata, _ = merge_clinical_trial_metadata(mif_metadata, ts_metadata)

    # Don't profile this a second time, since we're only interested
    # in how long it takes to merge the shipping manifest data into
    # existing trial metadata
    he_template = Template.from_type("h_and_e")
    he_spreadsheet, _ = XlTemplateReader.from_excel(he_path)
    he_metadata, _, _ = prismify(he_spreadsheet, he_template)

    with profiling("5_merge_h_and_e_metadata_into_trial", outdir):
        merge_clinical_trial_metadata(he_metadata, combined_metadata)
def template_set():
    """
    Get the path to every template schema in the schemas/templates directory
    and their corresponding xlsx example file.

    Yields:
        (Template, str) pairs of a built template and its example xlsx path.
    """
    for templ_type in _TEMPLATE_PATH_MAP:
        example_path = os.path.join(
            TEMPLATE_EXAMPLES_DIR, f"{templ_type}_template.xlsx"
        )
        yield (Template.from_type(templ_type), example_path)
def tiny_template():
    """A small, valid template schema with string, date, time, number, and
    enum fields repeated across a preamble section and two data tables.

    Returns:
        Template: built from the in-memory schema with type "test_tiny".
    """
    test_property = {
        "$id": "test_property",
        "type": "string",
        "merge_pointer": "test_property",
    }
    test_date = {
        "type": "string",
        "format": "date",
        "merge_pointer": "test_date"
    }
    test_time = {
        "type": "string",
        "format": "time",
        "merge_pointer": "test_time"
    }
    test_enum = {
        "type": "string",
        "enum": ["enum_val_1", "enum_val_2"],
        "merge_pointer": "test_enum",
    }
    test_number = {"type": "number", "merge_pointer": "test_number"}
    test_fields = {
        "test_property": test_property,
        "test_date": test_date,
        "test_time": test_time,
        "test_number": test_number,
        "test_enum": test_enum,
    }
    tiny_template_schema = {
        "$id": "tiny_template",
        "title": "Tiny Manifest",
        "properties": {
            "worksheets": {
                "TEST_SHEET": {
                    "preamble_rows": test_fields,
                    "data_columns": {
                        "first table": test_fields,
                        "another table": test_fields,
                    },
                }
            }
        },
    }
    return Template(tiny_template_schema, "test_tiny")
def test_template(template, template_example, template_example_xlsx_path, tmpdir):
    """
    Ensure the template schema generates a spreadsheet that looks like the
    given example, and check that the template example is valid.
    """

    def is_cytof_analysis(template_type):
        # cytof_<trial>_analysis templates may cross-validate with each other
        return template_type.startswith("cytof_") and template_type.endswith("_analysis")

    # write template to a temporary file
    out_path = tmpdir.join("test_output.xlsx")
    template.to_excel(out_path)
    generated_template, err = XlTemplateReader.from_excel(out_path)
    assert not err

    reference_template = template_example

    # Check that both templates have the same fields
    compare_templates(template.type, generated_template, reference_template)

    # Validate the Excel template
    assert reference_template.validate(template)

    # Ensure the example Excel template isn't valid as any other template
    for other_template_type in _TEMPLATE_PATH_MAP:
        if other_template_type == template.type:
            # don't check it against itself
            continue
        if is_cytof_analysis(other_template_type) and is_cytof_analysis(template.type):
            # cytof_<trial>_analysis might cross validate which is fine
            continue
        other_template = Template.from_type(other_template_type)
        with pytest.raises(ValidationError):
            other_template.validate_excel(template_example_xlsx_path)

    # Ensure that the data dictionary tab in this template doesn't have empty columns
    generated_xlsx = openpyxl.load_workbook(out_path)
    data_dict_ws = generated_xlsx[XlTemplateWriter._data_dict_sheet_name]
    for col in data_dict_ws.iter_cols(
        min_col=2, max_col=50, max_row=10, values_only=True
    ):
        header, *values = col
        if header is None:
            break
        assert any(val is not None for val in values)
def test_process_field_value():
    """process_field_value accepts known fields and raises on unknown ones."""
    preamble_rows = {
        "preamble_field_1": {"merge_pointer": "/preamble_field", "type": "string"}
    }
    data_columns = {
        "section_1": {
            "data_field_1": {"merge_pointer": "/data_field", "type": "number"}
        }
    }
    schema = {
        "properties": {
            "worksheets": {
                "worksheet_1": {
                    "prism_preamble_object_pointer": "/prism_preamble_object_pointer/0",
                    "preamble_rows": preamble_rows,
                    "prism_data_object_pointer": "/prism_data_object_pointer/-",
                    "data_columns": data_columns,
                }
            }
        }
    }
    template = Template(schema, type="adhoc_test_template")

    # process_field_value throws a ParsingException on properties missing
    # from the key lookup dict
    with pytest.raises(ParsingException, match="Unexpected property"):
        template.process_field_value("worksheet_1", "unexpected_prop", "123", {}, {})

    with pytest.raises(ParsingException, match="Unexpected worksheet"):
        template.process_field_value("unexpected_worksheet", "whatever", "123", {}, {})

    # not throwing on expected
    template.process_field_value("worksheet_1", "data_field_1", "123", {}, {})
def extract_schema_and_xlsx(allowed_types: List[str]) -> Tuple[str, BinaryIO]:
    """
    Validate that a request has the required structure, then extract
    the schema id and template file from the request. The request must
    have a multipart/form body with one field "schema" referencing a valid
    schema id and another field "template" with an attached .xlsx file.

    Raises:
        BadRequest: if the above requirements aren't satisfied

    Returns:
        Tuple[Template, BinaryIO]: template, and the open xlsx file
    """
    # No form at all: either one wasn't supplied, or it was malformed.
    if not request.form:
        raise BadRequest(
            "Expected form content in request body, or failed to parse form content"
        )

    # The form must carry a template file...
    if "template" not in request.files:
        raise BadRequest("Expected a template file in request body")

    # ...and, if named, it must look like a .xlsx file.
    template_file = request.files["template"]
    if template_file.filename and not is_xlsx(template_file.filename):
        raise BadRequest("Expected a .xlsx file")

    # A schema id must be provided...
    schema_id = request.form.get("schema")
    if not schema_id:
        raise BadRequest("Expected a form entry for 'schema'")
    schema_id = schema_id.lower()

    # ...and it must be one of the types this endpoint supports.
    if schema_id not in allowed_types:
        raise BadRequest(
            f"Schema type '{schema_id}' is not supported for this endpoint. Available options: {allowed_types}"
        )

    return Template.from_type(schema_id), template_file
def test_template(schema_path, xlsx_path, tmpdir):
    """
    Ensure the template schema generates a spreadsheet that looks like the
    given example, and check that the template example is valid.
    """
    # Load the template and write it to a temporary file
    template = Template.from_json(schema_path, SCHEMA_DIR)
    out_file = tmpdir.join('test_output.xlsx')
    template.to_excel(out_file)
    # NOTE(review): from_excel is treated here as returning the reader
    # directly (no tuple unpack) — confirm against the reader API in use.
    generated_template = XlTemplateReader.from_excel(out_file)

    # Ensure the xlsx file actually exists
    assert os.path.exists(
        xlsx_path), f'No example Excel template provided for {schema_path}'
    reference_template = XlTemplateReader.from_excel(xlsx_path)

    # Check that both templates have the same fields
    compare_templates(schema_path, generated_template, reference_template)

    # Validate the Excel template
    assert reference_template.validate(template)
def stage_assay_for_analysis(template_type):
    """
    Simulates an initial assay upload by prismifying the initial
    assay template object.

    Args:
        template_type: the analysis template type being staged

    Returns:
        A prism patch with staged artifacts for the prerequisite assay,
        or an empty dict when `template_type` has no prerequisite.
    """
    # Maps analysis template types to the assay upload they depend on
    staging_map = {
        "cytof_analysis": "cytof",
        "tumor_normal_pairing": "wes_fastq",
    }

    # Idiomatic membership test (was `if not template_type in ...`)
    if template_type not in staging_map:
        return {}

    prelim_assay = staging_map[template_type]

    preassay_xlsx_path = os.path.join(
        TEMPLATE_EXAMPLES_DIR, prelim_assay + "_template.xlsx"
    )
    preassay_xlsx, _ = XlTemplateReader.from_excel(preassay_xlsx_path)
    preassay_template = Template.from_type(prelim_assay)
    prism_res = core.prismify(preassay_xlsx, preassay_template)

    return prism_patch_stage_artifacts(prism_res, prelim_assay)
def template(request):
    """Fixture: a Template instance for the parametrized template type."""
    requested_type = request.param
    return Template.from_type(requested_type)
def test_template_schema_checks():
    """Template construction rejects malformed worksheet field schemas.

    Each step below mutates `schema` to fix the previous error and expose
    the next one, so statement order matters.
    """
    schema = {
        "properties": {
            "worksheets": {
                "worksheet_1": {
                    "prism_preamble_object_pointer": "/prism_preamble_object_pointer/0",
                    "preamble_rows": {
                        # deliberately broken: gcs_uri_format on a non-artifact field
                        "preamble_field_1": {
                            "gcs_uri_format": "should not be here"
                        }
                    },
                    "prism_data_object_pointer": "/prism_data_object_pointer/-",
                    "data_columns": {
                        "section_1": {
                            "data_field_1": {
                                "merge_pointer": "/data_field",
                                "type": "number",
                                "is_artifact": True,
                            }
                        }
                    },
                }
            }
        }
    }

    # 1) A field with no "type" can't be mapped at all.
    with pytest.raises(
            Exception,
            match=
            "Error in template 'adhoc_test_template'/'worksheet_1': Couldn't load mapping for 'preamble_field_1': Either \"type\".*should be present",
    ):
        template = Template(schema, type="adhoc_test_template")

    # 2) With a type but no merge_pointer, mapping construction still fails.
    schema["properties"]["worksheets"]["worksheet_1"]["preamble_rows"][
        "preamble_field_1"]["type"] = "string"
    with pytest.raises(Exception, match=r"missing.*required.*argument.*merge_pointer"):
        template = Template(schema, type="adhoc_test_template")

    # 3) gcs_uri_format is rejected on a field that is not an artifact.
    schema["properties"]["worksheets"]["worksheet_1"]["preamble_rows"][
        "preamble_field_1"]["merge_pointer"] = "preamble_field"
    with pytest.raises(Exception, match="gcs_uri_format defined for not is_artifact"):
        template = Template(schema, type="adhoc_test_template")

    # 4) With the bad preamble gcs_uri_format removed, the artifact field
    #    (data_field_1) is flagged for having no gcs_uri_format at all.
    del schema["properties"]["worksheets"]["worksheet_1"]["preamble_rows"][
        "preamble_field_1"]["gcs_uri_format"]
    with pytest.raises(Exception, match="Empty gcs_uri_format"):
        template = Template(schema, type="adhoc_test_template")

    # 5) gcs_uri_format must be a dict or a str — an int is rejected.
    schema["properties"]["worksheets"]["worksheet_1"]["data_columns"][
        "section_1"]["data_field_1"]["gcs_uri_format"] = 123
    with pytest.raises(Exception, match=r"Bad gcs_uri_format.*should be dict or str"):
        template = Template(schema, type="adhoc_test_template")

    # 6) A dict-style gcs_uri_format requires a 'format' key.
    schema["properties"]["worksheets"]["worksheet_1"]["data_columns"][
        "section_1"]["data_field_1"]["gcs_uri_format"] = {
            "check_errors": "something"
        }
    with pytest.raises(Exception, match="dict type gcs_uri_format should have 'format'"):
        template = Template(schema, type="adhoc_test_template")

    # 7) Finally, a well-formed dict gcs_uri_format: construction succeeds.
    schema["properties"]["worksheets"]["worksheet_1"]["data_columns"][
        "section_1"]["data_field_1"]["gcs_uri_format"] = {
            "check_errors": "something",
            "format": "/some/{thing}"
        }
    template = Template(schema, type="adhoc_test_template")
def pbmc_template(pbmc_schema_path):
    """Fixture: load the PBMC template from the given JSON schema path."""
    loaded = Template.from_json(pbmc_schema_path, SCHEMA_DIR)
    return loaded
def prismify(
    xlsx: XlTemplateReader,
    template: Template,
    schema_root: str = SCHEMA_DIR,
    debug: bool = False,
) -> (dict, List[LocalFileUploadEntry], List[Union[Exception, str]]):
    """
    Converts excel file to json object. It also identifies local files
    which need to uploaded to a google bucket and provides some logic
    to help build the bucket url.

    e.g. file list
    [
        {
            'local_path': '/path/to/fwd.fastq',
            'gs_key': '10021/CTTTPPPSS/wes_forward.fastq'
        }
    ]

    Args:
        xlsx: cidc_schemas.template_reader.XlTemplateReader instance
        template: cidc_schemas.template.Template instance
        schema_root: path to the target JSON schema, defaulting to CIDC schemas root
        debug: unused here; kept for interface compatibility

    Returns:
        (tuple):
            arg1: clinical trial object with data parsed from spreadsheet
            arg2: list of `LocalFileUploadEntry`s that describe each file identified:
                LocalFileUploadEntry(
                    local_path = "/local/path/to/a/data/file/parsed/from/template",
                    gs_key = "constructed/relative/to/clinical/trial/GCS/path",
                    upload_placeholder = "random_uuid-for-artifact-upload",
                    metadata_availability = boolean to indicate whether LocalFileUploadEntry
                                            should be extracted for metadata files
                )
            arg3: list of errors

    Process:
    * checks out `prism_preamble_object_pointer` which is a "standard"/absolute
      rfc6901 json-pointer from CT root object to a new assay location.
      E.g. for WES it is `/assays/wes/0`, in DeepDiff terms `ct["assays"]["wes"][0]`

    * creates such "parent/preamble" object.
      E.g. for WES an object that corresponds to a wes_assay will be created:
        {
          "assays": {
            "wes": [
              {
                ...    # we're here - this is "preamble" obj = "assay" obj
              }
            ]
          }
        }

    * then processes all "preamble_rows" properties from "..._template.json"
      to fill object's properties. It uses "merge_pointer"s relative to this
      "parent/preamble" object to determine exact location where to set value.
      In most cases it's just "0/field_name". Where "0" denotes that "field_name"
      is a field in the current object.
      With exceptions like - "3/protocol_identifier" which says basically
      "go 3 levels up in the hierarchy and take protocol_identifier field of the root".
      E.g. WES:
        {
          "protocol_identifier": "4412" # from `3/protocol_identifier`
          "assays": {
            "wes": [
              {
                "assay_creator": "DFCI" # from `0/assay_creator`
              }
            ]
          }
        }

    * then it goes in a loop over all "record" rows in .xlsx, and creates
      an object within that "parent" object for each row. These "record-objects"
      are created at "prism_data_object_pointer" location relative to "preamble".
      E.g. for WES: `"prism_data_object_pointer" : "/records/-"`
        {
          "assays": {
            "wes": [
              {
                "assay_creator": "DFCI",
                "records": [
                  {
                    ...    # we're here - this is "record" obj = "assay entry" obj
                  }
                ]
              }
            ]
          }
        }
      NB Minus sign at the end of "/records/-" is a special relative-json-pointer
      notation that means we need to create new object in an 'record' array.
      So it's like if python's `l.append(v)` would've been `l[-] = v`.

    * Prism now uses those "merge_pointer" relative to this "record" object,
      to populate field values of a "record" in the same way as with "preamble".
      E.g. for WES: `"prism_data_object_pointer" : "/records/-"`
        {
          "assays": {
            "wes": [
              {
                "assay_creator": "DFCI",
                "records": [
                  {
                    "cimac_id": ...                # from "0/cimac_id",
                    "enrichment_vendor_lot": ...   # from "0/enrichment_vendor_lot",
                    "capture_date": ...            # from "0/capture_date",
                  }
                ]
              }
            ]
          }
        }

    * Finally, as there were many "records" object created/populated,
      Prism now uses `prism_preamble_object_schema` to merge all that together
      with respect to `mergeStrategy`es defined in that schema.
    """
    # Encryption must be initialized (set_prism_encrypt_key) before any
    # field values can be hashed/encrypted during processing.
    _check_encrypt_init()

    if template.type not in SUPPORTED_TEMPLATES:
        raise NotImplementedError(
            f"{template.type!r} is not supported, only {SUPPORTED_TEMPLATES} are."
        )

    # Parsing errors are accumulated and returned, not raised, so one bad
    # cell doesn't abort processing of the whole spreadsheet.
    errors_so_far = []

    # get the root CT schema
    root_ct_schema_name = (
        template.schema.get("prism_template_root_object_schema")
        or "clinical_trial.json"
    )
    root_ct_schema = load_and_validate_schema(root_ct_schema_name, schema_root)
    # create the result CT dictionary
    root_ct_obj = {}
    template_root_obj_pointer = template.schema.get(
        "prism_template_root_object_pointer", ""
    )
    # If the template targets a sub-object of the trial, work in that
    # sub-object (`template_root_obj`) and graft it onto `root_ct_obj`.
    if template_root_obj_pointer != "":
        template_root_obj = {}
        _set_val(template_root_obj_pointer, template_root_obj, root_ct_obj)
    else:
        template_root_obj = root_ct_obj

    # and merger for it
    root_ct_merger = Merger(root_ct_schema, strategies=PRISM_MERGE_STRATEGIES)
    # and where to collect all local file refs
    collected_files = []

    # loop over spreadsheet worksheets
    for ws_name, ws in xlsx.grouped_rows.items():
        logger.debug(f"next worksheet {ws_name!r}")

        # Here we take only first two cells from preamble as key and value respectfully,
        # lowering keys to match template schema definitions.
        preamble_context = dict(
            (r.values[0].lower(), r.values[1]) for r in ws.get(RowType.PREAMBLE, [])
        )
        # We need this full "preamble dict" (all key-value pairs) prior to processing
        # properties from data_columns or preamble wrt template schema definitions, because
        # there can be a 'gcs_uri_format' that needs to have access to all values.

        templ_ws = template.schema["properties"]["worksheets"].get(ws_name)
        if not templ_ws:
            # Unknown worksheets are either silently skipped (if whitelisted
            # as ignorable) or recorded as errors.
            if ws_name in template.ignored_worksheets:
                continue
            errors_so_far.append(f"Unexpected worksheet {ws_name!r}.")
            continue

        preamble_object_schema = load_and_validate_schema(
            templ_ws.get("prism_preamble_object_schema", root_ct_schema_name),
            schema_root,
        )
        preamble_merger = Merger(
            preamble_object_schema, strategies=PRISM_MERGE_STRATEGIES
        )
        preamble_object_pointer = templ_ws.get("prism_preamble_object_pointer", "")
        data_object_pointer = templ_ws["prism_data_object_pointer"]

        # creating preamble obj
        preamble_obj = {}

        # Processing data rows first
        data = ws[RowType.DATA]
        if data:
            # get the data
            headers = ws[RowType.HEADER][0]

            for row in data:
                # NOTE(review): uses the root `logging` module here, unlike the
                # module-level `logger` used everywhere else — possibly an oversight.
                logging.debug(f" next data row {row!r}")

                # creating data obj
                data_obj = {}
                copy_of_preamble = {}
                # Anchor the fresh data_obj inside a throwaway preamble copy so
                # each row can be merged (and collision-checked) independently.
                _set_val(
                    data_object_pointer,
                    data_obj,
                    copy_of_preamble,
                    template_root_obj,
                    preamble_object_pointer,
                )

                # We create this "data record dict" (all key-value pairs) prior to processing
                # properties from data_columns wrt template schema definitions, because
                # there can be a 'gcs_uri_format' that needs to have access to all values.
                local_context = dict(
                    zip([h.lower() for h in headers.values], row.values)
                )

                # create dictionary per row
                for key, val in zip(headers.values, row.values):
                    # Row-local values take precedence via dict(); preamble
                    # values fill in the rest of the formatting context.
                    combined_context = dict(local_context, **preamble_context)
                    try:
                        changes, new_files = template.process_field_value(
                            ws_name, key, val, combined_context, _encrypt
                        )
                    except ParsingException as e:
                        errors_so_far.append(e)
                    else:
                        _apply_changes(
                            changes, data_obj, copy_of_preamble, data_object_pointer
                        )
                        collected_files.extend(new_files)

                try:
                    preamble_obj = preamble_merger.merge(preamble_obj, copy_of_preamble)
                except MergeCollisionException as e:
                    # Reformatting exception, because this mismatch happened within one template
                    # and not with some saved stuff.
                    wrapped = e.with_context(row=row.row_num, worksheet=ws_name)
                    errors_so_far.append(wrapped)
                    logger.info(f"MergeCollisionException: {wrapped}")

        # Now processing preamble rows
        logger.debug(f" preamble for {ws_name!r}")
        for row in ws[RowType.PREAMBLE]:
            # Only the first two cells of a preamble row are meaningful.
            k, v, *_ = row.values
            try:
                changes, new_files = template.process_field_value(
                    ws_name, k, v, preamble_context, _encrypt
                )
            except ParsingException as e:
                errors_so_far.append(e)
            else:
                # TODO we might want to use copy+preamble_merger here too,
                # to for complex properties that require mergeStrategy
                _apply_changes(
                    changes,
                    preamble_obj,
                    root_ct_obj,
                    template_root_obj_pointer + preamble_object_pointer,
                )
                collected_files.extend(new_files)

        # Now pushing it up / merging with the whole thing
        copy_of_templ_root = {}
        _set_val(preamble_object_pointer, preamble_obj, copy_of_templ_root)
        logger.debug("merging root objs")
        logger.debug(f" {template_root_obj}")
        logger.debug(f" {copy_of_templ_root}")
        template_root_obj = root_ct_merger.merge(template_root_obj, copy_of_templ_root)
        logger.debug(f" merged - {template_root_obj}")

    # Re-graft the (possibly re-assigned) template root back onto the CT root.
    if template_root_obj_pointer != "":
        _set_val(template_root_obj_pointer, template_root_obj, root_ct_obj)
    else:
        root_ct_obj = template_root_obj

    return root_ct_obj, collected_files, errors_so_far
def check_validation_error(schema, msg):
    """Assert that worksheet validation fails and the error mentions `msg`."""
    with pytest.raises(AssertionError) as excinfo:
        Template._validate_worksheet("", schema)
    error_text = str(excinfo.value)
    assert msg in error_text
def pbmc_template():
    """Fixture: build the PBMC template from the bundled JSON schema file."""
    schema_file = os.path.join(SCHEMA_DIR, 'templates', 'pbmc_template.json')
    return Template.from_json(schema_file, SCHEMA_DIR)