def test_load_subschema():
    """Test that the subschema loading option works as expected."""
    schema = load_and_validate_schema("clinical_trial.json")
    subschema = schema["properties"]["participants"]

    path = "properties/participants"
    assert subschema == load_and_validate_schema(f"clinical_trial.json#{path}")
    assert subschema == load_and_validate_schema(f"clinical_trial.json#/{path}")

    with pytest.raises(jsonpointer.JsonPointerException):
        load_and_validate_schema("clinical_trial.json#foo")
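# A hedged sketch of how the "#<fragment>" syntax above is presumably resolved:
# load_and_validate_schema appears to hand the fragment to the jsonpointer
# package (hence the expected jsonpointer.JsonPointerException). The helper
# name `resolve_fragment` below is hypothetical, not part of the library.
import jsonpointer


def resolve_fragment(schema: dict, fragment: str):
    # the test accepts fragments with or without a leading slash,
    # so we assume one is prepended when missing.
    pointer = fragment if fragment.startswith("/") else "/" + fragment
    return jsonpointer.resolve_pointer(schema, pointer)


_schema = {"properties": {"participants": {"type": "array"}}}
assert resolve_fragment(_schema, "properties/participants") == {"type": "array"}
# an unresolvable fragment such as "foo" raises jsonpointer.JsonPointerException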
def load_schemas() -> dict:
    """
    Load all JSON schemas into a dictionary keyed on the schema directory.
    Values are dictionaries mapping entity names to loaded and validated
    entity schemas.
    """
    schemas = {}
    for root, _, paths in os.walk(SCHEMA_DIR):
        root_schemas = {}
        for path in paths:
            schema_path = os.path.join(root, path)

            def json_to_html(ref):
                """Update refs to refer to the URL of the corresponding documentation."""
                url = ref.replace('.json', '.html')
                url = url.replace('properties/', '')
                url = url.replace('definitions/', '')
                url = url.replace('/', '.')
                return {'url': url}

            schema = load_and_validate_schema(
                schema_path, SCHEMA_DIR, on_refs=json_to_html
            )

            schema_path = path.replace(".json", ".html").replace("/", ".")
            root_schemas[schema_path] = schema

        relative_root = root.replace(SCHEMA_DIR, "").replace("/", ".")
        relative_root = relative_root.replace(".", "", 1)
        schemas[relative_root] = root_schemas

    return schemas
def _fetch_validator(name):
    schema_root = SCHEMA_DIR
    schema_path = os.path.join(SCHEMA_DIR, "assays/%s_assay.json" % name)
    schema = load_and_validate_schema(schema_path, schema_root)

    # create validator, asserting the schemas are valid.
    return jsonschema.Draft7Validator(schema)
def test_merge_core():
    # create aliquot
    aliquot = {"cimac_aliquot_id": "1234"}

    # create the sample.
    sample = {
        "cimac_sample_id": "S1234",
        "site_sample_id": "blank",
        "aliquots": [aliquot],
    }

    # create the participant
    participant = {
        "cimac_participant_id": "P1234",
        "trial_participant_id": "blank",
        "samples": [sample],
    }

    # create the trial
    ct1 = {"lead_organization_study_id": "test", "participants": [participant]}

    # create validator, asserting the schemas are valid.
    validator = load_and_validate_schema("clinical_trial.json", return_validator=True)
    schema = validator.schema
    validator.validate(ct1)

    # create a copy of this, modify participant id
    ct2 = copy.deepcopy(ct1)
    ct2['participants'][0]['cimac_participant_id'] = "PABCD"

    # merge them
    merger = Merger(schema)
    ct3 = merger.merge(ct1, ct2)

    # assert we have two participants and their ids are different.
    assert len(ct3['participants']) == 2
    assert (
        ct3['participants'][0]['cimac_participant_id']
        == ct1['participants'][0]['cimac_participant_id']
    )
    assert (
        ct3['participants'][1]['cimac_participant_id']
        == ct2['participants'][0]['cimac_participant_id']
    )

    # now let's add a new sample to one of the participants
    ct4 = copy.deepcopy(ct3)
    sample2 = ct4['participants'][0]['samples'][0]
    sample2['cimac_sample_id'] = 'new_id_1'
    ct5 = merger.merge(ct3, ct4)
    assert len(ct5['participants'][0]['samples']) == 2

    # now let's add a new aliquot to one of the samples.
    ct6 = copy.deepcopy(ct5)
    aliquot2 = ct6['participants'][0]['samples'][0]['aliquots'][0]
    aliquot2['cimac_aliquot_id'] = 'new_ali_id_1'
    ct7 = merger.merge(ct5, ct6)
    assert len(ct7['participants'][0]['samples'][0]['aliquots']) == 2
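# For context: a minimal, self-contained sketch of the list-merge behavior the
# test above relies on, assuming the clinical_trial schema uses jsonmerge's
# "arrayMergeById" strategy keyed on cimac_participant_id (inferred from the
# test's assertions, not copied from the real schema file).
from jsonmerge import Merger

_schema = {
    "type": "object",
    "properties": {
        "participants": {
            "type": "array",
            "mergeStrategy": "arrayMergeById",
            "mergeOptions": {"idRef": "/cimac_participant_id"},
            "items": {"type": "object"},
        }
    },
}

_merger = Merger(_schema)
_base = {"participants": [{"cimac_participant_id": "P1234"}]}
_head = {"participants": [{"cimac_participant id".replace(" ", "_"): "PABCD"}]}
_head = {"participants": [{"cimac_participant_id": "PABCD"}]}

# distinct ids -> both records are kept, matching the two-participant assertion;
# identical ids would instead be merged into a single record.
assert len(_merger.merge(_base, _head)["participants"]) == 2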
def test_iter_error_messages():
    """Smoke check that _Validator.iter_error_messages returns strings,
    not ValidationErrors."""
    validator = load_and_validate_schema("clinical_trial.json", return_validator=True)
    errs = list(validator.iter_error_messages({"protocol_identifier": "foo123"}))
    for err in errs:
        assert isinstance(err, str)
def _fetch_validator(name):
    schema_root = SCHEMA_DIR
    schema_path = os.path.join(SCHEMA_DIR, "%s.json" % name)
    validator = load_and_validate_schema(
        schema_path, schema_root, return_validator=True
    )
    return validator
def _fetch_validator(name):
    schema_root = SCHEMA_DIR
    schema_path = os.path.join(SCHEMA_DIR, "artifacts/artifact_%s.json" % name)

    # create validator, asserting the schemas are valid.
    validator = load_and_validate_schema(
        schema_path, schema_root, return_validator=True
    )
    return validator
def test_recursive_validations():
    validator = load_and_validate_schema(
        "a.json", schema_root=TEST_SCHEMA_DIR, return_validator=True
    )

    with pytest.raises(jsonschema.ValidationError, match="not of type 'array'"):
        validator.validate({"a_prop": {"recursive_prop": {}}})

    with pytest.raises(jsonschema.ValidationError, match="not of type 'array'"):
        validator.validate({"a_prop": {"recursive_prop": [{}]}})

    with pytest.raises(jsonschema.ValidationError, match="not of type 'array'"):
        validator.validate({"a_prop": {"recursive_prop": [[{}], [], [[]]]}})

    validator.validate({"a_prop": {"recursive_prop": [[[]], [], [[[[]]]]]}})
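# The shape of a.json isn't shown here; the following is an assumed minimal
# self-referential schema that reproduces the behavior the test asserts:
# recursive_prop is an array whose items refer back to the same definition,
# so arbitrarily nested arrays validate while any object fails.
import jsonschema

_recursive_schema = {
    "definitions": {
        "nested": {"type": "array", "items": {"$ref": "#/definitions/nested"}}
    },
    "type": "object",
    "properties": {"recursive_prop": {"$ref": "#/definitions/nested"}},
}

_v = jsonschema.Draft7Validator(_recursive_schema)
_v.validate({"recursive_prop": [[[]], [], [[[[]]]]]})  # arrays all the way down: ok
try:
    _v.validate({"recursive_prop": [{}]})  # an object nested anywhere fails
except jsonschema.ValidationError as e:
    assert "not of type 'array'" in e.message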
def test_filepath_gen():
    # create validators
    validator = load_and_validate_schema("clinical_trial.json", return_validator=True)
    schema = validator.schema

    # get a specific template
    for temp_path, xlsx_path in template_paths():
        # extract hint.
        hint = temp_path.split("/")[-1].replace("_template.json", "")

        # TODO: only implemented WES parsing...
        if hint != "wes":
            continue

        # parse the spreadsheet and get the file maps
        ct, file_maps = prismify(xlsx_path, temp_path, assay_hint=hint)

        # assert we have the right counts.
        if hint == "wes":
            # check the number of files present.
            assert len(file_maps) == 6

            # we should have 2 fastqs per sample.
            assert 4 == sum(1 for x in file_maps if x['gs_key'].count("fastq") > 0)

            # we should have 2 forward and 2 reverse in total.
            assert 2 == sum(1 for x in file_maps if x['gs_key'].count("forward") > 0)
            assert 2 == sum(1 for x in file_maps if x['gs_key'].count("reverse") > 0)

            # we should have 2 text files
            assert 2 == sum(1 for x in file_maps if x['gs_key'].count("txt") > 0)

        # assert the result validates
        validator.validate(ct)
def test_schema(example_path):
    validator = load_and_validate_schema("clinical_trial.json", return_validator=True)

    full_path = os.path.join(*example_path)
    _, fname = example_path

    with open(full_path) as file:
        try:
            ct_example = json.load(file)
        except Exception as e:
            raise Exception(f"Error decoding {example_path}: {e}")

        try:
            validator.validate(ct_example)
        except jsonschema.exceptions.ValidationError as e:
            instance_path = "[" + "][".join(repr(p) for p in e.absolute_path) + "]"
            schema_path = "[" + "][".join(repr(p) for p in e.absolute_schema_path) + "]"
            raise Exception(
                f"Failed to validate {fname}:{instance_path}"
                f"\n {e.message}"
                f"\n CT_SCHEMA{schema_path}"
                f"\n instance {e.instance}"
            )
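# A small standalone illustration of the error-path formatting used above:
# jsonschema's ValidationError exposes absolute_path and absolute_schema_path
# as deques of keys/indices, which the test renders as bracketed lookups.
import jsonschema

_validator = jsonschema.Draft7Validator(
    {"properties": {"participants": {"type": "array"}}}
)
for _err in _validator.iter_errors({"participants": "oops"}):
    _path = "[" + "][".join(repr(p) for p in _err.absolute_path) + "]"
    print(f"instance{_path}: {_err.message}")
    # -> instance['participants']: 'oops' is not of type 'array'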
def test_prism():
    # create validators
    validator = load_and_validate_schema("clinical_trial.json", return_validator=True)
    schema = validator.schema

    # get a specific template
    for temp_path, xlsx_path in template_paths():
        # extract hint.
        hint = temp_path.split("/")[-1].replace("_template.json", "")

        # TODO: only implemented WES parsing...
        if hint != "wes":
            continue

        # turn into object.
        ct, file_maps = prismify(xlsx_path, temp_path, assay_hint=hint)

        # assert the result validates
        validator.validate(ct)
def test_trial_core():
    # load and validate schema.
    schema_root = SCHEMA_DIR
    ct_schema_path = os.path.join(SCHEMA_DIR, "clinical_trial.json")
    pt_schema_path = os.path.join(SCHEMA_DIR, "participant.json")
    sm_schema_path = os.path.join(SCHEMA_DIR, "sample.json")
    al_schema_path = os.path.join(SCHEMA_DIR, "aliquot.json")

    ct_schema = load_and_validate_schema(ct_schema_path, schema_root)
    pt_schema = load_and_validate_schema(pt_schema_path, schema_root)
    sm_schema = load_and_validate_schema(sm_schema_path, schema_root)
    al_schema = load_and_validate_schema(al_schema_path, schema_root)

    # create validators, asserting the schemas are valid.
    ct_validator = jsonschema.Draft7Validator(ct_schema)
    ct_validator.check_schema(ct_schema)
    pt_validator = jsonschema.Draft7Validator(pt_schema)
    pt_validator.check_schema(pt_schema)
    sm_validator = jsonschema.Draft7Validator(sm_schema)
    sm_validator.check_schema(sm_schema)
    al_validator = jsonschema.Draft7Validator(al_schema)
    al_validator.check_schema(al_schema)

    # create a shipment and some aliquots.
    shipment = {
        "account_number": "account_number",
        "assay_priority": "1",
        "assay_type": "Olink",
        "courier": "USPS",
        "date_received": "date_received",
        "date_shipped": "date_shipped",
        "manifest_id": "manifest_id",
        "quality_of_shipment": "Specimen shipment received in good condition",
        "ship_from": "ship_from",
        "ship_to": "ship_to",
        "shipping_condition": "Ice_Pack",
        "tracking_number": "tracking_number",
        "receiving_party": "MDA_Wistuba",
    }
    aliquot1 = {
        "slide_number": "99",
        "aliquot_replacement": "N/A",
        "aliquot_status": "Other",
    }
    al_validator.validate(aliquot1)
    aliquot2 = {
        "slide_number": "98",
        "aliquot_replacement": "N/A",
        "aliquot_status": "Other",
    }
    al_validator.validate(aliquot2)

    # create some samples.
    sample1 = {
        "cimac_id": "CTTTPPP12.00",
        "parent_sample_id": "ssida",
        "aliquots": [aliquot1],
        "collection_event_name": "Baseline",
        "type_of_primary_container": "Sodium heparin",
        "sample_location": "---",
        "type_of_sample": "Other",
        "sample_volume_units": "Other",
        "material_used": 1,
        "material_remaining": 0,
        "quality_of_sample": "Other",
        "box_number": "1",
    }
    sm_validator.validate(sample1)
    sample2 = {
        "cimac_id": "CTTTPPP12.00",
        "parent_sample_id": "ssidb",
        "aliquots": [aliquot2],
        "collection_event_name": "Baseline",
        "type_of_primary_container": "Sodium heparin",
        "sample_location": "---",
        "type_of_sample": "Other",
    }
    sm_validator.validate(sample2)

    # create a bad participant, then make it good.
    participant = {
        "cimac_participant_id": "CTTTPPP",
        "participant_id": "tpid_a",
        "cohort_name": "Arm_Z",
    }
    with pytest.raises(jsonschema.ValidationError):
        pt_validator.validate(participant)

    # add samples to the participant.
    participant["samples"] = [sample1, sample2]
    pt_validator.validate(participant)

    # validate the positive version works.
    clinical_trial = {
        PROTOCOL_ID_FIELD_NAME: "trial1",
        "allowed_collection_event_names": ["Baseline"],
        "allowed_cohort_names": ["Arm_Z"],
        "participants": [participant],
        "shipments": [shipment],
    }
    ct_validator.validate(clinical_trial)

    # make it fail
    participant.pop("cimac_participant_id")
    with pytest.raises(jsonschema.ValidationError):
        ct_validator.validate(clinical_trial)
def load():
    load_and_validate_schema("clinical_trial.json")
def prismify(
    xlsx: XlTemplateReader,
    template: Template,
    schema_root: str = SCHEMA_DIR,
    debug: bool = False,
) -> (dict, List[LocalFileUploadEntry], List[Union[Exception, str]]):
    """
    Converts excel file to json object. It also identifies local files
    which need to be uploaded to a google bucket and provides some logic
    to help build the bucket url.

    e.g. file list
    [
        {
            'local_path': '/path/to/fwd.fastq',
            'gs_key': '10021/CTTTPPPSS/wes_forward.fastq'
        }
    ]

    Args:
        xlsx: cidc_schemas.template_reader.XlTemplateReader instance
        template: cidc_schemas.template.Template instance
        schema_root: path to the target JSON schema, defaulting to CIDC schemas root

    Returns:
        (tuple):
            arg1: clinical trial object with data parsed from spreadsheet
            arg2: list of `LocalFileUploadEntry`s that describe each file identified:
                LocalFileUploadEntry(
                    local_path = "/local/path/to/a/data/file/parsed/from/template",
                    gs_key = "constructed/relative/to/clinical/trial/GCS/path",
                    upload_placeholder = "random_uuid-for-artifact-upload",
                    metadata_availability = boolean indicating whether the file
                        should be extracted for metadata
                )
            arg3: list of errors

    Process:
    * checks out `prism_preamble_object_pointer`, which is a "standard"/absolute
      rfc6901 json-pointer from the CT root object to a new assay location.
      E.g. for WES it is `/assays/wes/0`, in DeepDiff terms `ct["assays"]["wes"][0]`

    * creates such a "parent/preamble" object.
      E.g. for WES, an object that corresponds to a wes_assay will be created:
        {
          "assays": {
            "wes": [
              {
                ...  # we're here - this is the "preamble" obj = "assay" obj
              }
            ]
          }
        }

    * then processes all "preamble_rows" properties from "..._template.json"
      to fill the object's properties. It uses "merge_pointer"s relative to this
      "parent/preamble" object to determine the exact location where to set a
      value. In most cases it's just "0/field_name", where "0" denotes that
      "field_name" is a field in the current object. There are exceptions like
      "3/protocol_identifier", which says basically "go 3 levels up in the
      hierarchy and take the protocol_identifier field of the root".
      E.g. WES:
        {
          "protocol_identifier": "4412",  # from `3/protocol_identifier`
          "assays": {
            "wes": [
              {
                "assay_creator": "DFCI"  # from `0/assay_creator`
              }
            ]
          }
        }

    * then it goes in a loop over all "record" rows in the .xlsx, and creates an
      object within that "parent" object for each row. These "record objects" are
      created at the "prism_data_object_pointer" location relative to "preamble".
      E.g. for WES: `"prism_data_object_pointer": "/records/-"`
        {
          "assays": {
            "wes": [
              {
                "assay_creator": "DFCI",
                "records": [
                  {
                    ...  # we're here - this is the "record" obj = "assay entry" obj
                  }
                ]
              }
            ]
          }
        }
      NB: the minus sign at the end of "/records/-" is a special json-pointer
      notation that means we need to create a new object in the 'records' array.
      It's as if python's `l.append(v)` were written `l[-] = v`.

    * Prism then uses "merge_pointer"s relative to this "record" object to
      populate field values of a "record" in the same way as with "preamble".
      E.g. for WES:
        {
          "assays": {
            "wes": [
              {
                "assay_creator": "DFCI",
                "records": [
                  {
                    "cimac_id": ...,               # from "0/cimac_id"
                    "enrichment_vendor_lot": ...,  # from "0/enrichment_vendor_lot"
                    "capture_date": ...,           # from "0/capture_date"
                  }
                ]
              }
            ]
          }
        }

    * Finally, as many "records" objects were created/populated, Prism uses
      `prism_preamble_object_schema` to merge all of that together, with respect
      to the `mergeStrategy`es defined in that schema.
""" _check_encrypt_init() if template.type not in SUPPORTED_TEMPLATES: raise NotImplementedError( f"{template.type!r} is not supported, only {SUPPORTED_TEMPLATES} are." ) errors_so_far = [] # get the root CT schema root_ct_schema_name = ( template.schema.get("prism_template_root_object_schema") or "clinical_trial.json" ) root_ct_schema = load_and_validate_schema(root_ct_schema_name, schema_root) # create the result CT dictionary root_ct_obj = {} template_root_obj_pointer = template.schema.get( "prism_template_root_object_pointer", "" ) if template_root_obj_pointer != "": template_root_obj = {} _set_val(template_root_obj_pointer, template_root_obj, root_ct_obj) else: template_root_obj = root_ct_obj # and merger for it root_ct_merger = Merger(root_ct_schema, strategies=PRISM_MERGE_STRATEGIES) # and where to collect all local file refs collected_files = [] # loop over spreadsheet worksheets for ws_name, ws in xlsx.grouped_rows.items(): logger.debug(f"next worksheet {ws_name!r}") # Here we take only first two cells from preamble as key and value respectfully, # lowering keys to match template schema definitions. preamble_context = dict( (r.values[0].lower(), r.values[1]) for r in ws.get(RowType.PREAMBLE, []) ) # We need this full "preamble dict" (all key-value pairs) prior to processing # properties from data_columns or preamble wrt template schema definitions, because # there can be a 'gcs_uri_format' that needs to have access to all values. templ_ws = template.schema["properties"]["worksheets"].get(ws_name) if not templ_ws: if ws_name in template.ignored_worksheets: continue errors_so_far.append(f"Unexpected worksheet {ws_name!r}.") continue preamble_object_schema = load_and_validate_schema( templ_ws.get("prism_preamble_object_schema", root_ct_schema_name), schema_root, ) preamble_merger = Merger( preamble_object_schema, strategies=PRISM_MERGE_STRATEGIES ) preamble_object_pointer = templ_ws.get("prism_preamble_object_pointer", "") data_object_pointer = templ_ws["prism_data_object_pointer"] # creating preamble obj preamble_obj = {} # Processing data rows first data = ws[RowType.DATA] if data: # get the data headers = ws[RowType.HEADER][0] # for row in data: for row in data: logging.debug(f" next data row {row!r}") # creating data obj data_obj = {} copy_of_preamble = {} _set_val( data_object_pointer, data_obj, copy_of_preamble, template_root_obj, preamble_object_pointer, ) # We create this "data record dict" (all key-value pairs) prior to processing # properties from data_columns wrt template schema definitions, because # there can be a 'gcs_uri_format' that needs to have access to all values. local_context = dict( zip([h.lower() for h in headers.values], row.values) ) # create dictionary per row for key, val in zip(headers.values, row.values): combined_context = dict(local_context, **preamble_context) try: changes, new_files = template.process_field_value( ws_name, key, val, combined_context, _encrypt ) except ParsingException as e: errors_so_far.append(e) else: _apply_changes( changes, data_obj, copy_of_preamble, data_object_pointer ) collected_files.extend(new_files) try: preamble_obj = preamble_merger.merge(preamble_obj, copy_of_preamble) except MergeCollisionException as e: # Reformatting exception, because this mismatch happened within one template # and not with some saved stuff. 
                    wrapped = e.with_context(row=row.row_num, worksheet=ws_name)
                    errors_so_far.append(wrapped)
                    logger.info(f"MergeCollisionException: {wrapped}")

        # Now processing preamble rows
        logger.debug(f"  preamble for {ws_name!r}")
        for row in ws[RowType.PREAMBLE]:
            k, v, *_ = row.values
            try:
                changes, new_files = template.process_field_value(
                    ws_name, k, v, preamble_context, _encrypt
                )
            except ParsingException as e:
                errors_so_far.append(e)
            else:
                # TODO: we might want to use copy + preamble_merger here too,
                # for complex properties that require a mergeStrategy
                _apply_changes(
                    changes,
                    preamble_obj,
                    root_ct_obj,
                    template_root_obj_pointer + preamble_object_pointer,
                )
                collected_files.extend(new_files)

        # Now pushing it up / merging with the whole thing
        copy_of_templ_root = {}
        _set_val(preamble_object_pointer, preamble_obj, copy_of_templ_root)
        logger.debug("merging root objs")
        logger.debug(f" {template_root_obj}")
        logger.debug(f" {copy_of_templ_root}")
        template_root_obj = root_ct_merger.merge(template_root_obj, copy_of_templ_root)
        logger.debug(f" merged - {template_root_obj}")

    if template_root_obj_pointer != "":
        _set_val(template_root_obj_pointer, template_root_obj, root_ct_obj)
    else:
        root_ct_obj = template_root_obj

    return root_ct_obj, collected_files, errors_so_far
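# A hypothetical sketch (not the real _set_val) of how a relative merge pointer
# such as "3/protocol_identifier" from the docstring could be interpreted: the
# leading integer says how many levels to climb from the current object's
# location before applying the remaining rfc6901 pointer.
import jsonpointer


def _apply_merge_pointer(merge_pointer: str, value, context_path: str, root: dict):
    """Set `value` at `merge_pointer`, taken relative to `context_path` in `root`."""
    levels_up, _, rest = merge_pointer.partition("/")
    parts = [p for p in context_path.split("/") if p]
    # climb `levels_up` levels from the context object, then apply the rest
    base_parts = parts[: len(parts) - int(levels_up)]
    jsonpointer.set_pointer(root, "/" + "/".join(base_parts + [rest]), value)


_doc = {"assays": {"wes": [{"assay_creator": "DFCI"}]}}
# context is the wes assay object at /assays/wes/0; "3/..." climbs to the root
_apply_merge_pointer("3/protocol_identifier", "4412", "/assays/wes/0", _doc)
assert _doc["protocol_identifier"] == "4412"
# "0/read_length" instead sets a field on the assay object itself
_apply_merge_pointer("0/read_length", 100, "/assays/wes/0", _doc)
assert _doc["assays"]["wes"][0]["read_length"] == 100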
def test_assay_merge():
    # two wes assays.
    a1 = {
        "lead_organization_study_id": "10021",
        "participants": [
            {
                "samples": [
                    {
                        "genomic_source": "Tumor",
                        "aliquots": [
                            {
                                "assay": {
                                    "wes": {
                                        "assay_creator": "Mount Sinai",
                                        "assay_category": "Whole Exome Sequencing (WES)",
                                        "enrichment_vendor_kit": "Twist",
                                        "library_vendor_kit": "KAPA - Hyper Prep",
                                        "sequencer_platform": "Illumina - NextSeq 550",
                                        "paired_end_reads": "Paired",
                                        "read_length": 100,
                                        "records": [
                                            {
                                                "library_kit_lot": "lot abc",
                                                "enrichment_vendor_lot": "lot 123",
                                                "library_prep_date": "2019-05-01 00:00:00",
                                                "capture_date": "2019-05-02 00:00:00",
                                                "input_ng": 100,
                                                "library_yield_ng": 700,
                                                "average_insert_size": 250,
                                            }
                                        ],
                                    }
                                },
                                "cimac_aliquot_id": "Aliquot 1",
                            }
                        ],
                        "cimac_sample_id": "Sample 1",
                    }
                ],
                "cimac_participant_id": "Patient 1",
            }
        ],
    }

    # create a2 and modify ids to trigger merge behavior
    a2 = copy.deepcopy(a1)
    a2['participants'][0]['samples'][0]['cimac_sample_id'] = "something different"

    # create validator, asserting the schemas are valid.
    validator = load_and_validate_schema("clinical_trial.json", return_validator=True)
    schema = validator.schema

    # merge them
    merger = Merger(schema)
    a3 = merger.merge(a1, a2)
    assert len(a3['participants']) == 1
    assert len(a3['participants'][0]['samples']) == 2
def test_trial_core():
    # load and validate schema.
    schema_root = SCHEMA_DIR
    ct_schema_path = os.path.join(SCHEMA_DIR, "clinical_trial.json")
    pt_schema_path = os.path.join(SCHEMA_DIR, "participant.json")
    sm_schema_path = os.path.join(SCHEMA_DIR, "sample.json")
    al_schema_path = os.path.join(SCHEMA_DIR, "aliquot.json")

    ct_schema = load_and_validate_schema(ct_schema_path, schema_root)
    pt_schema = load_and_validate_schema(pt_schema_path, schema_root)
    sm_schema = load_and_validate_schema(sm_schema_path, schema_root)
    al_schema = load_and_validate_schema(al_schema_path, schema_root)

    # create validators, asserting the schemas are valid.
    ct_validator = jsonschema.Draft7Validator(ct_schema)
    ct_validator.check_schema(ct_schema)
    pt_validator = jsonschema.Draft7Validator(pt_schema)
    pt_validator.check_schema(pt_schema)
    sm_validator = jsonschema.Draft7Validator(sm_schema)
    sm_validator.check_schema(sm_schema)
    al_validator = jsonschema.Draft7Validator(al_schema)
    al_validator.check_schema(al_schema)

    # create a shipment and some aliquots.
    shipment = {"request": "DFCI"}
    aliquot1 = {"cimac_aliquot_id": "c1d1"}
    aliquot2 = {"cimac_aliquot_id": "c1d2"}
    al_validator.validate(aliquot1)
    al_validator.validate(aliquot2)

    # create some samples.
    sample1 = {
        "cimac_sample_id": "csid1",
        "site_sample_id": "ssida",
        "aliquots": [aliquot1],
    }
    sample2 = {
        "cimac_sample_id": "csid12",
        "site_sample_id": "ssidb",
        "aliquots": [aliquot2],
    }
    sm_validator.validate(sample1)
    sm_validator.validate(sample2)

    # create a bad participant, then make it good.
    participant = {
        "cimac_participant_id": "cpid_1",
        "trial_participant_id": "tpid_a",
    }
    with pytest.raises(jsonschema.ValidationError):
        pt_validator.validate(participant)

    # add samples to the participant.
    participant["samples"] = [sample1, sample2]
    pt_validator.validate(participant)

    # validate the positive version works.
    clinical_trial = {
        "lead_organization_study_id": "trial1",
        "participants": [participant],
        "shipments": [shipment],
    }
    ct_validator.validate(clinical_trial)

    # make it fail
    participant.pop('cimac_participant_id')
    with pytest.raises(jsonschema.ValidationError):
        ct_validator.validate(clinical_trial)
def ct_validator():
    return load_and_validate_schema("clinical_trial.json", return_validator=True)
def test_additionalProperties():
    ct_schema = load_and_validate_schema(
        os.path.join(SCHEMA_DIR, "clinical_trial.json")
    )
    recursive_additionalProperties(ct_schema, "")
def test_schema(schema_path):
    """Ensure the schema file conforms to JSON schema draft 7."""
    assert load_and_validate_schema(schema_path)
def test_cytof():
    # test artifact sub schema
    schema_root = SCHEMA_DIR
    schema_path = os.path.join(
        SCHEMA_DIR, "assays/cytof_assay_core.json#definitions/input_files"
    )
    schema = load_and_validate_schema(schema_path, schema_root)
    validator = jsonschema.Draft7Validator(schema)

    fcs_1 = ARTIFACT_OBJ.copy()
    fcs_1["data_format"] = "FCS"
    fcs_2 = ARTIFACT_OBJ.copy()
    fcs_2["data_format"] = "FCS"
    fcs_3 = ARTIFACT_OBJ.copy()
    fcs_3["data_format"] = "FCS"
    fcs_4 = ARTIFACT_OBJ.copy()
    fcs_4["data_format"] = "FCS"

    sample_records = {"processed_fcs": fcs_1}
    validator.validate(sample_records)

    # create the cytof object
    cytof_platform = {"instrument": "dummy"}

    # create a cytof antibody object.
    antibodies = [
        {
            "antibody": "CD8",
            "isotope": "dummy",
            "dilution": "dummy",
            "stain_type": "Intracellular",
            "usage": "Analysis Only",
        },
        {
            "antibody": "PD-L1",
            "isotope": "dummy",
            "dilution": "dummy",
            "stain_type": "Intracellular",
            "usage": "Used",
        },
    ]
    cytof_panel = {
        "assay_run_id": "run_1",
        "batch_id": "XYZ",
        "cytof_antibodies": antibodies,
        "source_fcs": [fcs_2, fcs_3],
    }

    # merge the three dictionaries
    obj = {**ASSAY_CORE, **cytof_platform, **cytof_panel}

    # create the artifact objects
    fcs_1 = ARTIFACT_OBJ.copy()
    fcs_1["data_format"] = "FCS"
    fcs_2 = ARTIFACT_OBJ.copy()
    fcs_2["data_format"] = "FCS"
    fcs_3 = ARTIFACT_OBJ.copy()
    fcs_3["data_format"] = "FCS"
    assignment = ARTIFACT_OBJ.copy()
    assignment["data_format"] = "CSV"
    compartment = ARTIFACT_OBJ.copy()
    compartment["data_format"] = "CSV"
    profiling = ARTIFACT_OBJ.copy()
    profiling["data_format"] = "CSV"
    cell_count_assignment = ARTIFACT_OBJ.copy()
    cell_count_assignment["data_format"] = "CSV"
    cell_count_compartment = ARTIFACT_OBJ.copy()
    cell_count_compartment["data_format"] = "CSV"
    cell_count_profiling = ARTIFACT_OBJ.copy()
    cell_count_profiling["data_format"] = "CSV"
    report = ARTIFACT_OBJ.copy()
    report["data_format"] = "ZIP"
    analysis = ARTIFACT_OBJ.copy()
    analysis["data_format"] = "ZIP"

    records = {
        "cimac_id": "CTTTPPPSA.00",
        "input_files": {"processed_fcs": fcs_1},
        "output_files": {
            "fcs_file": fcs_1,
            "assignment": assignment,
            "compartment": compartment,
            "profiling": profiling,
            "cell_counts_assignment": assignment,
            "cell_counts_compartment": compartment,
            "cell_counts_profiling": profiling,
        },
    }

    # add a demo sample-level record.
    obj["records"] = [records]

    # create validator, asserting the schemas are valid.
    validator = _fetch_validator("cytof")
    validator.validate(obj)
def prismify(
    xlsx_path: str, template_path: str, assay_hint: str = "", verb: bool = False
) -> (dict, dict):
    """
    Converts excel file to json object. It also identifies local files
    which need to be uploaded to a google bucket and provides some logic
    to help build the bucket url.

    e.g. file list
    [
        {
            'local_path': '/path/to/fwd.fastq',
            'gs_key': '10021/Patient_1/sample_1/aliquot_1/wes_forward.fastq'
        }
    ]

    Args:
        xlsx_path: file on file system to excel file.
        template_path: path on file system relative to schema root of the template
        assay_hint: string used to help identify properties in the template.
            Must be the root of the template filename, i.e. for
            wes_template.json it would be "wes".
        verb: boolean indicating verbosity

    Returns:
        (tuple):
            arg1: clinical trial object with data parsed from spreadsheet
            arg2: list of objects which describe each file identified.
    """

    # get the schema and validator
    validator = load_and_validate_schema("clinical_trial.json", return_validator=True)
    schema = validator.schema

    # this lets us look up xlsx-to-schema keys
    key_lu = _load_keylookup(template_path)

    # this helps us identify file paths in the xlsx
    fp_lu = _build_fplu(assay_hint)

    # add a special key to track the files
    fp_lu['special'] = list()

    # read the excel file
    t = XlTemplateReader.from_excel(xlsx_path)

    # create the root dictionary.
    root = {}
    data_rows = []

    # loop over spreadsheet worksheets
    worksheet_names = t.grouped_rows.keys()
    for name in worksheet_names:

        # get the worksheet.
        ws = t.grouped_rows[name]

        # process the preamble rows
        for row in ws[RowType.PREAMBLE]:
            _process_property(row, key_lu, schema, root, assay_hint, fp_lu, verb)

        # move to headers
        headers = ws[RowType.HEADER][0]

        # get the data.
        data = ws[RowType.DATA]
        for row in data:

            # create a dictionary per row
            curd = copy.deepcopy(root)
            for key, val in zip(headers, row):
                # process this property
                _process_property(
                    [key, val], key_lu, schema, curd, assay_hint, fp_lu, verb
                )

            # save the entry
            data_rows.append(curd)

    # create the merger
    merger = Merger(schema)

    # iteratively merge.
    cur_obj = data_rows[0]
    for i in range(1, len(data_rows)):
        cur_obj = merger.merge(cur_obj, data_rows[i])

    # return the object.
    return cur_obj, fp_lu['special']
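# The explicit index loop above can be folded with functools.reduce; a small
# equivalent sketch using the same jsonmerge Merger API:
import functools

from jsonmerge import Merger


def merge_all(schema: dict, docs: list) -> dict:
    """Left-fold a list of per-row objects into a single merged document."""
    merger = Merger(schema)
    return functools.reduce(merger.merge, docs)

# e.g. `merge_all(schema, data_rows)` replaces the loop over data_rows.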