def test_missing_row_type(tmp_path):
    """Test that we get an error on a spreadsheet row that has data but lacks a row type annotation"""
    path = f"{tmp_path}/test.xlsx"
    wb = Workbook()
    wb["Sheet"]["B1"] = "some data"
    wb.save(path)
    with pytest.raises(ValidationError, match="No recognized row type found in row Sheet/1"):
        XlTemplateReader.from_excel(path)
def test_valid(tiny_template):
    """Test that a known-valid spreadsheet is considered valid"""
    tiny_valid = {
        'TEST_SHEET': [
            (RowType.PREAMBLE, 'test_property', 'foo'),
            (RowType.PREAMBLE, 'test_date', '6/11/12'),
            (RowType.PREAMBLE, 'test_time', '10:44:61'),
            (RowType.HEADER, 'test_property', 'test_date', 'test_time'),
            (RowType.DATA, 'foo', '6/11/12', '10:44:61'),
            (RowType.DATA, 'foo', '6/12/12', '10:45:61')
        ]
    }
    reader = XlTemplateReader(tiny_valid)
    assert reader.validate(tiny_template)
def template_example(template, template_example_xlsx_path):
    # Ensure the xlsx file actually exists
    assert os.path.exists(
        template_example_xlsx_path
    ), f"No example Excel template provided for {template.type}"
    reference, err = XlTemplateReader.from_excel(template_example_xlsx_path)
    assert not err
    return reference
def run(ts_path: str, mif_path: str, he_path: str, outdir: str):
    """Run and profile a typical metadata validation and merging workload."""
    set_prism_encrypt_key("foobar")

    with profiling("1_prismify_tissue_slide_shipping_manifest", outdir):
        ts_template = Template.from_type("tissue_slide")
        ts_spreadsheet, _ = XlTemplateReader.from_excel(ts_path)
        ts_metadata, _, _ = prismify(ts_spreadsheet, ts_template)
        ts_metadata["allowed_cohort_names"] = ["Not_reported"]
        ts_metadata["allowed_collection_event_names"] = ["Baseline"]

    with profiling("2_prismify_mif_assay_metadata_spreadsheet", outdir):
        mif_template = Template.from_type("mif")
        mif_spreadsheet, _ = XlTemplateReader.from_excel(mif_path)
        mif_metadata, files, _ = prismify(mif_spreadsheet, mif_template)

    with profiling("3_merge_mif_assay_artifacts_into_mif_metadata_patch", outdir):
        # tqdm gives us a stdout progress indicator as prism iterates through the array
        artifact_info = tqdm(
            [
                ArtifactInfo(
                    f.upload_placeholder,
                    f"object/url/{f.upload_placeholder}",
                    "",
                    0,
                    "",
                    "abcd",
                )
                for f in files
            ]
        )
        mif_metadata, _ = merge_artifacts(mif_metadata, artifact_info)

    with profiling("4_merge_mif_metadata_with_tissue_slide_metadata", outdir):
        combined_metadata, _ = merge_clinical_trial_metadata(mif_metadata, ts_metadata)

    # Don't profile this a second time, since we're only interested
    # in how long it takes to merge the shipping manifest data into
    # existing trial metadata
    he_template = Template.from_type("h_and_e")
    he_spreadsheet, _ = XlTemplateReader.from_excel(he_path)
    he_metadata, _, _ = prismify(he_spreadsheet, he_template)

    with profiling("5_merge_h_and_e_metadata_into_trial", outdir):
        merge_clinical_trial_metadata(he_metadata, combined_metadata)
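# `profiling(...)` is used above but not defined in this section. A minimal
# sketch of what such a context manager might look like, assuming it dumps
# cProfile stats named after the given label into `outdir` (the real helper
# may differ):
import cProfile
import os
from contextlib import contextmanager

@contextmanager
def profiling(label: str, outdir: str):
    """Profile the enclosed block and write stats to <outdir>/<label>.prof (sketch)."""
    profiler = cProfile.Profile()
    profiler.enable()
    try:
        yield
    finally:
        profiler.disable()
        profiler.dump_stats(os.path.join(outdir, f"{label}.prof"))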
def test_valid_from_excel(tiny_template):
    """Test that the reader can load from a small xlsx file"""
    # the fixture also contains spaces in both names and values, for both #preamble and #data rows
    tiny_xlsx = os.path.join(TEST_DATA_DIR, "tiny_valid_manifest.xlsx")
    reader, errs = XlTemplateReader.from_excel(tiny_xlsx)
    assert not errs
    assert reader.validate(tiny_template)
    assert reader.template["TEST_SHEET"][0].values[1] == "foo"
    assert reader.template["TEST_SHEET"][-1].values[0] == "foo"
def test_template(schema_path, xlsx_path, tmpdir):
    """
    Ensure the template schema generates a spreadsheet that looks like the given example,
    and check that the template example is valid.
    """
    # Load the template and write it to a temporary file
    template = Template.from_json(schema_path, SCHEMA_DIR)
    p = tmpdir.join('test_output.xlsx')
    template.to_excel(p)
    generated_template = XlTemplateReader.from_excel(p)

    # Ensure the xlsx file actually exists
    assert os.path.exists(
        xlsx_path), f'No example Excel template provided for {schema_path}'
    reference_template = XlTemplateReader.from_excel(xlsx_path)

    # Check that both templates have the same fields
    compare_templates(schema_path, generated_template, reference_template)

    # Validate the Excel template
    assert reference_template.validate(template)
def test_valid(tiny_template):
    """Test that a known-valid spreadsheet is considered valid"""
    tiny_valid = {
        "TEST_SHEET": [
            TemplateRow(1, RowType.PREAMBLE, ("test_property", "foo")),
            TemplateRow(2, RowType.PREAMBLE, ("test_date", "6/11/12")),
            TemplateRow(3, RowType.PREAMBLE, ("test_time", "10:45:01")),
            TemplateRow(4, RowType.PREAMBLE, ("test_number", "432.1")),
            TemplateRow(5, RowType.PREAMBLE, ("test_enum", "enum_val_1")),
            TemplateRow(
                6,
                RowType.HEADER,
                ("test_property", "test_date", "test_time", "test_number", "test_enum"),
            ),
            TemplateRow(7, RowType.DATA, ("foo", "6/11/12", "10:45:01", "4.11", "enum_val_1")),
            TemplateRow(8, RowType.DATA, ("foo", "6/12/12", "10:46:01", "5.11", "enum_val_2")),
        ]
    }
    reader = XlTemplateReader(tiny_valid)
    assert reader.validate(tiny_template)
def test_template(template, template_example, template_example_xlsx_path, tmpdir):
    """
    Ensure the template schema generates a spreadsheet that looks like the given example,
    and check that the template example is valid.
    """
    # write template to a temporary file
    p = tmpdir.join("test_output.xlsx")
    template.to_excel(p)
    generated_template, err = XlTemplateReader.from_excel(p)
    assert not err

    reference_template = template_example

    # Check that both templates have the same fields
    compare_templates(template.type, generated_template, reference_template)

    # Validate the Excel template
    assert reference_template.validate(template)

    # Ensure the example Excel template isn't valid as any other template
    for other_template_type in _TEMPLATE_PATH_MAP:
        if other_template_type == template.type:
            # don't check it against itself
            continue
        elif (
            other_template_type.startswith("cytof_")
            and other_template_type.endswith("_analysis")
            and template.type.startswith("cytof_")
            and template.type.endswith("_analysis")
        ):
            # cytof_<trial>_analysis templates might cross-validate, which is fine
            continue
        other_template = Template.from_type(other_template_type)
        with pytest.raises(ValidationError):
            other_template.validate_excel(template_example_xlsx_path)

    # Ensure that the data dictionary tab in this template doesn't have empty columns
    generated_xlsx = openpyxl.load_workbook(p)
    data_dict_ws = generated_xlsx[XlTemplateWriter._data_dict_sheet_name]
    for col in data_dict_ws.iter_cols(min_col=2, max_col=50, max_row=10, values_only=True):
        [header, *values] = col
        if header is None:
            break
        assert any(val is not None for val in values)
def test_missing_required_value(tiny_template):
    """Test that a spreadsheet with a missing value for a required field raises a validation error"""
    tiny_missing_value = {
        'TEST_SHEET': [
            (RowType.HEADER, 'test_property', 'test_date', 'test_time'),
            (RowType.DATA, None, '6/11/12', '10:44:61'),
        ]
    }

    # tiny_template has no required fields, so this should be valid
    assert XlTemplateReader(tiny_missing_value).validate(tiny_template)

    # add a required field
    tiny_template.template_schema['required'] = ['test_property']
    search_error_message(tiny_missing_value, tiny_template, ValidationError,
                         'empty value for required field')
def stage_assay_for_analysis(template_type):
    """
    Simulates an initial assay upload by prismifying the initial assay template object.
    """
    staging_map = {
        "cytof_analysis": "cytof",
        "tumor_normal_pairing": "wes_fastq",
    }

    if template_type not in staging_map:
        return {}

    prelim_assay = staging_map[template_type]

    preassay_xlsx_path = os.path.join(
        TEMPLATE_EXAMPLES_DIR, prelim_assay + "_template.xlsx"
    )
    preassay_xlsx, _ = XlTemplateReader.from_excel(preassay_xlsx_path)
    preassay_template = Template.from_type(prelim_assay)
    prism_res = core.prismify(preassay_xlsx, preassay_template)

    return prism_patch_stage_artifacts(prism_res, prelim_assay)
def filepath_gen(xlsx_path: str, schema: dict, assay_hint: str, verb: bool = False):
    """
    A python generator which yields the paths of the local files we expect to
    receive alongside the supplied metadata xlsx file.

    There is bespoke assay-specific logic encoded in this function, and it will
    likely change if conventions change around which files are expected in a
    given folder or which files an assay expects.

    Args:
        xlsx_path: path on the file system to the excel file.
        schema: json schema with all refs resolved
        assay_hint: string used to help identify properties in the template. Must be
            the root of the template filename, i.e. wes_template.json would be wes.
        verb: boolean indicating verbosity

    Returns:
        None, data_obj is modified in place
    """

    # get the unresolved schema
    template_path = os.path.join(TEMPLATE_DIR, 'metadata', f'{assay_hint}_template.json')
    with open(template_path) as fin:
        schema = json.load(fin)

    # find the key in the schema; this notation is the recommended usage of
    # deepdiff's grep, which overloads the pipe operator to mimic the command line
    schema_key = 'artifact_link'
    ds = schema | grep(schema_key)
    if 'matched_paths' not in ds:
        raise KeyError(f'{schema_key} not found in schema')

    # sort potential matches; the shortest is what we want
    choices = sorted(ds['matched_paths'], key=len)

    # create tuples
    key_lu = {}
    for c in choices:
        # get the value and parent of the file link
        val, pkey = _deep_get(schema, c)
        pkey = pkey.upper()
        key_lu[pkey] = val

    def _do_stuff(key, val, lu):
        if key in lu:
            # build the accession key
            tmp = lu[key][1]
            print(tmp)
            gs_key = tmp["lead_organization_study_id"]
            gs_key = f'{gs_key}/{tmp["cimac_participant_id"]}'
            gs_key = f'{gs_key}/{tmp["cimac_sample_id"]}'
            gs_key = f'{gs_key}/{tmp["cimac_aliquot_id"]}'
            #print("stuff", key, val, lu[key])
            print(gs_key)

    # read the excel file
    t = XlTemplateReader.from_excel(xlsx_path)

    # loop over the spreadsheet
    worksheet_names = t.grouped_rows.keys()
    for name in worksheet_names:

        # get the worksheet
        ws = t.grouped_rows[name]

        # process preamble rows
        for row in ws[RowType.PREAMBLE]:
            _do_stuff(row[0], row[1], key_lu)

        # move to headers
        headers = ws[RowType.HEADER][0]

        # get the data
        data = ws[RowType.DATA]
        for row in data:

            # process each property in the row
            for key, val in zip(headers, row):
                _do_stuff(key, val, key_lu)
def wrapped(*args, **kwargs):
    logger.info(f"upload_handler({f.__name__}) started")
    template, xlsx_file = extract_schema_and_xlsx(allowed_types)

    errors_so_far = []

    try:
        xlsx, errors = XlTemplateReader.from_excel(xlsx_file)
    except SchemasValidationError as e:
        raise BadRequest({"errors": [str(e)]})
    logger.info(f"xlsx parsed: {len(errors)} errors")
    log_multiple_errors(errors)
    errors_so_far.extend(errors)

    # Run basic validations on the provided Excel file
    validations = validate(template, xlsx)
    logger.info(f"xlsx validated: {len(validations.json['errors'])} errors")
    log_multiple_errors(validations.json["errors"])
    errors_so_far.extend(validations.json["errors"])

    md_patch, file_infos, errors = prism.prismify(xlsx, template)
    logger.info(f"prismified: {len(errors)} errors, {len(file_infos)} file_infos")
    log_multiple_errors(errors)
    errors_so_far.extend(errors)

    try:
        trial_id = md_patch[prism.PROTOCOL_ID_FIELD_NAME]
    except KeyError:
        errors_so_far.append(f"{prism.PROTOCOL_ID_FIELD_NAME} field not found.")
        # we can't find the trial id, so we can't proceed
        raise BadRequest({"errors": [str(e) for e in errors_so_far]})

    trial = TrialMetadata.find_by_trial_id(trial_id)
    if not trial:
        errors_so_far.insert(
            0,
            f"Trial with {prism.PROTOCOL_ID_FIELD_NAME}={trial_id!r} not found.",
        )
        # we can't find the trial, so we can't proceed with check_perm or merge
        raise BadRequest({"errors": [str(e) for e in errors_so_far]})

    user = get_current_user()
    try:
        check_permissions(user, trial_id, template.type)
    except Unauthorized as e:
        errors_so_far.insert(0, e.description)
        # unauthorized to pull the trial, so we can't proceed with the merge
        raise Unauthorized({"errors": [str(e) for e in errors_so_far]})

    # Try to merge the assay metadata into the existing clinical trial metadata.
    # We ignore the result, as we only want to check that there are no validation errors.
    try:
        merged_md, errors = prism.merge_clinical_trial_metadata(
            md_patch, trial.metadata_json
        )
    except ValidationError as e:
        errors_so_far.append(json_validation.format_validation_error(e))
    except prism.MergeCollisionException as e:
        errors_so_far.append(str(e))
    except prism.InvalidMergeTargetException as e:
        # we have invalid metadata stored in the db - users can't do anything about it,
        # so we log it
        logger.error(f"Internal error with trial {trial_id!r}\n{e}")
        # and return an error. Though it's an internal server error rather than a
        # bad request, we report it as a BadRequest so it will be displayed to the user
        raise BadRequest(
            f"Internal error with {trial_id!r}. Please contact a CIDC Administrator."
        ) from e
    logger.info(f"merged: {len(errors)} errors")
    log_multiple_errors(errors)
    errors_so_far.extend(errors)

    if errors_so_far:
        raise BadRequest({"errors": [str(e) for e in errors_so_far]})

    return f(
        user,
        trial,
        template.type,
        xlsx_file,
        md_patch,
        file_infos,
        *args,
        **kwargs,
    )
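# `wrapped` above is the inner function of an upload-handler decorator: it
# closes over `f` (the wrapped endpoint) and `allowed_types`. A minimal sketch
# of the surrounding decorator shape this implies (names are assumptions
# inferred from the body above, not the actual implementation):
from functools import wraps

def upload_handler(allowed_types):
    def decorator(f):
        @wraps(f)
        def wrapped(*args, **kwargs):
            ...  # body as shown above
        return wrapped
    return decorator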
def test_valid_from_excel(tiny_template):
    """Test that the reader can load from a small xlsx file"""
    tiny_xlsx = os.path.join(TEST_DATA_DIR, 'tiny_manifest.xlsx')
    reader = XlTemplateReader.from_excel(tiny_xlsx)
    assert reader.validate(tiny_template)
def test_write_wes_analysis_functions(tmpdir):
    mkdir(tmpdir.join("secret"))
    test_output_folder = tmpdir
    test_folder_func = lambda tumor: f"gs://{tumor}"
    test_path_generator = lambda tumor: tmpdir.join("secret").join(f"{tumor}.xlsx")

    test_output_path = tmpdir.join("/bar.xlsx")
    test_protocol_identifier = "test-id"
    test_folder = "gs://foo"
    test_normal, test_tumor = "bar", "baz"

    for i in range(3):
        if i == 0:
            write_wes_analysis_template(
                outfile_path=test_output_path,
                protocol_identifier=test_protocol_identifier,
                folder=test_folder,
                normal_cimac_id=test_normal,
                tumor_cimac_id=test_tumor,
            )
            reader, errs = XlTemplateReader.from_excel(test_output_path)
        elif i == 1:
            write_wes_analysis_batch(
                output_folder=test_output_folder,
                protocol_identifier=test_protocol_identifier,
                folder=test_folder,
                normal_tumor_pairs=[(test_normal, test_tumor)],
            )
            reader, errs = XlTemplateReader.from_excel(
                f"{test_output_folder}/{test_tumor}_template.xlsx"
            )
        elif i == 2:
            write_wes_analysis_batch(
                output_folder=test_output_folder,
                protocol_identifier=test_protocol_identifier,
                folder=test_folder_func,
                normal_tumor_pairs=[(test_normal, test_tumor)],
                path_generator=test_path_generator,
            )
            reader, errs = XlTemplateReader.from_excel(test_path_generator(test_tumor))

        assert len(errs) == 0, str(errs)

        patch_ct, files, errs = prismify(reader, wes_analysis_template)
        assert len(errs) == 0, str(errs)
        assert patch_ct.get(PROTOCOL_ID_FIELD_NAME) == test_protocol_identifier

        runs = patch_ct.get("analysis", {}).get("wes_analysis", {}).get("pair_runs", [])
        assert len(runs) == 1

        run = runs[0]
        assert run.get("run_id") == test_tumor
        assert run.get("normal", {}).get("cimac_id") == test_normal
        assert run.get("tumor", {}).get("cimac_id") == test_tumor

        for file in files:  # LocalFileUploadEntry
            if i == 2:
                assert file.local_path.startswith(test_folder_func(test_tumor))
            else:
                assert file.local_path.startswith(test_folder)
def search_error_message(workbook, template, error, msg_fragment):
    reader = XlTemplateReader(workbook)
    with pytest.raises(error, match=msg_fragment):
        reader.validate(template)
def test_invalid_from_excel(tiny_template):
    """Test that errors from a known-invalid xlsx file are surfaced by the reader"""
    tiny_xlsx = os.path.join(TEST_DATA_DIR, "tiny_invalid_manifest.xlsx")
    reader, errs = XlTemplateReader.from_excel(tiny_xlsx)
    assert len(errs) == 1
    assert "expected 2" in errs[0]
def prismify(xlsx_path: str, template_path: str, assay_hint: str = "", verb: bool = False) -> (dict, dict):
    """
    Converts an excel file to a json object. It also identifies local files
    which need to be uploaded to a google bucket and provides some logic
    to help build the bucket url.

    e.g. file list
    [
        {
            'local_path': '/path/to/fwd.fastq',
            'gs_key': '10021/Patient_1/sample_1/aliquot_1/wes_forward.fastq'
        }
    ]

    Args:
        xlsx_path: path on the file system to the excel file.
        template_path: path on the file system, relative to the schema root, of the template
        assay_hint: string used to help identify properties in the template. Must be
            the root of the template filename, i.e. wes_template.json would be wes.
        verb: boolean indicating verbosity

    Returns:
        (tuple):
            arg1: clinical trial object with data parsed from the spreadsheet
            arg2: list of objects which describe each file identified.
    """

    # get the schema and validator
    validator = load_and_validate_schema("clinical_trial.json", return_validator=True)
    schema = validator.schema

    # this lets us look up xlsx-to-schema keys
    key_lu = _load_keylookup(template_path)

    # this helps us identify file paths in the xlsx
    fp_lu = _build_fplu(assay_hint)

    # add a special key to track the files
    fp_lu['special'] = list()

    # read the excel file
    t = XlTemplateReader.from_excel(xlsx_path)

    # create the root dictionary
    root = {}
    data_rows = []

    # loop over the spreadsheet
    worksheet_names = t.grouped_rows.keys()
    for name in worksheet_names:

        # get the worksheet
        ws = t.grouped_rows[name]

        # process preamble rows
        for row in ws[RowType.PREAMBLE]:
            _process_property(row, key_lu, schema, root, assay_hint, fp_lu, verb)

        # move to headers
        headers = ws[RowType.HEADER][0]

        # get the data
        data = ws[RowType.DATA]
        for row in data:

            # create a dictionary per row
            curd = copy.deepcopy(root)
            for key, val in zip(headers, row):

                # process this property
                _process_property([key, val], key_lu, schema, curd, assay_hint, fp_lu, verb)

            # save the entry
            data_rows.append(curd)

    # create the merger
    merger = Merger(schema)

    # iteratively merge the per-row objects
    cur_obj = data_rows[0]
    for i in range(1, len(data_rows)):
        cur_obj = merger.merge(cur_obj, data_rows[i])

    # return the merged object and the identified files
    return cur_obj, fp_lu['special']
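# A hedged usage sketch for prismify; the paths and assay hint below are
# illustrative placeholders, not real repo fixtures. The shape of each file
# entry ({'local_path': ..., 'gs_key': ...}) comes from the docstring above.
def _example_prismify_usage():
    metadata, file_list = prismify(
        "wes_manifest.xlsx",                     # hypothetical spreadsheet path
        "templates/metadata/wes_template.json",  # hypothetical template path
        assay_hint="wes",
    )
    # each entry pairs a local file with the bucket key it should be uploaded under
    for entry in file_list:
        print(entry["local_path"], "->", entry["gs_key"])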