Python XlTemplateReader Examples, cidc_schemas.template_reader.XlTemplateReader Python Examples

Example #1

0

Show file

def test_missing_row_type(tmp_path):
    """Test that we get an error on a spreadsheet row that has data but lacks a row type annotation"""
    path = f"{tmp_path}/test.xlsx"
    wb = Workbook()
    wb["Sheet"]["B1"] = "some data"
    wb.save(path)

    with pytest.raises(ValidationError,
                       match="No recognized row type found in row Sheet/1"):
        XlTemplateReader.from_excel(path)

Example #2

0

Show file

File: test_template_reader.py Project: curlup/cidc-schemas

def test_valid(tiny_template):
    """Test that a known-valid spreadsheet is considered valid"""
    tiny_valid = {
        'TEST_SHEET': [
            (RowType.PREAMBLE, 'test_property', 'foo'),
            (RowType.PREAMBLE, 'test_date', '6/11/12'),
            (RowType.PREAMBLE, 'test_time', '10:44:61'),
            (RowType.HEADER, 'test_property', 'test_date', 'test_time'),
            (RowType.DATA, 'foo', '6/11/12', '10:44:61'),
            (RowType.DATA, 'foo', '6/12/12', '10:45:61')
        ]
    }

    reader = XlTemplateReader(tiny_valid)
    assert reader.validate(tiny_template)

Example #3

0

Show file

def template_example(template, template_example_xlsx_path):
    # Ensure the xlsx file actually exists
    assert os.path.exists(
        template_example_xlsx_path
    ), f"No example Excel template provided for {template.type}"
    reference, err = XlTemplateReader.from_excel(template_example_xlsx_path)
    assert not err
    return reference

Example #4

0

Show file

def run(ts_path: str, mif_path: str, he_path: str, outdir: str):
    """Run and profile a typical metadata validation and merging workload."""
    set_prism_encrypt_key("foobar")

    with profiling("1_prismify_tissue_slide_shipping_manifest", outdir):
        ts_template = Template.from_type("tissue_slide")
        ts_spreadsheet, _ = XlTemplateReader.from_excel(ts_path)
        ts_metadata, _, _ = prismify(ts_spreadsheet, ts_template)
        ts_metadata["allowed_cohort_names"] = ["Not_reported"]
        ts_metadata["allowed_collection_event_names"] = ["Baseline"]

    with profiling("2_prismify_mif_assay_metadata_spreadsheet", outdir):
        mif_template = Template.from_type("mif")
        mif_spreadsheet, _ = XlTemplateReader.from_excel(mif_path)
        mif_metadata, files, _ = prismify(mif_spreadsheet, mif_template)

    with profiling("3_merge_mif_assay_artifacts_into_mif_metadata_patch", outdir):
        # tqdm gives us a stdout progress indicator as prism iterates through the array
        artifact_info = tqdm(
            [
                ArtifactInfo(
                    f.upload_placeholder,
                    f"object/url/{f.upload_placeholder}",
                    "",
                    0,
                    "",
                    "abcd",
                )
                for i, f in enumerate(files)
            ]
        )
        mif_metadata, _ = merge_artifacts(mif_metadata, artifact_info)

    with profiling("4_merge_mif_metadata_with_tissue_slide_metadata", outdir):
        combined_metadata, _ = merge_clinical_trial_metadata(mif_metadata, ts_metadata)

    # Don't profile this a second time, since we're only interested
    # in how long it takes to merge the shipping manifest data into
    # existing trial metadata
    he_template = Template.from_type("h_and_e")
    he_spreadsheet, _ = XlTemplateReader.from_excel(he_path)
    he_metadata, _, _ = prismify(he_spreadsheet, he_template)

    with profiling("5_merge_h_and_e_metadata_into_trial", outdir):
        merge_clinical_trial_metadata(he_metadata, combined_metadata)

Example #5

0

Show file

def test_valid_from_excel(tiny_template):
    """Test that the reader can load from a small xlsx file"""
    # also has spaces in both name and value for both #preamble and #data
    tiny_xlsx = os.path.join(TEST_DATA_DIR, "tiny_valid_manifest.xlsx")
    reader, errs = XlTemplateReader.from_excel(tiny_xlsx)
    assert not errs
    assert reader.validate(tiny_template)
    assert reader.template["TEST_SHEET"][0].values[1] == "foo"
    assert reader.template["TEST_SHEET"][-1].values[0] == "foo"

Example #6

0

Show file

File: test_templates.py Project: curlup/cidc-schemas

def test_template(schema_path, xlsx_path, tmpdir):
    """
    Ensure the template schema generates a spreadsheet that looks like the given example,
    and check that the template example is valid.
    """

    # Load the template and write it to a temporary file
    template = Template.from_json(schema_path, SCHEMA_DIR)
    p = tmpdir.join('test_output.xlsx')
    template.to_excel(p)
    generated_template = XlTemplateReader.from_excel(p)

    # Ensure the xlsx file actually exists
    assert os.path.exists(
        xlsx_path), f'No example Excel template provided for {schema_path}'
    reference_template = XlTemplateReader.from_excel(xlsx_path)

    # Check that both templates have the same fields
    compare_templates(schema_path, generated_template, reference_template)

    # Validate the Excel template
    assert reference_template.validate(template)

Example #7

0

Show file

def test_valid(tiny_template):
    """Test that a known-valid spreadsheet is considered valid"""
    tiny_valid = {
        "TEST_SHEET": [
            TemplateRow(1, RowType.PREAMBLE, ("test_property", "foo")),
            TemplateRow(2, RowType.PREAMBLE, ("test_date", "6/11/12")),
            TemplateRow(3, RowType.PREAMBLE, ("test_time", "10:45:01")),
            TemplateRow(3, RowType.PREAMBLE, ("test_number", "432.1")),
            TemplateRow(3, RowType.PREAMBLE, ("test_enum", "enum_val_1")),
            TemplateRow(
                4,
                RowType.HEADER,
                ("test_property", "test_date", "test_time", "test_number",
                 "test_enum"),
            ),
            TemplateRow(5, RowType.DATA,
                        ("foo", "6/11/12", "10:45:01", "4.11", "enum_val_1")),
            TemplateRow(6, RowType.DATA,
                        ("foo", "6/12/12", "10:46:01", "5.11", "enum_val_2")),
        ]
    }

    reader = XlTemplateReader(tiny_valid)
    assert reader.validate(tiny_template)

Example #8

0

Show file

def test_template(template, template_example, template_example_xlsx_path,
                  tmpdir):
    """
    Ensure the template schema generates a spreadsheet that looks like the given example,
    and check that the template example is valid.
    """

    # write template to a temporary file
    p = tmpdir.join("test_output.xlsx")
    template.to_excel(p)
    generated_template, err = XlTemplateReader.from_excel(p)
    assert not err

    reference_template = template_example

    # Check that both templates have the same fields
    compare_templates(template.type, generated_template, reference_template)

    # Validate the Excel template
    assert reference_template.validate(template)

    # Ensure the example Excel template isn't valid as any other template
    for other_template_type in _TEMPLATE_PATH_MAP:
        if other_template_type == template.type:
            # don't check it against itself
            continue
        elif (other_template_type.startswith("cytof_")
              and other_template_type.endswith("_analysis")
              and template.type.startswith("cytof_")
              and template.type.endswith("_analysis")):
            # cytof_<trial>_analysis might cross validate which is fine
            continue

        other_template = Template.from_type(other_template_type)
        with pytest.raises(ValidationError):
            other_template.validate_excel(template_example_xlsx_path)

    # Ensure that the data dictionary tab in this template doesn't have empty columns
    generated_xlsx = openpyxl.load_workbook(p)
    data_dict_ws = generated_xlsx[XlTemplateWriter._data_dict_sheet_name]
    for col in data_dict_ws.iter_cols(min_col=2,
                                      max_col=50,
                                      max_row=10,
                                      values_only=True):
        [header, *values] = col
        if header is None:
            break
        assert any(val is not None for val in values)

Example #9

0

Show file

File: test_template_reader.py Project: curlup/cidc-schemas

def test_missing_required_value(tiny_template):
    """Test that spreadsheet with a missing value marked required raises a validation error"""
    tiny_missing_value = {
        'TEST_SHEET': [
            (RowType.HEADER, 'test_property', 'test_date', 'test_time'),
            (RowType.DATA, None, '6/11/12', '10:44:61'),
        ]
    }

    # tiny_template has no required fields, so this should be valid
    assert XlTemplateReader(tiny_missing_value).validate(tiny_template)

    # add a required field
    tiny_template.template_schema['required'] = ['test_property']
    search_error_message(tiny_missing_value,
                         tiny_template, ValidationError, 'empty value for required field')

Example #10

0

Show file

File: test_pipelines.py Project: CIMAC-CIDC/cidc-schemas

def stage_assay_for_analysis(template_type):
    """
    Simulates an initial assay upload by prismifying the initial assay template object.
    """

    staging_map = {
        "cytof_analysis": "cytof",
        "tumor_normal_pairing": "wes_fastq",
    }

    if not template_type in staging_map:
        return {}

    prelim_assay = staging_map[template_type]

    preassay_xlsx_path = os.path.join(
        TEMPLATE_EXAMPLES_DIR, prelim_assay + "_template.xlsx"
    )
    preassay_xlsx, _ = XlTemplateReader.from_excel(preassay_xlsx_path)
    preassay_template = Template.from_type(prelim_assay)
    prism_res = core.prismify(preassay_xlsx, preassay_template)

    return prism_patch_stage_artifacts(prism_res, prelim_assay)

Example #11

0

Show file

def filepath_gen(xlsx_path: str,
                 schema: dict,
                 assay_hint: str,
                 verb: bool = False):
    """
    This is a python generator which yields the paths of local files we are expecting 
    to recieve alongsdie the supplied metadata xlsx file.

    There is bespoke assay specific logic encoded in this function and it will
    likely change if conventions around what files are expected in a given 
    folder, or what files an assay is expecting.

    Args:
        xlsx_path: file on file system to excel file.
        schema: json schema with all ref resolved
        assay_hint: string used to help idnetify properties in template. Must 
                    be the the root of the template filename i.e. 
                    wes_template.json would be wes.
        verb: boolean indicating verbosity

    Returns:
        None, data_obj is modified in place
    """

    # get the un resolved schema
    template_path = os.path.join(TEMPLATE_DIR, 'metadata',
                                 f'{assay_hint}_template.json')
    with open(template_path) as fin:
        schema = json.load(fin)

    # find key in the schema, this notation is
    # recommended usage of deepdif grep. assuming they
    # overload the pipe operator to simulate cmd line
    schema_key = 'artifact_link'
    ds = schema | grep(schema_key)
    if 'matched_paths' not in ds:
        raise KeyError(f'{schema_key} not found in schema')

    # sort potential matches, shortest is what we want.
    choices = sorted(ds['matched_paths'], key=len)

    # create tuples
    key_lu = {}
    for c in choices:

        # get the value and parent of the file link.
        val, pkey = _deep_get(schema, c)
        pkey = pkey.upper()
        key_lu[pkey] = val

    def _do_stuff(key, val, lu):
        if key in lu:
            # make the accession key
            tmp = lu[key][1]
            print(tmp)
            gs_key = tmp["lead_organization_study_id"]
            gs_key = f'{gs_key}/{tmp["cimac_participant_id"]}'
            gs_key = f'{gs_key}/{tmp["cimac_sample_id"]}'
            gs_key = f'{gs_key}/{tmp["cimac_aliquot_id"]}'
            #print("stuff", key, val, lu[key])
            print(gs_key)

    # read the excel file
    t = XlTemplateReader.from_excel(xlsx_path)

    # loop over spreadsheet
    worksheet_names = t.grouped_rows.keys()
    for name in worksheet_names:

        # get the worksheat.
        ws = t.grouped_rows[name]

        # Compare preamble rows
        for row in ws[RowType.PREAMBLE]:

            _do_stuff(row[0], row[1], key_lu)

        # move to headers
        headers = ws[RowType.HEADER][0]

        # get the data.
        data = ws[RowType.DATA]
        for row in data:

            # create dictionary per row
            for key, val in zip(headers, row):

                _do_stuff(key, val, key_lu)

Example #12

0

Show file

File: upload_jobs.py Project: CIMAC-CIDC/cidc-api-gae

        def wrapped(*args, **kwargs):
            logger.info(f"upload_handler({f.__name__}) started")
            template, xlsx_file = extract_schema_and_xlsx(allowed_types)

            errors_so_far = []

            try:
                xlsx, errors = XlTemplateReader.from_excel(xlsx_file)
            except SchemasValidationError as e:
                raise BadRequest({"errors": [str(e)]})
            logger.info(f"xlsx parsed: {len(errors)} errors")
            log_multiple_errors(errors)
            errors_so_far.extend(errors)

            # Run basic validations on the provided Excel file
            validations = validate(template, xlsx)
            logger.info(f"xlsx validated: {len(validations.json['errors'])} errors")
            log_multiple_errors(validations.json["errors"])
            errors_so_far.extend(validations.json["errors"])

            md_patch, file_infos, errors = prism.prismify(xlsx, template)
            logger.info(
                f"prismified: {len(errors)} errors, {len(file_infos)} file_infos"
            )
            log_multiple_errors(errors)
            errors_so_far.extend(errors)

            try:
                trial_id = md_patch[prism.PROTOCOL_ID_FIELD_NAME]
            except KeyError:
                errors_so_far.append(f"{prism.PROTOCOL_ID_FIELD_NAME} field not found.")
                # we can't find trial id so we can't proceed
                raise BadRequest({"errors": [str(e) for e in errors_so_far]})

            trial = TrialMetadata.find_by_trial_id(trial_id)
            if not trial:
                errors_so_far.insert(
                    0,
                    f"Trial with {prism.PROTOCOL_ID_FIELD_NAME}={trial_id!r} not found.",
                )
                # we can't find trial so we can't proceed trying to check_perm or merge
                raise BadRequest({"errors": [str(e) for e in errors_so_far]})

            user = get_current_user()
            try:
                check_permissions(user, trial_id, template.type)
            except Unauthorized as e:
                errors_so_far.insert(0, e.description)
                # unauthorized to pull trial so we can't proceed trying to merge
                raise Unauthorized({"errors": [str(e) for e in errors_so_far]})

            # Try to merge assay metadata into the existing clinical trial metadata
            # Ignoring result as we only want to check there's no validation errors
            try:
                merged_md, errors = prism.merge_clinical_trial_metadata(
                    md_patch, trial.metadata_json
                )
            except ValidationError as e:
                errors_so_far.append(json_validation.format_validation_error(e))
            except prism.MergeCollisionException as e:
                errors_so_far.append(str(e))
            except prism.InvalidMergeTargetException as e:
                # we have an invalid MD stored in db - users can't do anything about it.
                # So we log it
                logger.error(f"Internal error with trial {trial_id!r}\n{e}")
                # and return an error. Though it's not BadRequest but rather an
                # Internal Server error we report it like that, so it will be displayed
                raise BadRequest(
                    f"Internal error with {trial_id!r}. Please contact a CIDC Administrator."
                ) from e
            logger.info(f"merged: {len(errors)} errors")
            log_multiple_errors(errors)
            errors_so_far.extend(errors)

            if errors_so_far:
                raise BadRequest({"errors": [str(e) for e in errors_so_far]})

            return f(
                user,
                trial,
                template.type,
                xlsx_file,
                md_patch,
                file_infos,
                *args,
                **kwargs,
            )

Example #13

0

Show file

File: test_template_reader.py Project: curlup/cidc-schemas

def test_valid_from_excel(tiny_template):
    """Test that the reader can load from a small xlsx file"""
    tiny_xlsx = os.path.join(TEST_DATA_DIR, 'tiny_manifest.xlsx')
    reader = XlTemplateReader.from_excel(tiny_xlsx)
    assert reader.validate(tiny_template)

Example #14

0

Show file

def test_write_wes_analysis_functions(tmpdir):
    mkdir(tmpdir.join("secret"))

    test_output_folder = tmpdir
    test_folder_func = lambda tumor: f"gs://{tumor}"
    test_path_generator = lambda tumor: tmpdir.join("secret").join(
        f"{tumor}.xlsx")

    test_output_path = tmpdir.join("/bar.xlsx")
    test_protocol_identifier = "test-id"
    test_folder = "gs://foo"
    test_normal, test_tumor = "bar", "baz"

    for i in range(3):
        if i == 0:
            write_wes_analysis_template(
                outfile_path=test_output_path,
                protocol_identifier=test_protocol_identifier,
                folder=test_folder,
                normal_cimac_id=test_normal,
                tumor_cimac_id=test_tumor,
            )
            reader, errs = XlTemplateReader.from_excel(test_output_path)
        elif i == 1:
            write_wes_analysis_batch(
                output_folder=test_output_folder,
                protocol_identifier=test_protocol_identifier,
                folder=test_folder,
                normal_tumor_pairs=[(test_normal, test_tumor)],
            )
            reader, errs = XlTemplateReader.from_excel(
                f"{test_output_folder}/{test_tumor}_template.xlsx")
        elif i == 2:
            write_wes_analysis_batch(
                output_folder=test_output_folder,
                protocol_identifier=test_protocol_identifier,
                folder=test_folder_func,
                normal_tumor_pairs=[(test_normal, test_tumor)],
                path_generator=test_path_generator,
            )
            reader, errs = XlTemplateReader.from_excel(
                test_path_generator(test_tumor))

        assert len(errs) == 0, str(errs)

        patch_ct, files, errs = prismify(reader, wes_analysis_template)
        assert len(errs) == 0, str(errs)

        assert patch_ct.get(PROTOCOL_ID_FIELD_NAME) == test_protocol_identifier
        runs = patch_ct.get("analysis", {}).get("wes_analysis",
                                                {}).get("pair_runs", [])
        assert len(runs) == 1
        run = runs[0]
        assert run.get("run_id") == test_tumor
        assert run.get("normal", {}).get("cimac_id") == test_normal
        assert run.get("tumor", {}).get("cimac_id") == test_tumor

        for file in files:  # LocalFileUploadEntry
            if i == 2:
                assert file.local_path.startswith(test_folder_func(test_tumor))
            else:
                assert file.local_path.startswith(test_folder)

Example #15

0

Show file

def search_error_message(workbook, template, error, msg_fragment):
    reader = XlTemplateReader(workbook)
    with pytest.raises(error, match=msg_fragment):
        res = reader.validate(template)

Example #16

0

Show file

def test_invalid_from_excel(tiny_template):
    tiny_xlsx = os.path.join(TEST_DATA_DIR, "tiny_invalid_manifest.xlsx")
    reader, errs = XlTemplateReader.from_excel(tiny_xlsx)
    assert len(errs) == 1
    assert "expected 2" in errs[0]

Example #17

0

Show file

def prismify(xlsx_path: str,
             template_path: str,
             assay_hint: str = "",
             verb: bool = False) -> (dict, dict):
    """
    Converts excel file to json object. It also identifies local files
    which need to uploaded to a google bucket and provides some logic
    to help build the bucket url.

    e.g. file list
    [
        {
            'local_path': '/path/to/fwd.fastq', 
            'gs_key': '10021/Patient_1/sample_1/aliquot_1/wes_forward.fastq'
        }
    ]


    Args:
        xlsx_path: file on file system to excel file.
        template_path: path on file system relative to schema root of the 
                        temaplate
                
        assay_hint: string used to help idnetify properties in template. Must 
                    be the the root of the template filename i.e. 
                    wes_template.json would be wes.
        verb: boolean indicating verbosity

    Returns:
        (tuple):
            arg1: clinical trial object with data parsed from spreadsheet
            arg2: list of objects which describe each file identified.
    """

    # get the schema and validator
    validator = load_and_validate_schema("clinical_trial.json",
                                         return_validator=True)
    schema = validator.schema

    # this lets us lookup xlsx-to-schema keys
    key_lu = _load_keylookup(template_path)

    # this helps us identify file paths in xlsx
    fp_lu = _build_fplu(assay_hint)

    # add a special key to track the files
    fp_lu['special'] = list()

    # read the excel file
    t = XlTemplateReader.from_excel(xlsx_path)

    # create the root dictionary.
    root = {}
    data_rows = []

    # loop over spreadsheet
    worksheet_names = t.grouped_rows.keys()
    for name in worksheet_names:

        # get the worksheat.
        ws = t.grouped_rows[name]

        # Compare preamble rows
        for row in ws[RowType.PREAMBLE]:

            # process this property
            _process_property(row, key_lu, schema, root, assay_hint, fp_lu,
                              verb)

        # move to headers
        headers = ws[RowType.HEADER][0]

        # get the data.
        data = ws[RowType.DATA]
        for row in data:

            # create dictionary per row
            curd = copy.deepcopy(root)
            for key, val in zip(headers, row):

                # process this property
                _process_property([key, val], key_lu, schema, curd, assay_hint,
                                  fp_lu, verb)

            # save the entry
            data_rows.append(curd)

    # create the merger
    merger = Merger(schema)

    # iteratively merge.
    cur_obj = data_rows[0]
    for i in range(1, len(data_rows)):
        cur_obj = merger.merge(cur_obj, data_rows[i])

    # return the object.
    return cur_obj, fp_lu['special']