Example #1
0
def test_read_metadata_file_whitespace_stripping():
    """Tests that whitespace is properly stripped when reading a metadata
    file.
    """

    ws = os.path.join(
        "qurro", "tests", "input", "metadata_tests", "whitespace.tsv"
    )
    ws_df = replace_nan(read_metadata_file(ws))

    # The checks below mirror test_read_metadata_file_complex(), with extra
    # attention paid to values whose surrounding whitespace must be stripped.

    # Every column (and the index) should have been read with an object dtype
    assert ws_df.index.dtype == "object"
    assert all(ws_df[col].dtype == "object" for col in ws_df.columns)

    # Index IDs must have been read as strings, not numbers (issue #116 on
    # the Qurro GitHub page)
    assert ws_df.index.equals(
        Index(["01", "02", "03", "04", "05", "06", "07", "08"]))

    # Expected cell values per sample ID, in (Metadata1, Metadata2, Metadata3)
    # order; None indicates a missing value (i.e. the cell was "" or pure
    # whitespace in the file).
    expected_values = {
        "01": (None, None, None),
        # Leading spaces in "MISSING LOL" should be ignored
        "02": ("4", "'5'", "MISSING LOL"),
        "03": (None, "8", "9"),
        "04": ("10", "null", "12"),
        "05": ("13", None, None),
        # Trailing spaces in "17" should be ignored; a whitespace-only cell
        # in Metadata3 should be treated as equivalent to a ""
        "06": ("16", "17", None),
        # A whitespace-only cell in Metadata2 should also map to None
        "07": ("NaN", None, "21"),
        "08": ("22", None, "Infinity"),
    }
    metadata_cols = ("Metadata1", "Metadata2", "Metadata3")
    for sample_id, row in expected_values.items():
        for col, value in zip(metadata_cols, row):
            if value is None:
                assert ws_df.at[sample_id, col] is None
            else:
                assert ws_df.at[sample_id, col] == value

    # The stripped DataFrame should match what qiime2.Metadata produces
    assert_frame_equal(ws_df,
                       replace_nan(qiime2.Metadata.load(ws).to_dataframe()))
def test_read_metadata_file_complex():
    """Tests some corner cases in replace_nan(read_metadata_file())."""

    weird = os.path.join(
        "qurro", "tests", "input", "metadata_tests", "weird_metadata.tsv"
    )
    weird_df = replace_nan(read_metadata_file(weird))

    # Every column (and the index) should have been read with an object dtype
    assert weird_df.index.dtype == "object"
    assert all(weird_df[col].dtype == "object" for col in weird_df.columns)

    # Index IDs must have been read as strings, not numbers (issue #116 on
    # the Qurro GitHub page)
    assert weird_df.index.equals(
        Index(["01", "02", "03", "04", "05", "06", "07", "08"])
    )

    # Check that all of the weird stuff in this file was correctly handled:
    # every "" should have been converted to a None, and everything else
    # (quotes, "null", "NaN", "Infinity", ...) preserved as a literal string.
    # Expected values per sample ID, in (Metadata1, Metadata2, Metadata3)
    # order; None indicates a missing value.
    expected_values = {
        "01": (None, None, None),
        "02": ("4", "'5'", "MISSING LOL"),
        "03": (None, "8", "9"),
        "04": ("10", "null", "12"),
        "05": ("13", None, None),
        "06": ("16", "17", None),
        "07": ("NaN", None, "21"),
        "08": ("22", None, "Infinity"),
    }
    metadata_cols = ("Metadata1", "Metadata2", "Metadata3")
    for sample_id, row in expected_values.items():
        for col, value in zip(metadata_cols, row):
            if value is None:
                assert weird_df.at[sample_id, col] is None
            else:
                assert weird_df.at[sample_id, col] == value

    # The parsed DataFrame should match what qiime2.Metadata produces
    assert_frame_equal(
        weird_df, replace_nan(qiime2.Metadata.load(weird).to_dataframe())
    )
Example #3
0
def test_replace_nan():
    """Tests replace_nan().

    Covers NaNs mixed with ints, mixed with strings, whole-Series /
    whole-DataFrame NaNs, pre-existing Nones, and a custom replacement value.
    """

    # NOTE: np.NaN was removed in NumPy 2.0; np.nan is the spelling that
    # works on every NumPy version, so it's used throughout here.

    # Basic case: other data are ints
    df = DataFrame({"x": [1, np.nan], "y": [3, 4]}, index=["a", "b"])
    dfC = DataFrame({"x": [1, None], "y": [3, 4]}, index=["a", "b"])
    assert_frame_equal(dfC, replace_nan(df), check_dtype=False)

    # Other data are strs
    df2 = DataFrame(
        {"x": ["abc", np.nan], "y": ["ghi", "jkl"]}, index=["a", "b"]
    )
    dfC2 = DataFrame(
        {"x": ["abc", None], "y": ["ghi", "jkl"]}, index=["a", "b"]
    )
    assert_frame_equal(dfC2, replace_nan(df2), check_dtype=False)

    # Entire Series of NaNs
    df3 = DataFrame(
        {"x": [np.nan, np.nan], "y": ["ghi", "jkl"]}, index=["a", "b"]
    )
    dfC3 = DataFrame(
        {"x": [None, None], "y": ["ghi", "jkl"]}, index=["a", "b"]
    )
    assert_frame_equal(dfC3, replace_nan(df3), check_dtype=False)

    # Entire DataFrame of NaNs
    df4 = DataFrame(
        {"x": [np.nan, np.nan], "y": [np.nan, np.nan]}, index=["a", "b"]
    )
    dfC4 = DataFrame({"x": [None, None], "y": [None, None]}, index=["a", "b"])
    assert_frame_equal(dfC4, replace_nan(df4), check_dtype=False)

    # If there are already Nones inside the DF for some reason (should never be
    # the case, but might as well be safe and check this)
    df5 = DataFrame(
        {"x": [np.nan, None], "y": [np.nan, np.nan]}, index=["a", "b"]
    )
    dfC5 = DataFrame({"x": [None, None], "y": [None, None]}, index=["a", "b"])
    assert_frame_equal(dfC5, replace_nan(df5), check_dtype=False)

    # Case where the user specifies an alternate value to replace NaNs with
    df6 = DataFrame(
        {"x": [np.nan, 3], "y": [np.nan, np.nan]}, index=["a", "b"]
    )
    dfC6 = DataFrame({"x": ["lol", 3], "y": ["lol", "lol"]}, index=["a", "b"])
    assert_frame_equal(dfC6, replace_nan(df6, "lol"), check_dtype=False)
Example #4
0
def validate_sample_plot_json(
    biom_table_loc, metadata_loc, sample_json, count_json
):
    """Validates the sample plot JSON produced by Qurro.

    Checks the plot's basic Vega-Lite structure, verifies each sample's
    metadata against the metadata file at metadata_loc, verifies that every
    sample's initial qurro_balance is None, and verifies every
    (feature x sample) count in count_json against the BIOM table at
    biom_table_loc.

    Raises an AssertionError (after printing diagnostic info for metadata
    mismatches) if any check fails.
    """
    assert sample_json["mark"] == {"type": "circle"}
    assert sample_json["title"] == "Samples"
    basic_vegalite_json_validation(sample_json)
    dn = sample_json["data"]["name"]

    # Assert that sample metadata fields are in alphabetical order, ignoring
    # case. As in Qurro's code, this solution for sorting this way is based on
    # this article:
    # https://www.afternerd.com/blog/python-sort-list/#sort-strings-case-insensitive
    sm_fields = sample_json["datasets"]["qurro_sample_metadata_fields"]
    assert sorted(sm_fields, key=str.lower) == sm_fields

    # Check that each sample's metadata in the sample plot JSON matches with
    # its actual metadata.
    # NOTE: here we make the assumption that all samples are non-empty.
    # If we start using integration test data with empty samples, then we'll
    # need to revise this function to do something akin to what
    # validate_rank_plot_json() does above to ensure that empty features are
    # filtered out.
    sample_metadata = replace_nan(read_metadata_file(metadata_loc))
    for sample in sample_json["datasets"][dn]:

        sample_id = sample["Sample ID"]

        for metadata_col in sample_metadata.columns:
            expected_md = sample_metadata.at[sample_id, metadata_col]
            actual_md = sample[metadata_col]

            try:
                # Either these values are equal, *or* this was a QIIME 2
                # integration test (in which case the metadata files were
                # loaded as qiime2.Metadata objects) and certain columns'
                # values have been treated as numeric (which is fine, but this
                # might result in some things like a value of 53 being
                # interpreted as a value of 53.0 to qiime2.Metadata -- this
                # isn't a problem, so if the first check of equality fails we
                # try a looser check using approx() and float().
                assert expected_md == actual_md or float(
                    expected_md
                ) == approx(float(actual_md))
            except (AssertionError, TypeError, ValueError):
                # quick and dirty hack to actually give useful information when
                # something goes wrong. Note that float() can raise a
                # TypeError (e.g. on None) or ValueError (on a non-numeric
                # string) inside the try block; we catch those here too so the
                # diagnostic output is printed for them as well, then re-raise.
                print("PROBLEMATIC METADATA VALUE HERE")
                print(
                    expected_md, actual_md, type(expected_md), type(actual_md)
                )
                raise

        # Not really "metadata", but just as a sanity check verify that the
        # initial qurro_balance of each sample is null (aka None in
        # python) -- this ensures that no samples will show up when the
        # visualization is initially displayed, which is the intended behavior.
        assert sample["qurro_balance"] is None

    # Check that every entry (sample x feature) matches with the BIOM table.
    # If the BIOM table has, say, > 1 million entries, this might be excessive,
    # but the test data right now is small enough that this should be fine.
    table = load_table(biom_table_loc)

    # For each (ranked) feature...
    for feature_id in count_json:
        # For each sample, ensure that the count value in the JSON matches with
        # the count value in the BIOM table.
        for sample_id in count_json[feature_id]:
            actual_count = count_json[feature_id][sample_id]
            expected_count = table.get_value_by_ids(feature_id, sample_id)
            assert actual_count == expected_count
Example #5
0
def process_input(
    feature_ranks,
    sample_metadata,
    biom_table,
    feature_metadata=None,
    extreme_feature_count=None,
):
    """Validates/processes the input files and parameter(s) to Qurro.

       In particular, this function

       1. Calls validate_df() and then check_column_names() on all of the
          input DataFrames passed (feature ranks, sample metadata, feature
          metadata if passed).

       2. Calls replace_nan() on the metadata DataFrame(s), so that all
          missing values are represented consistently with a None (which
          will be represented as a null in JSON/JavaScript).

       3. Converts the BIOM table to a SparseDataFrame by calling
          biom_table_to_sparse_df().

       4. Matches up the table with the feature ranks and sample metadata by
          calling match_table_and_data().

       5. Calls filter_unextreme_features() using the provided
          extreme_feature_count. (If it's None, then nothing will be done.)

       6. Calls remove_empty_samples_and_features() to filter empty samples
          (and features). This is purposefully done *after*
          filter_unextreme_features() is called.

       7. Calls merge_feature_metadata() on the feature ranks and feature
          metadata. (If feature metadata is None, nothing will be done.)

       Returns
       -------
       output_metadata: pd.DataFrame
            Sample metadata, but matched with the table and with empty samples
            removed.

       output_ranks: pd.DataFrame
            Feature ranks, post-filtering and with feature metadata columns
            added in.

       ranking_ids
            The ranking columns' names in output_ranks.

       feature_metadata_cols: list
            The feature metadata columns' names in output_ranks.

       output_table: pd.SparseDataFrame
            The BIOM table, post matching with the feature ranks and sample
            metadata and with empty samples removed.
    """

    logging.debug("Starting processing input.")

    # Step 1: validate the input DataFrames' basic structure
    validate_df(feature_ranks, "feature ranks", 2, 1)
    validate_df(sample_metadata, "sample metadata", 1, 1)
    if feature_metadata is not None:
        # It's cool if there aren't any features actually described in the
        # feature metadata (hence why we pass in 0 as the minimum # of rows in
        # the feature metadata DataFrame), but we still pass it to
        # validate_df() in order to ensure that:
        #   1) there's at least one feature metadata column (because
        #      otherwise the feature metadata is useless)
        #   2) column names are unique
        validate_df(feature_metadata, "feature metadata", 0, 1)

    check_column_names(sample_metadata, feature_ranks, feature_metadata)

    # Step 2: replace NaN values (which both
    # _metadata_utils.read_metadata_file() and qiime2.Metadata use to
    # represent missing values, i.e. ""s) with None -- this is generally
    # easier for us to handle in the JS side of things (since it'll just be
    # consistently converted to null by json.dumps()).
    sample_metadata = replace_nan(sample_metadata)
    if feature_metadata is not None:
        feature_metadata = replace_nan(feature_metadata)

    # Step 3: convert the BIOM table to a sparse DataFrame
    table = biom_table_to_sparse_df(biom_table)

    # Step 4: match up the table with the feature ranks and sample metadata
    matched_table, matched_sample_metadata = match_table_and_data(
        table, feature_ranks, sample_metadata
    )

    # Step 5: note that although we always call filter_unextreme_features(),
    # filtering isn't necessarily always done (whether or not depends on the
    # value of extreme_feature_count and the contents of the table/ranks).
    filtered_table, filtered_ranks = filter_unextreme_features(
        matched_table, feature_ranks, extreme_feature_count
    )

    # Step 6: filter now-empty samples (and empty features) from the BIOM
    # table
    output_table, output_metadata, unmerged_ranks = (
        remove_empty_samples_and_features(
            filtered_table, matched_sample_metadata, filtered_ranks
        )
    )

    # Save a list of ranking IDs (before we add in feature metadata)
    # TODO: just have merge_feature_metadata() give us this?
    ranking_ids = unmerged_ranks.columns

    # Step 7: merge in the feature metadata (a no-op when it's None)
    output_ranks, feature_metadata_cols = merge_feature_metadata(
        unmerged_ranks, feature_metadata
    )

    logging.debug("Finished input processing.")
    return (
        output_metadata,
        output_ranks,
        ranking_ids,
        feature_metadata_cols,
        output_table,
    )