def test_read_metadata_file_whitespace_stripping():
    """Tests that whitespace is properly stripped when reading a metadata
    file.
    """
    ws = os.path.join(
        "qurro", "tests", "input", "metadata_tests", "whitespace.tsv"
    )
    ws_df = replace_nan(read_metadata_file(ws))
    # Following stuff is *mostly* copied from
    # test_read_metadata_file_complex(), with some added tweaks

    # Check that dtypes match up (should all be object)
    assert ws_df.index.dtype == "object"
    for col in ws_df.columns:
        assert ws_df[col].dtype == "object"

    # Check that index IDs were treated as strings (issue #116 on the Qurro
    # GitHub page)
    assert ws_df.index.equals(
        Index(["01", "02", "03", "04", "05", "06", "07", "08"])
    )

    # Expected values per sample ID, given as (Metadata1, Metadata2,
    # Metadata3) tuples; None indicates a missing value. Notable
    # whitespace-handling cases encoded here:
    # - "02" / Metadata3: leading spaces in the value should be ignored
    # - "06" / Metadata2: trailing spaces in the value should be ignored
    # - "06" / Metadata3 and "07" / Metadata2: values consisting entirely of
    #   whitespace should be treated as equivalent to ""s (i.e. None)
    expected = {
        "01": (None, None, None),
        "02": ("4", "'5'", "MISSING LOL"),
        "03": (None, "8", "9"),
        "04": ("10", "null", "12"),
        "05": ("13", None, None),
        "06": ("16", "17", None),
        "07": ("NaN", None, "21"),
        "08": ("22", None, "Infinity"),
    }
    for sample_id, values in expected.items():
        for col, exp in zip(["Metadata1", "Metadata2", "Metadata3"], values):
            obs = ws_df.at[sample_id, col]
            if exp is None:
                assert obs is None, "{}[{}]: {}".format(sample_id, col, obs)
            else:
                assert obs == exp, "{}[{}]: {}".format(sample_id, col, obs)

    assert_frame_equal(
        ws_df, replace_nan(qiime2.Metadata.load(ws).to_dataframe())
    )
def test_read_metadata_file_complex():
    """Tests some corner cases in replace_nan(read_metadata_file())."""
    weird = os.path.join(
        "qurro", "tests", "input", "metadata_tests", "weird_metadata.tsv"
    )
    weird_df = replace_nan(read_metadata_file(weird))

    # Check that dtypes match up (should all be object)
    assert weird_df.index.dtype == "object"
    for col in weird_df.columns:
        assert weird_df[col].dtype == "object"

    # Check that index IDs were treated as strings (issue #116 on the Qurro
    # GitHub page)
    assert weird_df.index.equals(
        Index(["01", "02", "03", "04", "05", "06", "07", "08"])
    )

    # Check that all of the weird stuff in this file was correctly handled
    # (i.e. all ""s were converted to Nones, and everything else is preserved
    # as a string). Expected values per sample ID are given as
    # (Metadata1, Metadata2, Metadata3) tuples; None indicates a missing
    # value.
    expected = {
        "01": (None, None, None),
        "02": ("4", "'5'", "MISSING LOL"),
        "03": (None, "8", "9"),
        "04": ("10", "null", "12"),
        "05": ("13", None, None),
        "06": ("16", "17", None),
        "07": ("NaN", None, "21"),
        "08": ("22", None, "Infinity"),
    }
    for sample_id, values in expected.items():
        for col, exp in zip(["Metadata1", "Metadata2", "Metadata3"], values):
            obs = weird_df.at[sample_id, col]
            if exp is None:
                assert obs is None, "{}[{}]: {}".format(sample_id, col, obs)
            else:
                assert obs == exp, "{}[{}]: {}".format(sample_id, col, obs)

    assert_frame_equal(
        weird_df, replace_nan(qiime2.Metadata.load(weird).to_dataframe())
    )
def test_replace_nan():
    """Tests replace_nan().

    NOTE: np.nan is used throughout instead of the old np.NaN alias, since
    np.NaN was removed in NumPy 2.0.
    """
    # Basic case: other data are ints
    df = DataFrame({"x": [1, np.nan], "y": [3, 4]}, index=["a", "b"])
    dfC = DataFrame({"x": [1, None], "y": [3, 4]}, index=["a", "b"])
    assert_frame_equal(dfC, replace_nan(df), check_dtype=False)

    # Other data are strs
    df2 = DataFrame(
        {"x": ["abc", np.nan], "y": ["ghi", "jkl"]}, index=["a", "b"]
    )
    dfC2 = DataFrame(
        {"x": ["abc", None], "y": ["ghi", "jkl"]}, index=["a", "b"]
    )
    assert_frame_equal(dfC2, replace_nan(df2), check_dtype=False)

    # Entire Series of NaNs
    df3 = DataFrame(
        {"x": [np.nan, np.nan], "y": ["ghi", "jkl"]}, index=["a", "b"]
    )
    dfC3 = DataFrame(
        {"x": [None, None], "y": ["ghi", "jkl"]}, index=["a", "b"]
    )
    assert_frame_equal(dfC3, replace_nan(df3), check_dtype=False)

    # Entire DataFrame of NaNs
    df4 = DataFrame(
        {"x": [np.nan, np.nan], "y": [np.nan, np.nan]}, index=["a", "b"]
    )
    dfC4 = DataFrame({"x": [None, None], "y": [None, None]}, index=["a", "b"])
    assert_frame_equal(dfC4, replace_nan(df4), check_dtype=False)

    # If there are already Nones inside the DF for some reason (should never
    # be the case, but might as well be safe and check this)
    df5 = DataFrame(
        {"x": [np.nan, None], "y": [np.nan, np.nan]}, index=["a", "b"]
    )
    dfC5 = DataFrame({"x": [None, None], "y": [None, None]}, index=["a", "b"])
    assert_frame_equal(dfC5, replace_nan(df5), check_dtype=False)

    # Case where the user specifies an alternate value to replace NaNs with
    df6 = DataFrame(
        {"x": [np.nan, 3], "y": [np.nan, np.nan]}, index=["a", "b"]
    )
    dfC6 = DataFrame({"x": ["lol", 3], "y": ["lol", "lol"]}, index=["a", "b"])
    assert_frame_equal(dfC6, replace_nan(df6, "lol"), check_dtype=False)
def validate_sample_plot_json(
    biom_table_loc, metadata_loc, sample_json, count_json
):
    """Checks the sample plot JSON against the input metadata and BIOM table.

    Verifies the plot's basic Vega-Lite properties, checks that the sample
    metadata fields are sorted alphabetically (case-insensitively), compares
    each sample's metadata values in the JSON with the values read from the
    metadata file, and confirms that every per-feature sample count in the
    JSON matches the BIOM table.
    """
    assert sample_json["mark"] == {"type": "circle"}
    assert sample_json["title"] == "Samples"
    basic_vegalite_json_validation(sample_json)
    data_name = sample_json["data"]["name"]

    # Sample metadata fields should be in alphabetical order, ignoring case.
    # As in Qurro's code, this solution for sorting this way is based on this
    # article:
    # https://www.afternerd.com/blog/python-sort-list/#sort-strings-case-insensitive
    md_fields = sample_json["datasets"]["qurro_sample_metadata_fields"]
    assert sorted(md_fields, key=str.lower) == md_fields

    # Compare each sample's metadata in the sample plot JSON with its actual
    # metadata.
    # NOTE: here we make the assumption that all samples are non-empty. If we
    # start using integration test data with empty samples, we'll need to
    # revise this function to do something akin to what
    # validate_rank_plot_json() does above to ensure that empty features are
    # filtered out.
    metadata_df = replace_nan(read_metadata_file(metadata_loc))
    for sample_entry in sample_json["datasets"][data_name]:
        sample_id = sample_entry["Sample ID"]
        for md_col in metadata_df.columns:
            expected_value = metadata_df.at[sample_id, md_col]
            observed_value = sample_entry[md_col]
            try:
                # Either these values are equal, *or* this was a QIIME 2
                # integration test (in which case the metadata files were
                # loaded as qiime2.Metadata objects) and certain columns'
                # values have been treated as numeric -- e.g. a value of 53
                # being interpreted as 53.0 by qiime2.Metadata. That isn't a
                # problem, so if the strict equality check fails we fall back
                # on a looser comparison via approx() and float().
                assert expected_value == observed_value or float(
                    expected_value
                ) == approx(float(observed_value))
            except AssertionError:
                # quick and dirty hack to actually give useful information
                # when something goes wrong
                print("PROBLEMATIC METADATA VALUE HERE")
                print(
                    expected_value,
                    observed_value,
                    type(expected_value),
                    type(observed_value),
                )
                raise
        # Not really "metadata", but just as a sanity check verify that the
        # initial qurro_balance of each sample is null (aka None in Python) --
        # this ensures that no samples will show up when the visualization is
        # initially displayed, which is the intended behavior.
        assert sample_entry["qurro_balance"] is None

    # Check that every entry (sample x feature) matches with the BIOM table.
    # If the BIOM table has, say, > 1 million entries, this might be
    # excessive, but the test data right now is small enough that this should
    # be fine.
    biom_table = load_table(biom_table_loc)
    # For each (ranked) feature, ensure that each sample's count value in the
    # JSON matches the count value in the BIOM table.
    for feature_id, sample_counts in count_json.items():
        for sample_id, observed_count in sample_counts.items():
            expected_count = biom_table.get_value_by_ids(
                feature_id, sample_id
            )
            assert observed_count == expected_count
def process_input(
    feature_ranks,
    sample_metadata,
    biom_table,
    feature_metadata=None,
    extreme_feature_count=None,
):
    """Validates/processes the input files and parameter(s) to Qurro.

    The processing pipeline is:

    1. validate_df() and then check_column_names() are run on every input
       DataFrame passed (feature ranks, sample metadata, and feature
       metadata if provided).
    2. replace_nan() is run on the metadata DataFrame(s), so that all
       missing values are consistently represented by None (which will be
       represented as a null in JSON/JavaScript).
    3. The BIOM table is converted to a SparseDataFrame via
       biom_table_to_sparse_df().
    4. match_table_and_data() matches up the table with the feature ranks
       and sample metadata.
    5. filter_unextreme_features() is applied using the provided
       extreme_feature_count. (If it's None, nothing will be done.)
    6. remove_empty_samples_and_features() filters empty samples (and
       features). This is purposefully done *after*
       filter_unextreme_features() is called.
    7. merge_feature_metadata() merges the feature metadata into the
       feature ranks. (If feature metadata is None, nothing will be done.)

    Returns
    -------
    output_metadata: pd.DataFrame
        Sample metadata, but matched with the table and with empty samples
        removed.
    output_ranks: pd.DataFrame
        Feature ranks, post-filtering and with feature metadata columns
        added in.
    ranking_ids
        The ranking columns' names in output_ranks.
    feature_metadata_cols: list
        The feature metadata columns' names in output_ranks.
    output_table: pd.SparseDataFrame
        The BIOM table, post matching with the feature ranks and sample
        metadata and with empty samples removed.
    """
    logging.debug("Starting processing input.")

    validate_df(feature_ranks, "feature ranks", 2, 1)
    validate_df(sample_metadata, "sample metadata", 1, 1)
    if feature_metadata is not None:
        # It's cool if there aren't any features actually described in the
        # feature metadata (hence the 0 passed as the minimum # of rows), but
        # validate_df() is still called in order to ensure that:
        # 1) there's at least one feature metadata column (because otherwise
        #    the feature metadata is useless)
        # 2) column names are unique
        validate_df(feature_metadata, "feature metadata", 0, 1)

    check_column_names(sample_metadata, feature_ranks, feature_metadata)

    # Both _metadata_utils.read_metadata_file() and qiime2.Metadata use NaN to
    # represent missing values (i.e. ""s); swapping those for None is
    # generally easier to handle on the JS side of things (since None will
    # just be consistently converted to null by json.dumps()).
    sample_metadata = replace_nan(sample_metadata)
    if feature_metadata is not None:
        feature_metadata = replace_nan(feature_metadata)

    table = biom_table_to_sparse_df(biom_table)

    # Match up the table with the feature ranks and sample metadata.
    matched_table, matched_sample_metadata = match_table_and_data(
        table, feature_ranks, sample_metadata
    )

    # Although filter_unextreme_features() is always called, filtering isn't
    # necessarily always done (whether or not depends on the value of
    # extreme_feature_count and the contents of the table/ranks).
    filtered_table, filtered_ranks = filter_unextreme_features(
        matched_table, feature_ranks, extreme_feature_count
    )

    # Filter now-empty samples (and empty features) from the BIOM table.
    (
        output_table,
        output_metadata,
        nonempty_ranks,
    ) = remove_empty_samples_and_features(
        filtered_table, matched_sample_metadata, filtered_ranks
    )

    # Save a list of ranking IDs (before feature metadata columns get mixed
    # in).
    # TODO: just have merge_feature_metadata() give us this?
    ranking_ids = nonempty_ranks.columns

    output_ranks, feature_metadata_cols = merge_feature_metadata(
        nonempty_ranks, feature_metadata
    )

    logging.debug("Finished input processing.")
    return (
        output_metadata,
        output_ranks,
        ranking_ids,
        feature_metadata_cols,
        output_table,
    )