Beispiel #1
0
def test_add_sample_presence_count_zeros():
    """Checks the case when some features aren't present in any samples.

    Three scenarios are run in sequence against the same (progressively
    modified) table:

    1. All counts for a single feature (F3) are zero.
    2. All counts in the entire table are zero.
    3. Exactly one nonzero count remains (feature F2 in Sample4).

    In each scenario the resulting "qurro_spc" column is compared against
    the expected per-feature counts, and verify_spc_data_integrity() is
    called on the output and the original ranks.
    """

    # The metadata returned by get_test_data() isn't needed in this test.
    table, _, ranks = get_test_data()

    # Test 1: zero out all counts for feature F3
    table.loc["F3"] = 0
    output_feature_data = add_sample_presence_count(ranks, table)
    assert_series_equal(
        output_feature_data["qurro_spc"],
        Series([3, 2, 0, 3, 2, 2, 2, 2], index=ranks.index, name="qurro_spc"),
    )
    verify_spc_data_integrity(output_feature_data, ranks)

    # Test 2: zero out all counts
    table.loc[:] = 0
    ofd_2 = add_sample_presence_count(ranks, table)
    assert_series_equal(
        ofd_2["qurro_spc"],
        Series([0] * 8, index=ranks.index, name="qurro_spc"),
    )
    verify_spc_data_integrity(ofd_2, ranks)

    # Test 3: just one count for one feature
    # NOTE: this is done with a single .loc[row, column] assignment rather
    # than chained indexing (table["Sample4"]["F2"] = 1). Chained assignment
    # can end up writing to a temporary copy instead of the table itself
    # (and never modifies the table when pandas' copy-on-write is enabled),
    # which would make this test fail for reasons unrelated to the code
    # under test.
    table.loc["F2", "Sample4"] = 1
    ofd_3 = add_sample_presence_count(ranks, table)
    assert_series_equal(
        ofd_3["qurro_spc"],
        Series([0, 1, 0, 0, 0, 0, 0, 0], index=ranks.index, name="qurro_spc"),
    )
    verify_spc_data_integrity(ofd_3, ranks)
Beispiel #2
0
def test_add_sample_presence_count_name_error():
    """Verifies that a pre-existing qurro_spc column causes a ValueError.

    Since check_column_names() gets called, feature data should never
    actually reach add_sample_presence_count() with a column of this name --
    but it doesn't hurt to make sure this situation is handled anyway.
    """
    table, _metadata, ranks = get_test_data()

    # Rename the ranks' columns so that one of them clashes with the column
    # add_sample_presence_count() wants to add.
    clashing_columns = ["Rank 0", "qurro_spc"]
    ranks.columns = clashing_columns

    with pytest.raises(ValueError):
        add_sample_presence_count(ranks, table)
Beispiel #3
0
def test_add_sample_presence_count_basic():
    """Checks add_sample_presence_count() on the unmodified test data."""

    # For reference, this is what the table from get_test_data() looks like
    # before anything is done to it:
    # "Sample1": [1, 2, 3, 4, 5, 6, 7, 8],
    # "Sample2": [8, 7, 6, 5, 4, 3, 2, 1],
    # "Sample3": [1, 0, 0, 0, 0, 0, 0, 0],
    # "Sample4": [0, 0, 0, 1, 0, 0, 0, 0],
    table, _metadata, ranks = get_test_data()

    # The basic case: compute the counts, then compare them with what we
    # expect to see for each feature.
    feature_data_with_spc = add_sample_presence_count(ranks, table)
    expected_spc = Series(
        [3, 2, 2, 3, 2, 2, 2, 2], index=ranks.index, name="qurro_spc"
    )
    assert_series_equal(feature_data_with_spc["qurro_spc"], expected_spc)

    # The rest of the feature data should not have been altered
    verify_spc_data_integrity(feature_data_with_spc, ranks)
Beispiel #4
0
def gen_rank_plot(V, rank_type, ranking_ids, feature_metadata_cols, table_sdf):
    """Uses Altair to generate a JSON Vega-Lite spec for the rank plot.

    Parameters
    ----------

    V: pd.DataFrame
        DataFrame containing feature rank (and feature metadata, if applicable)
        information. (Indices correspond to features, and columns correspond
        to feature ranking or feature metadata fields.)

        This should have already been matched with the BIOM table, filtered (if
        -x passed), had empty features removed, etc.

        V itself is not modified; this function works on a copy of it.

    rank_type: str
        Human-readable name for a given ranking column that will be used as the
        prefix for each y-axis label in the rank plot. (This should be either
        "Differential" or "Feature Loading".)

    ranking_ids: pd.Index
        IDs of the actual "feature ranking" columns in V. This must contain
        at least one ID, since the first one is used as the default ranking
        shown in the plot.

    feature_metadata_cols: pd.Index or list
        IDs of the "feature metadata" columns in V (if there wasn't any
        feature metadata provided, this can just be an empty list).

    table_sdf: pd.SparseDataFrame
        A representation of the input BIOM table containing count data. This
        is used to calculate qurro_spc (the number of samples a feature is
        present in) for each feature in V. This should ONLY contain samples
        that will be used in the Qurro visualization -- the presence of extra
        samples will mess up _df_utils.add_sample_presence_count().

    Returns
    -------

    rank_chart_json: dict
        A dict version of the alt.Chart for the rank plot, with
        qurro_rank_ordering and qurro_feature_metadata_ordering datasets
        added in indicating which columns describe feature rankings and
        which describe feature metadata. (Also has a qurro_rank_type "dataset"
        (really just a string) that points to the specified rank_type.)
    """

    rank_data = V.copy()

    # NOTE that until this point we've treated the actual rank values as just
    # "objects", as far as pandas is concerned. However, if we continue to
    # treat them as objects when sorting them, we'll get a list of feature
    # ranks in lexicographic order... which is not what we want. So we just
    # ensure that all of the columns contain numeric data.
    for col in ranking_ids:
        rank_data[col] = pd.to_numeric(rank_data[col])

    # The default rank column is just whatever the first rank is. This is what
    # the rank plot will use when it's first drawn.
    default_rank_col = ranking_ids[0]

    # Set default classification of every feature to "None"
    # (This value will be updated when a feature is selected in the rank plot
    # as part of the numerator, denominator, or both parts of the current log
    # ratio.)
    rank_data["qurro_classification"] = "None"

    # Add a "qurro_spc" column indicating how many samples each feature is
    # present in.
    rank_data = add_sample_presence_count(rank_data, table_sdf)

    # Replace "index" with "Feature ID". looks nicer in the visualization :)
    rank_data.rename_axis("Feature ID", axis="index", inplace=True)
    rank_data.reset_index(inplace=True)

    # Now, we can actually create the rank plot.
    rank_chart = (
        alt.Chart(
            rank_data,
            title="Features",
            background="#FFFFFF",
            autosize=alt.AutoSizeParams(resize=True),
        ).mark_bar().transform_window(
            sort=[alt.SortField(field=default_rank_col, order="ascending")],
            # We don't use an alt.WindowFieldDef here because python gets
            # confused when you use "as" as an actual argument name. So we just
            # use this syntax.
            window=[{
                "op": "row_number",
                "as": "qurro_x"
            }],
        ).encode(
            # type="ordinal" needed on the scale here to make bars adjacent;
            # see https://stackoverflow.com/a/55544817/10730311.
            x=alt.X(
                "qurro_x",
                title="Feature Rankings",
                type="ordinal",
                scale=alt.Scale(paddingOuter=1, paddingInner=0, rangeStep=1),
                axis=alt.Axis(ticks=False, labelAngle=0),
            ),
            y=alt.Y(default_rank_col, type="quantitative"),
            color=alt.Color(
                "qurro_classification",
                title="Log-Ratio Classification",
                scale=alt.Scale(
                    domain=["None", "Numerator", "Denominator", "Both"],
                    range=["#e0e0e0", "#f00", "#00f", "#949"],
                ),
            ),
            tooltip=[
                alt.Tooltip(
                    field="qurro_x",
                    title="Current Ranking",
                    type="quantitative",
                ),
                alt.Tooltip(
                    field="qurro_classification",
                    title="Log-Ratio Classification",
                    type="nominal",
                ),
                alt.Tooltip(
                    field="qurro_spc",
                    title="Sample Presence Count",
                    type="quantitative",
                ),
                "Feature ID",
                *feature_metadata_cols,
                *ranking_ids,
            ],
        ).configure_axis(
            # Done in order to differentiate "None"-classification features
            # from grid lines
            gridColor="#f2f2f2",
            labelBound=True,
        ).interactive())

    rank_chart_json = rank_chart.to_dict()
    rank_ordering = "qurro_rank_ordering"
    fm_col_ordering = "qurro_feature_metadata_ordering"
    dataset_name_for_rank_type = "qurro_rank_type"

    # Check all three of the dataset names we're about to write to. Note that
    # the third name passed is the *name* of the rank type dataset
    # ("qurro_rank_type"), not the rank_type value stored under it -- the name
    # is what ends up as a key in rank_chart_json["datasets"] below.
    # (check_json_dataset_names() is defined elsewhere; presumably it guards
    # against these names clashing with datasets already in the chart JSON.)
    check_json_dataset_names(rank_chart_json, rank_ordering, fm_col_ordering,
                             dataset_name_for_rank_type)

    # Note we don't use rank_data.columns for setting the rank ordering. This
    # is because rank_data's columns now include both the ranking IDs and the
    # "Feature ID", "qurro_classification", and "qurro_spc" columns (as well
    # as any feature metadata the user saw fit to pass in).
    rank_chart_json["datasets"][rank_ordering] = list(ranking_ids)
    rank_chart_json["datasets"][fm_col_ordering] = list(feature_metadata_cols)
    rank_chart_json["datasets"][dataset_name_for_rank_type] = rank_type
    return rank_chart_json