Example #1
# NOTE: minimal imports so this example runs on its own; the qurro._* module
# paths below are assumptions about Qurro's internal layout.
import logging

from biom import load_table

from qurro._df_utils import escape_columns
from qurro._metadata_utils import (
    read_gnps_feature_metadata_file,
    read_metadata_file,
)
from qurro._rank_utils import read_rank_file
from qurro.generate import process_and_generate


def plot(
    ranks: str,
    table: str,
    sample_metadata: str,
    feature_metadata: str,
    output_dir: str,
    extreme_feature_count: int,
    assume_gnps_feature_metadata: bool,
    debug: bool,
) -> None:
    """Generates a visualization of feature rankings and log-ratios.

       The resulting visualization contains two plots. The first plot shows
       how features are ranked, and the second plot shows the log-ratio
       of "selected" features' abundances within samples.

       The visualization is interactive, so which features are "selected" to
       construct log-ratios -- as well as various other properties of the
       visualization -- can be changed by the user.
    """

    # inspired by https://stackoverflow.com/a/14098306/10730311
    if debug:
        logging.basicConfig(level=logging.DEBUG)

    logging.debug("Starting the standalone Qurro script.")
    loaded_biom = load_table(table)
    logging.debug("Loaded BIOM table.")
    df_sample_metadata = escape_columns(
        read_metadata_file(sample_metadata), "sample metadata"
    )
    feature_ranks = read_rank_file(ranks)

    df_feature_metadata = None
    if feature_metadata is not None:
        if assume_gnps_feature_metadata:
            df_feature_metadata = read_gnps_feature_metadata_file(
                feature_metadata, feature_ranks
            )
        else:
            df_feature_metadata = escape_columns(
                read_metadata_file(feature_metadata), "feature metadata"
            )
    logging.debug("Read in metadata.")

    process_and_generate(
        feature_ranks,
        df_sample_metadata,
        loaded_biom,
        output_dir,
        df_feature_metadata,
        extreme_feature_count,
    )
    print("Successfully generated a visualization in the folder {}.".format(
        output_dir))
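
Since plot() is an ordinary function, it can also be called directly without
Qurro's command-line wrapper. Below is a minimal, hypothetical invocation:
the file paths are placeholders, and None is passed for the optional inputs
(the feature_metadata check above implies None means "not provided"; we
assume the same convention holds for extreme_feature_count).

# Hypothetical example call -- every path here is a placeholder.
plot(
    ranks="differentials.tsv",
    table="feature-table.biom",
    sample_metadata="sample-metadata.tsv",
    feature_metadata=None,  # no feature metadata in this sketch
    output_dir="qurro-viz",
    extreme_feature_count=None,  # assumed to mean "keep all features"
    assume_gnps_feature_metadata=False,
    debug=True,  # enables DEBUG-level logging
)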
Example #2
# NOTE: minimal imports so this test helper runs on its own; the qurro module
# paths (including the test utilities) are assumptions.
from itertools import zip_longest

from biom import load_table
from pytest import approx

from qurro._df_utils import biom_table_to_sparse_df, match_table_and_data
from qurro._metadata_utils import read_metadata_file
from qurro._rank_utils import read_rank_file
from qurro.tests.testing_utilities import (
    basic_vegalite_json_validation,
    get_data_from_plot_json,
)


def validate_rank_plot_json(
    biom_table_loc, metadata_loc, input_ranks_loc, rank_json
):
    """Ensure that the rank plot JSON makes sense."""

    # TODO check that feature metadata annotations were properly applied to the
    # features. Will need the feature metadata file location to be passed here

    ref_feature_ranks = read_rank_file(input_ranks_loc)

    # Load the table as a sparse DataFrame, then match it up with the sample
    # metadata so that the table only describes samples present in the sample
    # metadata.
    # (We do *that* so we can determine whether a feature is "empty" by just
    # summing its row in the table -- which wouldn't work if the table
    # contained samples that Qurro would filter out.)
    table = biom_table_to_sparse_df(load_table(biom_table_loc))
    sample_metadata = read_metadata_file(metadata_loc)
    table, _ = match_table_and_data(table, ref_feature_ranks, sample_metadata)

    # Validate some basic properties of the plot
    # (This is all handled by Altair, so these property tests aren't
    # exhaustive; they're mainly intended to verify that a general plot
    # matching our specs is being created)
    assert rank_json["mark"] == "bar"
    assert rank_json["title"] == "Features"
    basic_vegalite_json_validation(rank_json)

    # Loop over every feature in the reference feature ranks. Check that each
    # feature's corresponding rank data in the rank plot JSON matches.
    rank_ordering = rank_json["datasets"]["qurro_rank_ordering"]
    rank_json_feature_data = get_data_from_plot_json(
        rank_json, id_field="Feature ID"
    )

    for ref_feature_id in ref_feature_ranks.index:
        # If this feature is empty, it should have been filtered!
        if sum(table.loc[ref_feature_id]) == 0:
            assert ref_feature_id not in rank_json_feature_data
            continue
        # ...If this feature isn't empty, though, it shouldn't have been
        # filtered. (We assume that the user didn't pass in -x in this test.)
        #
        # Check to make sure that this feature ID is actually in the rank plot
        # JSON
        assert ref_feature_id in rank_json_feature_data
        # Get the corresponding feature's ranking information stored in the
        # rank plot JSON
        json_feature_data = rank_json_feature_data[ref_feature_id]

        # Note that we allow for mismatches in ranking names between the
        # reference and JSON feature rank data -- instead, we compare based on
        # the *order* of the feature rankings (aka the order of the columns in
        # either the feature differentials or ordination feature loadings).
        # This is fine, because we may want to rename certain rankings (e.g.
        # the axes in DEICODE's feature loadings, which default to just
        # 0, 1, 2).
        for ref_ranking, json_ranking in zip_longest(
            ref_feature_ranks.columns, rank_ordering
        ):
            # We use pytest's approx class to get past floating-point
            # imprecision. We leave approx() at its default tolerances, so
            # if this assertion starts failing, those tolerances may need
            # adjusting.
            actual_rank_val = ref_feature_ranks.loc[ref_feature_id, ref_ranking]
            assert actual_rank_val == approx(json_feature_data[json_ranking])
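
For context, here is a rough sketch of what a helper like
get_data_from_plot_json() could look like. This is only an illustration of
how Altair-generated Vega-Lite specs inline their data (under "datasets",
keyed by the name referenced in the spec's "data" entry), not Qurro's actual
implementation.

def get_data_from_plot_json_sketch(plot_json, id_field="Feature ID"):
    """Hypothetical: map each row's ID to that row in the plot's dataset."""
    # Altair names its inlined dataset and points at it via "data": {"name": ...}
    dataset_name = plot_json["data"]["name"]
    rows = plot_json["datasets"][dataset_name]
    return {row[id_field]: row for row in rows}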