def plot(
    ranks: str,
    table: str,
    sample_metadata: str,
    feature_metadata: str,
    output_dir: str,
    extreme_feature_count: int,
    assume_gnps_feature_metadata: bool,
    debug: bool,
) -> None:
    """Generates a visualization of feature rankings and log-ratios.

    The resulting visualization contains two plots. The first plot shows
    how features are ranked, and the second plot shows the log-ratio of
    "selected" features' abundances within samples.

    The visualization is interactive, so which features are "selected" to
    construct log-ratios -- as well as various other properties of the
    visualization -- can be changed by the user.
    """
    # inspired by https://stackoverflow.com/a/14098306/10730311
    if debug:
        logging.basicConfig(level=logging.DEBUG)
    logging.debug("Starting the standalone Qurro script.")

    loaded_biom = load_table(table)
    logging.debug("Loaded BIOM table.")

    df_sample_metadata = escape_columns(
        read_metadata_file(sample_metadata), "sample metadata"
    )
    feature_ranks = read_rank_file(ranks)

    df_feature_metadata = None
    if feature_metadata is not None:
        if assume_gnps_feature_metadata:
            df_feature_metadata = read_gnps_feature_metadata_file(
                feature_metadata, feature_ranks
            )
        else:
            df_feature_metadata = escape_columns(
                read_metadata_file(feature_metadata), "feature metadata"
            )
    logging.debug("Read in metadata.")

    process_and_generate(
        feature_ranks,
        df_sample_metadata,
        loaded_biom,
        output_dir,
        df_feature_metadata,
        extreme_feature_count,
    )
    print(
        "Successfully generated a visualization in the folder {}.".format(
            output_dir
        )
    )
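
# The block below is a hypothetical usage sketch, not part of the original
# module: it shows one way plot() could be called directly, assuming the
# helper functions it relies on are importable and that the placeholder file
# paths ("differentials.tsv", "feature-table.biom", "sample-metadata.tsv")
# point at real ranks/differentials, a BIOM table, and sample metadata. In
# practice this function is expected to be driven by the standalone Qurro
# command-line interface instead.
if __name__ == "__main__":
    plot(
        ranks="differentials.tsv",
        table="feature-table.biom",
        sample_metadata="sample-metadata.tsv",
        feature_metadata=None,  # feature metadata is optional
        output_dir="qurro-visualization",
        extreme_feature_count=5,  # the "-x" filtering option mentioned in the test comments below
        assume_gnps_feature_metadata=False,
        debug=True,
    )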
def validate_rank_plot_json(
    biom_table_loc, metadata_loc, input_ranks_loc, rank_json
):
    """Ensure that the rank plot JSON makes sense."""
    # TODO check that feature metadata annotations were properly applied to
    # the features. Will need the feature metadata file location to be
    # passed here
    ref_feature_ranks = read_rank_file(input_ranks_loc)

    # Load the table as a Sparse DF, and then match it up with the sample
    # metadata. This is needed in order to ensure that the table only
    # describes samples in the sample metadata.
    # (And the reason we do *that* is so that, when we're trying to figure
    # out if a feature is "empty," we can just compute the sum of that
    # feature's row in the table -- which we couldn't do if the table
    # contained samples that would be filtered out in Qurro.)
    table = biom_table_to_sparse_df(load_table(biom_table_loc))
    sample_metadata = read_metadata_file(metadata_loc)
    table, _ = match_table_and_data(table, ref_feature_ranks, sample_metadata)

    # Validate some basic properties of the plot
    # (This is all handled by Altair, so these property tests aren't
    # exhaustive; they're mainly intended to verify that a general plot
    # matching our specs is being created)
    assert rank_json["mark"] == "bar"
    assert rank_json["title"] == "Features"
    basic_vegalite_json_validation(rank_json)

    # Loop over every feature in the reference feature ranks. Check that
    # each feature's corresponding rank data in the rank plot JSON matches.
    rank_ordering = rank_json["datasets"]["qurro_rank_ordering"]
    rank_json_feature_data = get_data_from_plot_json(
        rank_json, id_field="Feature ID"
    )
    for ref_feature_id in ref_feature_ranks.index:
        # If this feature is empty, it should have been filtered!
        if sum(table.loc[ref_feature_id]) == 0:
            assert ref_feature_id not in rank_json_feature_data
            continue
        # ...If this feature isn't empty, though, it shouldn't have been
        # filtered. (We assume that the user didn't pass in -x in this
        # test.)
        #
        # Check to make sure that this feature ID is actually in the rank
        # plot JSON
        assert ref_feature_id in rank_json_feature_data
        # Get the corresponding feature's ranking information stored in the
        # rank plot JSON
        json_feature_data = rank_json_feature_data[ref_feature_id]
        # Note that we allow for mismatches in ranking names between the
        # reference and JSON feature rank data -- instead, we compare based
        # on the *order* of the feature rankings (aka the order of the
        # columns in either the feature differentials or ordination feature
        # loadings). This is fine, because we may want to rename certain
        # rankings' names (e.g. the axes in DEICODE's feature loadings,
        # which default to just 0, 1, 2)
        for ref_ranking, json_ranking in zip_longest(
            ref_feature_ranks.columns, rank_ordering
        ):
            # We use pytest's approx class to get past floating point
            # imprecisions. Note that we just leave this at the default for
            # approx, so if this starts failing then adjusting the
            # tolerances in approx() might be needed.
            actual_rank_val = ref_feature_ranks[ref_ranking][ref_feature_id]
            assert actual_rank_val == approx(json_feature_data[json_ranking])
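
# The test below is a hypothetical usage sketch, not part of the original
# test module: it shows how validate_rank_plot_json() might be driven,
# assuming the rank plot's Vega-Lite spec has already been written to a
# standalone JSON file. The file paths, and the idea of loading the spec
# straight from disk, are assumptions for illustration; the real test suite
# may obtain rank_json from Qurro's generated output in a different way.
def test_rank_plot_json_matches_inputs():
    import json

    # All four inputs below are hypothetical placeholder paths/files.
    with open("rank_plot.json") as f:
        rank_json = json.load(f)
    validate_rank_plot_json(
        "feature-table.biom",
        "sample-metadata.tsv",
        "differentials.tsv",
        rank_json,
    )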