def assert_melt(df: pd.DataFrame, eval_metric: str = "replicate_reproducibility") -> None:
    r"""Sanity-check that a pairwise matrix was melted the way ``eval_metric`` expects

    Downstream functions depend on how the pairwise correlation matrix was
    processed, and the processing differs between evaluation metrics. The check
    sums the two pair-index columns of the melted frame and compares the totals.

    Parameters
    ----------
    df : pandas.DataFrame
        A melted pairwise correlation matrix containing the pair index columns
        named by ``set_pair_ids()``
    eval_metric : str
        The user input eval metric

    Returns
    -------
    None
        An AssertionError is raised if the matrix was melted incorrectly
    """
    check_eval_metric(eval_metric=eval_metric)

    # Restrict the frame to the index column of every pair identifier
    pair_ids = set_pair_ids()
    index_columns = [pair_ids[pair]["index"] for pair in pair_ids]
    column_totals = df.loc[:, index_columns].sum().tolist()

    failure_message = "{err} This is a fatal error providing incorrect results".format(
        err="Stop! The eval_metric provided in 'metric_melt()' is incorrect!"
    )

    if eval_metric == "replicate_reproducibility":
        # The two index columns are expected to sum to different totals
        assert column_totals[0] != column_totals[1], failure_message
    elif eval_metric in ("precision_recall", "grit"):
        # Both index columns are expected to sum to the same total
        assert column_totals[0] == column_totals[1], failure_message
def check_replicate_groups(eval_metric: str, replicate_groups: Union[List[str], dict]) -> None:
    r"""Validate the structure of the user-supplied replicate groups argument

    The package will not calculate evaluation metrics with incorrectly
    constructed replicate_groups. See :py:func:`cytominer_eval.evaluate.evaluate`.

    The expected structure depends on the metric:

    * ``"grit"`` - a dict containing the keys ``"profile_col"`` and
      ``"replicate_group_col"``
    * ``"mp_value"`` - a single string
    * any other metric - a list

    Parameters
    ----------
    eval_metric : str
        Which evaluation metric to calculate. See
        :py:func:`cytominer_eval.transform.util.get_available_eval_metrics`.
    replicate_groups : {list, dict}
        The tentative data structure listing replicate groups

    Returns
    -------
    None
        An AssertionError is raised for improperly constructed replicate_groups
    """
    check_eval_metric(eval_metric=eval_metric)

    if eval_metric == "grit":
        assert isinstance(replicate_groups, dict), "For grit, replicate_groups must be a dict"

        required_keys = ["profile_col", "replicate_group_col"]
        has_required_keys = all(key in replicate_groups for key in required_keys)
        assert (
            has_required_keys
        ), "replicate_groups for grit not formed properly. Must contain {id}".format(
            id=required_keys
        )
        return

    if eval_metric == "mp_value":
        assert isinstance(
            replicate_groups, str
        ), "For mp_value, replicate_groups must be a single string."
        return

    # Every remaining metric takes a plain list of replicate group columns
    assert isinstance(
        replicate_groups, list
    ), "Replicate groups must be a list for the {op} operation".format(op=eval_metric)
def test_check_eval_metric():
    """An unsupported metric name must be rejected with an explanatory AssertionError."""
    with pytest.raises(AssertionError) as exc_info:
        check_eval_metric(eval_metric="MISSING")

    # The captured exception is only inspectable once the context manager exits
    error_text = str(exc_info.value)
    assert "MISSING not supported. Select one of" in error_text
def process_melt(
    df: pd.DataFrame,
    meta_df: pd.DataFrame,
    eval_metric: str = "replicate_reproducibility",
) -> pd.DataFrame:
    """Helper function to annotate and process an input similarity matrix

    The square similarity matrix is masked according to ``eval_metric``, melted
    into long format (one row per retained pair), and annotated with the
    metadata of both members of each pair. The input ``df`` is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        A similarity matrix output from
        :py:func:`cytominer_eval.transform.transform.get_pairwise_metric`
    meta_df : pandas.DataFrame
        A wide matrix of metadata information where the index aligns to the
        similarity matrix index
    eval_metric : str, optional
        Which metric to ultimately calculate. Determines whether or not to keep
        the full similarity matrix or only one diagonal. Defaults to
        "replicate_reproducibility".

    Returns
    -------
    pandas.DataFrame
        A melted (long format) pairwise similarity table with one row per
        retained pair: the pair index columns, a "similarity_metric" column, and
        the metadata columns of both pair members distinguished by the pair
        suffixes from ``set_pair_ids()``

    Raises
    ------
    AssertionError
        If ``df`` is not square, or if ``check_eval_metric`` rejects
        ``eval_metric``
    """
    # Confirm that the user formed the input arguments properly
    assert df.shape[0] == df.shape[1], "Matrix must be symmetrical"
    check_eval_metric(eval_metric)

    # Get identifiers for pairing metadata
    pair_ids = set_pair_ids()

    # Subset the pairwise similarity metric depending on the eval metric given:
    #   "replicate_reproducibility" - requires only the upper triangle of a symmetric matrix
    #   "precision_recall" - requires the full symmetric matrix (no diagonal)
    # Remove pairwise matrix diagonal and redundant pairwise comparisons
    if eval_metric == "replicate_reproducibility":
        upper_tri = get_upper_matrix(df)
        df = df.where(upper_tri)
    else:
        # Mask the diagonal on a new frame instead of writing into `df.values`:
        # the in-place write altered the caller's matrix, silently did nothing
        # when `.values` returned a copy, and fails on read-only or integer arrays.
        off_diagonal = ~np.eye(df.shape[0], dtype=bool)
        df = df.where(off_diagonal)

    # Convert pairwise matrix to melted (long) version based on index value.
    # Masked entries are NaN at this point and are removed by `dropna`.
    metric_unlabeled_df = (
        pd.melt(
            df.reset_index(),
            id_vars="index",
            value_vars=df.columns,
            var_name=pair_ids["pair_b"]["index"],
            value_name="similarity_metric",
        )
        .dropna()
        .reset_index(drop=True)
        .rename({"index": pair_ids["pair_a"]["index"]}, axis="columns")
    )

    # Merge metadata on index for both comparison pairs
    output_df = meta_df.merge(
        meta_df.merge(
            metric_unlabeled_df,
            left_index=True,
            right_on=pair_ids["pair_b"]["index"],
        ),
        left_index=True,
        right_on=pair_ids["pair_a"]["index"],
        suffixes=[pair_ids["pair_a"]["suffix"], pair_ids["pair_b"]["suffix"]],
    ).reset_index(drop=True)

    return output_df