Code Example #1
def test_assert_melt():
    for metric in ["precision_recall", "replicate_reproducibility", "grit"]:
        result = metric_melt(
            df=df,
            features=features,
            metadata_features=meta_features,
            similarity_metric="pearson",
            eval_metric=metric,
        )

        result = assign_replicates(
            similarity_melted_df=result, replicate_groups=replicate_groups
        )

        assert_melt(result, eval_metric=metric)

        # Note, not all alternative dummy metrics are provided, since many require
        # the same melted dataframe
        if metric == "precision_recall":
            dummy_metrics = ["replicate_reproducibility"]
        elif metric == "replicate_reproducibility":
            dummy_metrics = ["precision_recall", "grit"]
        elif metric == "grit":
            dummy_metrics = ["replicate_reproducibility"]

        for dummy_metric in dummy_metrics:
            with pytest.raises(AssertionError) as ve:
                output = assert_melt(result, eval_metric=dummy_metric)
            assert (
                "Stop! The eval_metric provided in 'metric_melt()' is incorrect!"
                in str(ve.value)
            )
Code Example #2
def test_metric_melt():
    result_df = metric_melt(df,
                            features,
                            meta_features,
                            similarity_metric="pearson")
    assert round(result_df.similarity_metric[0],
                 3) == round(example_sample_corr, 3)
    assert result_df.shape[0] == 73536

    # The index ID is extremely important for aligning the dataframe;
    # make sure the method is robust to indices labeled inconsistently
    same_index_copy = df.copy()
    same_index_copy.index = [3] * same_index_copy.shape[0]

    result_df = metric_melt(same_index_copy,
                            features,
                            meta_features,
                            similarity_metric="pearson")

    assert round(result_df.similarity_metric[0],
                 3) == round(example_sample_corr, 3)
    assert result_df.shape[0] == 73536

    with pytest.raises(AssertionError) as ve:
        output = metric_melt(
            df,
            features,
            meta_features,
            similarity_metric="pearson",
            eval_metric="NOT SUPPORTED",
        )
    assert "not supported. Available evaluation metrics:" in str(ve.value)

    for full_metric_required in ["precision_recall", "grit"]:
        result_df = metric_melt(
            same_index_copy,
            features,
            meta_features,
            similarity_metric="pearson",
            eval_metric=full_metric_required,
        )

        assert round(result_df.similarity_metric[0],
                     3) == round(example_sample_corr, 3)
        assert result_df.shape[0] == 147072
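The two expected row counts above differ by exactly a factor of two, consistent with the default melt keeping each unordered pair of profiles once while "precision_recall" and "grit" require the full (ordered) set of pairwise comparisons. A quick arithmetic check, assuming the example plate contains 384 profiles (inferred from the asserted shapes, not stated explicitly in the source):

# Sanity check on the asserted row counts (384 profiles is an assumption).
n_profiles = 384
assert n_profiles * (n_profiles - 1) // 2 == 73536  # unordered pairs (one direction)
assert n_profiles * (n_profiles - 1) == 147072  # ordered pairs (both directions)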
Code Example #3
def test_calculate_precision_recall():
    similarity_melted_df = metric_melt(
        df=df,
        features=features,
        metadata_features=meta_features,
        similarity_metric="pearson",
        eval_metric="precision_recall",
    )

    replicate_groups = ["Metadata_broad_sample"]
    result = assign_replicates(similarity_melted_df=similarity_melted_df,
                               replicate_groups=replicate_groups).sort_values(
                                   by="similarity_metric", ascending=False)

    pair_ids = set_pair_ids()
    replicate_group_cols = [
        "{x}{suf}".format(x=x, suf=pair_ids[list(pair_ids)[0]]["suffix"])
        for x in replicate_groups
    ]

    example_group = result.groupby(replicate_group_cols).get_group(
        name=("BRD-A38592941-001-02-7"))

    # number of pairwise comparisons per dose
    assert example_group.shape[0] == 383 * 6

    # Assert that the similarity metrics are sorted in descending order
    assert (example_group.similarity_metric.diff().dropna() > 0).sum() == 0

    # Perform the calculation!
    result = pd.DataFrame(calculate_precision_recall(example_group, k=10),
                          columns=["result"])

    expected_result = {"k": 10, "precision": 0.4, "recall": 0.1333}
    expected_result = pd.DataFrame(expected_result,
                                   index=["result"]).transpose()

    assert_frame_equal(result, expected_result, check_less_precise=True)

    # Check that recall is 1 when k is maximized
    result = pd.DataFrame(
        calculate_precision_recall(example_group, k=example_group.shape[0]),
        columns=["result"],
    )

    assert result.loc["recall", "result"] == 1
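For orientation, here is a minimal sketch of the precision/recall-at-k idea this test exercises, assuming a group dataframe already sorted by similarity_metric in descending order and carrying the boolean group_replicate column added by assign_replicates. This is an illustration only, not the library's implementation:

def precision_recall_at_k_sketch(group_df, k):
    # Top-k most similar pairs for this perturbation (dataframe pre-sorted descending)
    top_k = group_df.head(k)
    true_positives = top_k["group_replicate"].sum()
    # All replicate pairs available for this perturbation
    total_replicate_pairs = group_df["group_replicate"].sum()
    return {
        "k": k,
        "precision": true_positives / k,
        "recall": true_positives / total_replicate_pairs,
    }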
Code Example #4
def evaluate(
    profiles: pd.DataFrame,
    features: List[str],
    meta_features: List[str],
    replicate_groups: Union[List[str], dict],
    operation: str = "percent_strong",
    similarity_metric: str = "pearson",
    percent_strong_quantile: float = 0.95,
    precision_recall_k: int = 10,
    grit_control_perts: List[str] = ["None"],
):
    # Check replicate groups input
    check_replicate_groups(eval_metric=operation, replicate_groups=replicate_groups)

    # Melt the input profiles to long format
    similarity_melted_df = metric_melt(
        df=profiles,
        features=features,
        metadata_features=meta_features,
        similarity_metric=similarity_metric,
        eval_metric=operation,
    )

    # Perform the input operation
    if operation == "percent_strong":
        metric_result = percent_strong(
            similarity_melted_df=similarity_melted_df,
            replicate_groups=replicate_groups,
            quantile=percent_strong_quantile,
        )
    elif operation == "precision_recall":
        metric_result = precision_recall(
            similarity_melted_df=similarity_melted_df,
            replicate_groups=replicate_groups,
            k=precision_recall_k,
        )
    elif operation == "grit":
        metric_result = grit(
            similarity_melted_df=similarity_melted_df,
            control_perts=grit_control_perts,
            replicate_id=replicate_groups["replicate_id"],
            group_id=replicate_groups["group_id"],
        )

    return metric_result
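Note that this revision of evaluate defaults to operation="percent_strong" and, for grit, expects the replicate_groups dict to carry the keys "replicate_id" and "group_id" (the fuller revision in Code Example #11 uses "profile_col" and "replicate_group_col" instead). A hedged usage sketch against this signature, with placeholder column names and control labels:

# Hypothetical call; the metadata column names and control labels are placeholders.
grit_result = evaluate(
    profiles=df,
    features=features,
    meta_features=meta_features,
    replicate_groups={"replicate_id": "Metadata_pert_name",
                      "group_id": "Metadata_gene_name"},
    operation="grit",
    grit_control_perts=["Luc", "LacZ"],
)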
Code Example #5
df = pd.read_csv(example_file)
# Clean the dataframe for convenience
df.loc[(df["Metadata_moa"].isna()) & (df["Metadata_broad_sample"] == "DMSO"),
       "Metadata_moa", ] = "none"
df = df[~df["Metadata_moa"].isna()]

meta_features = [
    x for x in df.columns
    if (x.startswith("Metadata_") or x.startswith("Image_"))
]
features = df.drop(meta_features, axis="columns").columns.tolist()

similarity_melted_df = metric_melt(
    df=df,
    features=features,
    metadata_features=meta_features,
    similarity_metric="pearson",
    eval_metric="hitk",
)

# compute the normal index_list
replicate_group = ["Metadata_moa"]
groupby_columns = ["Metadata_broad_sample", "Metadata_Plate", "Metadata_Well"]
percent_list = [2, 5, 10, 100]

index_list, percent_results = hitk(
    similarity_melted_df=similarity_melted_df,
    replicate_groups=replicate_group,
    groupby_columns=groupby_columns,
    percent_list=percent_list,
)
Code Example #6
example_file = "SQ00015054_normalized_feature_select.csv.gz"
example_file = pathlib.Path(
    "{file}/../../example_data/compound/{eg}".format(
        file=os.path.dirname(__file__), eg=example_file
    )
)

df = pd.read_csv(example_file)

meta_features = [x for x in df.columns if x.startswith("Metadata_")]
features = df.drop(meta_features, axis="columns").columns.tolist()

similarity_melted_df = metric_melt(
    df=df,
    features=features,
    metadata_features=meta_features,
    similarity_metric="pearson",
)


def test_percent_strong():
    replicate_groups = ["Metadata_broad_sample", "Metadata_mg_per_ml"]
    output = percent_strong(
        similarity_melted_df=similarity_melted_df,
        replicate_groups=replicate_groups,
        quantile=0.95,
    )
    expected_result = 0.4583

    assert np.round(output, 4) == expected_result
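For context, a minimal sketch of the percent-strong idea this test checks, assuming assign_replicates has already added the boolean group_replicate column to the melted dataframe (the library handles this internally; this sketch is an illustration, not its implementation):

def percent_strong_sketch(melted_df, quantile=0.95):
    # Null distribution: similarities between non-replicate profile pairs
    cutoff = melted_df.query("not group_replicate").similarity_metric.quantile(quantile)
    # Fraction of replicate similarities exceeding the non-replicate quantile
    replicate_similarity = melted_df.query("group_replicate").similarity_metric
    return (replicate_similarity > cutoff).mean()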
Code Example #7
similarity_metric = "pearson"
operation = "percent_strong"

replicate_groups = [
    "Metadata_cell_line", "Metadata_gene_name", "Metadata_pert_name"
]

control_ids = ["Chr2", "Luc", "LacZ"]

# In[4]:

# Melt the input profiles to long format
similarity_melted_df = metric_melt(
    df=cell_health_df,
    features=features,
    metadata_features=meta_features,
    similarity_metric=similarity_metric,
    eval_metric=operation,
)

similarity_melted_df = assign_replicates(
    similarity_melted_df=similarity_melted_df,
    replicate_groups=replicate_groups)

print(similarity_melted_df.shape)
similarity_melted_df.head()

# In[5]:

non_replicate_cor_95th = (
    similarity_melted_df.query("not group_replicate").groupby(
Code Example #8
        figure_output_dir = os.path.join(figure_dir, batch, plate)
        os.makedirs(figure_output_dir, exist_ok=True)

        audit_output_file = os.path.join(audit_output_dir,
                                         "{}_audit.csv".format(plate))
        df = pd.read_csv(plate_files[plate])

        # Determine feature class
        features = infer_cp_features(df)
        meta_features = infer_cp_features(df, metadata=True)

        # Calculate and process pairwise similarity matrix
        audit_df = metric_melt(
            df=df,
            features=features,
            metadata_features=meta_features,
            similarity_metric="pearson",
            eval_metric="replicate_reproducibility",
        )

        audit_df = assign_replicates(similarity_melted_df=audit_df,
                                     replicate_groups=audit_cols)
        # What is the 95th percentile of the non-replicate null distribution?
        cutoff = audit_df.query(
            "not group_replicate").similarity_metric.quantile(0.95)

        # Calculate a single number for percent strong
        percent_strong = evaluate(
            profiles=df,
            features=features,
            meta_features=meta_features,
Code Example #9
example_file = "SQ00015054_normalized_feature_select.csv.gz"
example_file = pathlib.Path("{file}/../../example_data/compound/{eg}".format(
    file=os.path.dirname(__file__), eg=example_file))

df = pd.read_csv(example_file)
df = df.assign(Metadata_profile_id=[
    "Metadata_profile_{x}".format(x=x) for x in range(0, df.shape[0])
])

meta_features = [x for x in df.columns if x.startswith("Metadata_")]
features = df.drop(meta_features, axis="columns").columns.tolist()

similarity_melted_df = metric_melt(
    df=df,
    features=features,
    metadata_features=meta_features,
    similarity_metric="pearson",
)

similarity_melted_full_df = metric_melt(
    df=df,
    features=features,
    metadata_features=meta_features,
    similarity_metric="pearson",
    eval_metric="grit",
)


def test_assign_replicates():
    replicate_groups = ["Metadata_broad_sample", "Metadata_mg_per_ml"]
    result = assign_replicates(similarity_melted_df=similarity_melted_df,
Code Example #10
example_file = "SQ00014610_normalized_feature_select.csv.gz"
example_file = pathlib.Path("{file}/../../example_data/gene/{eg}".format(
    file=os.path.dirname(__file__), eg=example_file))

df = pd.read_csv(example_file)

meta_features = [
    x for x in df.columns
    if (x.startswith("Metadata_") or x.startswith("Image_"))
]
features = df.drop(meta_features, axis="columns").columns.tolist()

similarity_melted_df = metric_melt(
    df=df,
    features=features,
    metadata_features=meta_features,
    similarity_metric="pearson",
    eval_metric="precision_recall",
)

replicate_groups = ["Metadata_gene_name", "Metadata_cell_line"]

groupby_columns = ["Metadata_pert_name"]


def test_precision_recall():
    result_list = precision_recall(
        similarity_melted_df=similarity_melted_df,
        replicate_groups=replicate_groups,
        groupby_columns=groupby_columns,
        k=[5, 10],
Code Example #11
def evaluate(
    profiles: pd.DataFrame,
    features: List[str],
    meta_features: List[str],
    replicate_groups: Union[List[str], dict],
    operation: str = "replicate_reproducibility",
    similarity_metric: str = "pearson",
    replicate_reproducibility_quantile: float = 0.95,
    replicate_reproducibility_return_median_cor: bool = False,
    precision_recall_k: Union[int, List[int]] = 10,
    grit_control_perts: List[str] = ["None"],
    grit_replicate_summary_method: str = "mean",
    mp_value_params: dict = {},
    enrichment_percentile: Union[float, List[float]] = 0.99,
):
    r"""Evaluate profile quality and strength.

    For a given profile dataframe containing both metadata and feature measurement
    columns, use this function to calculate profile quality metrics. The function
    contains all the necessary arguments for specific evaluation operations.

    Parameters
    ----------
    profiles : pandas.DataFrame
        profiles must be a pandas DataFrame with profile samples as rows and profile
        features as columns. The columns should contain both metadata and feature
        measurements.
    features : list
        A list of strings corresponding to feature measurement column names in the
        `profiles` DataFrame. All features listed must be found in `profiles`.
    meta_features : list
        A list of strings corresponding to metadata column names in the `profiles`
        DataFrame. All features listed must be found in `profiles`.
    replicate_groups : {str, list, dict}
        An important variable indicating which metadata columns denote replicate
        information. All metric operations require replicate profiles.
        `replicate_groups` indicates a str or list of columns to use. For
        `operation="grit"`, `replicate_groups` is a dict with two keys: "profile_col"
        and "replicate_group_col". "profile_col" is the column name that stores
        identifiers for each profile (these may be unique), while "replicate_group_col"
        is the column name indicating higher-order replicate information. For example,
        "replicate_group_col" can be a gene column in a CRISPR experiment with multiple
        guides targeting the same gene. See also
        :py:func:`cytominer_eval.operations.grit` and
        :py:func:`cytominer_eval.transform.util.check_replicate_groups`.
    operation : {'replicate_reproducibility', 'precision_recall', 'grit', 'mp_value', 'enrichment'}, optional
        The specific evaluation metric to calculate. The default is
        "replicate_reproducibility".
    similarity_metric : {'pearson', 'spearman', 'kendall'}, optional
        How to calculate pairwise similarity. The value is passed directly to
        pandas.DataFrame.corr(). The default is "pearson".

    Returns
    -------
    float, pd.DataFrame
        The resulting evaluation metric. The return is either a single value or a pandas
        DataFrame summarizing the metric as specified in `operation`.

    Other Parameters
    ----------------
    replicate_reproducibility_quantile : float, optional
        Only used when `operation='replicate_reproducibility'`. This indicates the
        percentile of the non-replicate pairwise similarity to consider a reproducible
        phenotype. Defaults to 0.95.
    replicate_reproducibility_return_median_cor : bool, optional
        Only used when `operation='replicate_reproducibility'`. If True, also return
        the median pairwise replicate correlations, as defined by `replicate_groups`
        and the similarity metric.
    precision_recall_k : int or list of int, optional
        Only used when `operation='precision_recall'`. Used to calculate precision and
        recall considering the top k profiles according to pairwise similarity.
    grit_control_perts : list of str, optional
        Only used when `operation='grit'`. Specific profile identifiers used as a
        reference when calculating grit. The list entries must be found in the
        `replicate_groups["profile_col"]` column.
    grit_replicate_summary_method : {"mean", "median"}, optional
        Only used when `operation='grit'`. Defines how the replicate z-scores are
        summarized. See
        :py:func:`cytominer_eval.operations.util.calculate_grit`.
    mp_value_params : dict, optional
        Only used when `operation='mp_value'`. A dict of optional parameters for
        calculating the mp-value. See also
        :py:func:`cytominer_eval.operations.util.default_mp_value_parameters`.
    enrichment_percentile : float or list of floats, optional
        Only used when `operation='enrichment'`. Determines the percentile threshold(s)
        of top connections used for the enrichment calculation.
    """
    # Check replicate groups input
    check_replicate_groups(eval_metric=operation,
                           replicate_groups=replicate_groups)

    if operation != "mp_value":
        # Melt the input profiles to long format
        similarity_melted_df = metric_melt(
            df=profiles,
            features=features,
            metadata_features=meta_features,
            similarity_metric=similarity_metric,
            eval_metric=operation,
        )

    # Perform the input operation
    if operation == "replicate_reproducibility":
        metric_result = replicate_reproducibility(
            similarity_melted_df=similarity_melted_df,
            replicate_groups=replicate_groups,
            quantile_over_null=replicate_reproducibility_quantile,
            return_median_correlations=replicate_reproducibility_return_median_cor,
        )
    elif operation == "precision_recall":
        metric_result = precision_recall(
            similarity_melted_df=similarity_melted_df,
            replicate_groups=replicate_groups,
            k=precision_recall_k,
        )
    elif operation == "grit":
        metric_result = grit(
            similarity_melted_df=similarity_melted_df,
            control_perts=grit_control_perts,
            profile_col=replicate_groups["profile_col"],
            replicate_group_col=replicate_groups["replicate_group_col"],
            replicate_summary_method=grit_replicate_summary_method,
        )
    elif operation == "mp_value":
        metric_result = mp_value(
            df=profiles,
            control_perts=grit_control_perts,
            replicate_id=replicate_groups,
            features=features,
            params=mp_value_params,
        )
    elif operation == "enrichment":
        metric_result = enrichment(
            similarity_melted_df=similarity_melted_df,
            replicate_groups=replicate_groups,
            percentile=enrichment_percentile,
        )

    return metric_result
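Finally, a hedged usage sketch of this evaluate signature. The replicate columns and control perturbation labels are borrowed from the gene and cell-health examples above (Code Examples #7 and #10) and are illustrative, not prescriptive; df, features, and meta_features are assumed to be prepared as in those examples:

reproducibility = evaluate(
    profiles=df,
    features=features,
    meta_features=meta_features,
    replicate_groups=["Metadata_gene_name", "Metadata_cell_line"],
    operation="replicate_reproducibility",
)

# The grit operation requires the dict form of replicate_groups described in the docstring.
grit_scores = evaluate(
    profiles=df,
    features=features,
    meta_features=meta_features,
    replicate_groups={"profile_col": "Metadata_pert_name",
                      "replicate_group_col": "Metadata_gene_name"},
    operation="grit",
    grit_control_perts=["Chr2", "Luc", "LacZ"],
    grit_replicate_summary_method="mean",
)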