Code example #1
0
def test_set_pair_ids():
    """Check that set_pair_ids exposes an index name and a column suffix per pair.

    The original test asserted the "index" key twice for pair_a and the
    "suffix" key twice for pair_b (duplicate lines); the intent was clearly to
    cover both keys for both pairs.
    """
    pair_a = "pair_a"
    pair_b = "pair_b"

    result = set_pair_ids()

    assert result[pair_a]["index"] == "{pair_a}_index".format(pair_a=pair_a)
    assert result[pair_a]["suffix"] == "_{pair_a}".format(pair_a=pair_a)
    assert result[pair_b]["index"] == "{pair_b}_index".format(pair_b=pair_b)
    assert result[pair_b]["suffix"] == "_{pair_b}".format(pair_b=pair_b)
Code example #2
0
def assign_replicates(
    similarity_melted_df: pd.DataFrame,
    replicate_groups: List[str],
) -> pd.DataFrame:
    """
    Determine which pairwise comparisons in the melted similarity dataframe
    are replicate comparisons.

    Arguments:
    similarity_melted_df - a long pandas dataframe output from transform.metric_melt
    replicate_groups - a list of metadata column names in the original profile dataframe
                       to use as replicate columns

    Output:
    Adds columns to the similarity metric dataframe to indicate whether or not the
    pairwise similarity metric is comparing replicates or not
    """
    pair_ids = set_pair_ids()
    replicate_col_names = {
        x: "{x}_replicate".format(x=x)
        for x in replicate_groups
    }

    compare_dfs = []
    for replicate_col in replicate_groups:
        # The melted dataframe carries each metadata column twice, once per
        # member of the pair, distinguished by the pair suffixes.
        replicate_cols_with_suffix = [
            "{col}{suf}".format(col=replicate_col, suf=pair_ids[x]["suffix"])
            for x in pair_ids
        ]

        assert all(
            x in similarity_melted_df.columns
            for x in replicate_cols_with_suffix
        ), "replicate_group not found in melted dataframe columns"

        replicate_col_name = replicate_col_names[replicate_col]

        # .copy() prevents chained-assignment (SettingWithCopy) problems when
        # the indicator column is added below.
        compare_df = similarity_melted_df.loc[:, replicate_cols_with_suffix].copy()

        # A comparison is a replicate when the two paired metadata values
        # match. Direct positional boolean assignment also fixes a latent bug
        # in the original np.where + .loc pattern, which mixed positional
        # indices with label-based lookup and was only correct for a default
        # RangeIndex.
        compare_df[replicate_col_name] = (
            compare_df.iloc[:, 0] == compare_df.iloc[:, 1]
        ).to_numpy()
        compare_dfs.append(compare_df)

    # Combine the per-column indicators; "group_replicate" is True only when
    # every replicate column matches (min over booleans acts as logical AND).
    compare_df = pd.concat(compare_dfs, axis="columns").reset_index(drop=True)
    compare_df = compare_df.assign(
        group_replicate=compare_df.loc[:, replicate_col_names.values()].min(
            axis="columns")).loc[:,
                                 list(replicate_col_names.values()) +
                                 ["group_replicate"]]

    # NOTE(review): the index-based merge assumes similarity_melted_df also
    # has a default RangeIndex (metric_melt output) — confirm upstream.
    similarity_melted_df = similarity_melted_df.merge(compare_df,
                                                      left_index=True,
                                                      right_index=True)
    return similarity_melted_df
Code example #3
0
def test_calculate_precision_recall():
    """End-to-end check of calculate_precision_recall on one replicate group."""
    similarity_melted_df = metric_melt(
        df=df,
        features=features,
        metadata_features=meta_features,
        similarity_metric="pearson",
        eval_metric="precision_recall",
    )

    replicate_groups = ["Metadata_broad_sample"]
    result = assign_replicates(similarity_melted_df=similarity_melted_df,
                               replicate_groups=replicate_groups).sort_values(
                                   by="similarity_metric", ascending=False)

    pair_ids = set_pair_ids()
    replicate_group_cols = [
        "{x}{suf}".format(x=x, suf=pair_ids[list(pair_ids)[0]]["suffix"])
        for x in replicate_groups
    ]

    example_group = result.groupby(replicate_group_cols).get_group(
        "BRD-A38592941-001-02-7")

    assert example_group.shape[
        0] == 383 * 6  # number of pairwise comparisons per dose

    # Assert that the similarity metrics are sorted
    assert (example_group.similarity_metric.diff().dropna() > 0).sum() == 0

    # Perform the calculation!
    result = pd.DataFrame(calculate_precision_recall(example_group, k=10),
                          columns=["result"])

    expected_result = {"k": 10, "precision": 0.4, "recall": 0.1333}
    expected_result = pd.DataFrame(expected_result,
                                   index=["result"]).transpose()

    # check_less_precise was deprecated in pandas 1.1 and removed in 2.0; an
    # absolute tolerance of 1e-3 reproduces the previous 3-decimal comparison.
    assert_frame_equal(result, expected_result, check_exact=False, atol=1e-3)

    # Check that recall is 1 when k is maximized
    result = pd.DataFrame(
        calculate_precision_recall(example_group, k=example_group.shape[0]),
        columns=["result"],
    )

    assert result.loc["recall", "result"] == 1
Code example #4
0
def precision_recall(
    similarity_melted_df: pd.DataFrame,
    replicate_groups: List[str],
    k: int,
) -> pd.DataFrame:
    """
    Determine the precision and recall at k for all unique replicate groups
    based on a predefined similarity metric (see cytominer_eval.transform.metric_melt)

    Arguments:
    similarity_melted_df - a long pandas dataframe output from transform.metric_melt
    replicate_groups - a list of metadata column names in the original profile dataframe
                       to use as replicate columns
    k - an integer indicating how many pairwise comparisons to threshold

    Output:
    pandas DataFrame of precision and recall metrics for all replicate groups
    """
    # Tag replicate comparisons, then rank every pair by decreasing similarity
    annotated_df = assign_replicates(
        similarity_melted_df=similarity_melted_df,
        replicate_groups=replicate_groups,
    )
    annotated_df = annotated_df.sort_values(
        by="similarity_metric", ascending=False
    )

    # The melted dataframe must be full for the metric to be well defined
    assert_melt(annotated_df, eval_metric="precision_recall")

    # Build the suffixed column names identifying one side of each pair
    pair_ids = set_pair_ids()
    first_suffix = pair_ids[next(iter(pair_ids))]["suffix"]
    replicate_group_cols = [
        "{x}{suf}".format(x=group_col, suf=first_suffix)
        for group_col in replicate_groups
    ]

    # Compute precision and recall at k within each replicate group
    precision_recall_df = annotated_df.groupby(replicate_group_cols).apply(
        lambda group: calculate_precision_recall(group, k=k)
    )

    # Map the suffixed column names back to the caller-provided group names
    rename_cols = {
        suffixed: original
        for suffixed, original in zip(replicate_group_cols, replicate_groups)
    }

    precision_recall_df = precision_recall_df.reset_index()
    return precision_recall_df.rename(rename_cols, axis="columns")
Code example #5
0
def grit(
    similarity_melted_df: pd.DataFrame,
    control_perts: List[str],
    replicate_id: str,
    group_id: str,
) -> pd.DataFrame:
    """
    Calculate grit

    Arguments:
    similarity_melted_df - a long pandas dataframe output from transform.metric_melt
    control_perts - a list of control perturbations to calculate a null distribution
    replicate_id - the metadata identifier marking which column tracks replicate perts
    group_id - the metadata identifier marking which column tracks a higher order groups
               for all perturbations

    Output:
    A dataframe of grit measurements per perturbation
    """
    # Flag which pairwise comparisons are replicates at both metadata levels
    annotated_df = assign_replicates(
        similarity_melted_df=similarity_melted_df,
        replicate_groups=[replicate_id, group_id],
    )

    # The melted dataframe must be full for grit to be well defined
    assert_melt(annotated_df, eval_metric="grit")

    # Suffixed column identifying the perturbation on one side of each pair
    pair_ids = set_pair_ids()
    first_suffix = pair_ids[next(iter(pair_ids))]["suffix"]
    replicate_col_name = "{x}{suf}".format(x=replicate_id, suf=first_suffix)

    # Columns consumed by the per-perturbation grit calculation
    column_id_info = set_grit_column_info(
        replicate_id=replicate_id, group_id=group_id
    )

    # Compute grit independently for each perturbation
    grit_df = annotated_df.groupby(replicate_col_name).apply(
        lambda group: calculate_grit(group, control_perts, column_id_info)
    )
    return grit_df.reset_index(drop=True)
Code example #6
0
]
# Every non-metadata column of df is treated as a profiling feature
features = df.drop(meta_features, axis="columns").columns.tolist()

# Shared fixture: long pairwise-similarity dataframe reused by the tests below
similarity_melted_df = metric_melt(
    df=df,
    features=features,
    metadata_features=meta_features,
    similarity_metric="pearson",
    eval_metric="grit",
)

# Control perturbations defining the null distribution for grit
control_perts = ["Luc-2", "LacZ-2", "LacZ-3"]
# Metadata columns tracking replicate perturbations and their gene-level groups
replicate_id = "Metadata_pert_name"
group_id = "Metadata_gene_name"

# Suffixed column name identifying one side of each pairwise comparison
pair_ids = set_pair_ids()
replicate_col_name = "{x}{suf}".format(
    x=replicate_id, suf=pair_ids[list(pair_ids)[0]]["suffix"])

# Column mapping consumed by the grit calculation
column_id_info = set_grit_column_info(replicate_id=replicate_id,
                                      group_id=group_id)


def test_get_grit_entry():
    with pytest.raises(AssertionError) as ae:
        result = get_grit_entry(df=similarity_melted_df,
                                col=replicate_col_name)
    assert "grit is calculated for each perturbation independently" in str(
        ae.value)

    expected_result = "EMPTY"