def test_assert_melt():
    # df, features, meta_features, and replicate_groups are module-level
    # fixtures defined in the test module's setup (not shown in this snippet)
    for metric in ["precision_recall", "percent_strong", "grit"]:
        result = metric_melt(
            df=df,
            features=features,
            metadata_features=meta_features,
            similarity_metric="pearson",
            eval_metric=metric,
        )

        result = assign_replicates(similarity_melted_df=result,
                                   replicate_groups=replicate_groups)

        assert_melt(result, eval_metric=metric)

        # Note: not every alternative metric is tested as a dummy, since
        # precision_recall and grit share the same melted dataframe shape
        if metric == "precision_recall":
            dummy_metrics = ["percent_strong"]
        elif metric == "percent_strong":
            dummy_metrics = ["precision_recall", "grit"]
        elif metric == "grit":
            dummy_metrics = ["percent_strong"]

        for dummy_metric in dummy_metrics:
            with pytest.raises(AssertionError) as ve:
                assert_melt(result, eval_metric=dummy_metric)
            assert (
                "Stop! The eval_metric provided in 'metric_melt()' is incorrect!"
                in str(ve.value)
            )
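
# Why assert_melt can tell the metrics apart (an illustrative sketch, not the
# library's code): "percent_strong" melts to the upper triangle of the pairwise
# matrix, while "precision_recall" and "grit" keep all off-diagonal pairs, so
# the sums of the two pair-index columns differ in the first case and match in
# the others. The column names below are hypothetical.
import itertools
import pandas as pd

_n = 4  # pretend we have four profiles
_upper = pd.DataFrame(
    list(itertools.combinations(range(_n), 2)),
    columns=["pair_a_index", "pair_b_index"],
)
_full = pd.DataFrame(
    list(itertools.permutations(range(_n), 2)),
    columns=["pair_a_index", "pair_b_index"],
)
assert _upper.pair_a_index.sum() != _upper.pair_b_index.sum()  # upper triangle
assert _full.pair_a_index.sum() == _full.pair_b_index.sum()  # full pairwise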
Example #2

def test_assign_replicates():
    replicate_groups = ["Metadata_broad_sample", "Metadata_mg_per_ml"]
    result = assign_replicates(similarity_melted_df=similarity_melted_df,
                               replicate_groups=replicate_groups)

    expected_cols = [f"{x}_replicate" for x in replicate_groups] + ["group_replicate"]

    # Other functions expect columns to exist
    assert all(x in result.columns for x in expected_cols)

    # Given the example data, we expect a certain number of pairwise replicates
    replicate_counts = list(result.loc[:, expected_cols].sum().values)
    assert replicate_counts == [1248, 408, 408]

    # Try with a different number of replicate groups
    replicate_groups = [
        "Metadata_broad_sample",
        "Metadata_mg_per_ml",
        "Metadata_plate_map_name",
    ]
    result = assign_replicates(similarity_melted_df=similarity_melted_df,
                               replicate_groups=replicate_groups)

    expected_cols = [f"{x}_replicate" for x in replicate_groups] + ["group_replicate"]

    # Other functions expect columns to exist
    assert all(x in result.columns for x in expected_cols)

    # Given the example data, we expect a certain number of pairwise replicates
    replicate_counts = list(result.loc[:, expected_cols].sum().values)
    assert replicate_counts == [1248, 408, 73536, 408]

    # This function will fail if a replicate column is given that doesn't belong
    with pytest.raises(AssertionError) as ve:
        replicate_groups = ["MISSING_COLUMN"]
        result = assign_replicates(similarity_melted_df=similarity_melted_df,
                                   replicate_groups=replicate_groups)
    assert "replicate_group not found in melted dataframe columns" in str(
        ve.value)
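
# Conceptually (a hedged sketch, not assign_replicates' actual implementation):
# a pairwise row is a replicate for a metadata column when the value matches on
# both sides of the pair, and a "group_replicate" when every column matches.
# The "_pair_a"/"_pair_b" suffixes mirror those produced by set_pair_ids().
def _assign_replicates_sketch(melted, replicate_groups):
    for col in replicate_groups:
        melted[f"{col}_replicate"] = melted[f"{col}_pair_a"] == melted[f"{col}_pair_b"]
    replicate_cols = [f"{col}_replicate" for col in replicate_groups]
    melted["group_replicate"] = melted[replicate_cols].all(axis=1)
    return melted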
Example #3

def test_calculate_precision_recall():
    similarity_melted_df = metric_melt(
        df=df,
        features=features,
        metadata_features=meta_features,
        similarity_metric="pearson",
        eval_metric="precision_recall",
    )

    replicate_groups = ["Metadata_broad_sample"]
    result = assign_replicates(similarity_melted_df=similarity_melted_df,
                               replicate_groups=replicate_groups).sort_values(
                                   by="similarity_metric", ascending=False)

    pair_ids = set_pair_ids()
    first_suffix = pair_ids[next(iter(pair_ids))]["suffix"]
    replicate_group_cols = [f"{x}{first_suffix}" for x in replicate_groups]

    example_group = result.groupby(replicate_group_cols).get_group(
        "BRD-A38592941-001-02-7"
    )

    # Number of pairwise comparisons per dose
    assert example_group.shape[0] == 383 * 6

    # Assert that the similarity metrics are sorted in descending order
    assert (example_group.similarity_metric.diff().dropna() > 0).sum() == 0

    # Perform the calculation!
    result = pd.DataFrame(calculate_precision_recall(example_group, k=10),
                          columns=["result"])

    expected_result = {"k": 10, "precision": 0.4, "recall": 0.1333}
    expected_result = pd.DataFrame(expected_result,
                                   index=["result"]).transpose()

    assert_frame_equal(result, expected_result, check_less_precise=True)

    # Check that recall is 1 when k is maximized
    result = pd.DataFrame(
        calculate_precision_recall(example_group, k=example_group.shape[0]),
        columns=["result"],
    )

    assert result.loc["recall", "result"] == 1
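
# A hedged re-implementation sketch of the precision/recall logic tested above
# (not the library's exact code): with rows sorted by similarity_metric
# descending and group_replicate marking true positives, precision@k is the hit
# rate within the top k and recall@k the fraction of all replicates recovered.
def _precision_recall_at_k_sketch(sorted_group, k):
    true_positives = sorted_group.head(k).group_replicate.sum()
    return {
        "k": k,
        "precision": true_positives / k,
        "recall": true_positives / sorted_group.group_replicate.sum(),
    }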
Example #4

control_ids = ["Chr2", "Luc", "LacZ"]

# In[4]:

# Melt the input profiles to long format
similarity_melted_df = metric_melt(
    df=cell_health_df,
    features=features,
    metadata_features=meta_features,
    similarity_metric=similarity_metric,
    eval_metric=operation,
)

similarity_melted_df = assign_replicates(
    similarity_melted_df=similarity_melted_df,
    replicate_groups=replicate_groups)

print(similarity_melted_df.shape)
similarity_melted_df.head()
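
# What to expect from the melt (hedged note): each row is one pair of profiles,
# with every metadata column duplicated under pair suffixes (e.g. the
# "Metadata_cell_line_pair_a" column used in the next cell) and a single
# "similarity_metric" column holding the pairwise Pearson correlation.
print([col for col in similarity_melted_df.columns if col.endswith("_pair_a")][:5])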

# In[5]:

non_replicate_cor_95th = (
    similarity_melted_df
    .query("not group_replicate")
    .groupby("Metadata_cell_line_pair_a")["similarity_metric"]
    .quantile(0.95)
    .reset_index()
    .rename({"Metadata_cell_line_pair_a": "cell_line"}, axis="columns")
)

# Output results
output_file = pathlib.Path(
Example #5

        df = pd.read_csv(plate_files[plate])

        # Determine feature class
        features = infer_cp_features(df)
        meta_features = infer_cp_features(df, metadata=True)

        # Calculate and process pairwise similarity matrix
        audit_df = metric_melt(
            df=df,
            features=features,
            metadata_features=meta_features,
            similarity_metric="pearson",
            eval_metric="replicate_reproducibility",
        )

        audit_df = assign_replicates(similarity_melted_df=audit_df,
                                     replicate_groups=audit_cols)
        # The 95th percentile of the non-replicate null distribution
        cutoff = audit_df.query("not group_replicate").similarity_metric.quantile(0.95)

        # Calculate a single number for percent strong
        percent_strong = evaluate(
            profiles=df,
            features=features,
            meta_features=meta_features,
            replicate_groups=audit_cols,
            operation="replicate_reproducibility",
            similarity_metric="pearson",
            replicate_reproducibility_quantile=0.95,
        )
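
        # Hedged cross-check (an addition, not part of the original script):
        # percent strong should equal the fraction of replicate pairs whose
        # similarity exceeds the null cutoff computed above.
        manual_percent_strong = (
            audit_df.query("group_replicate").similarity_metric > cutoff
        ).mean()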
Example #6

def test_calculate_grit():
    # similarity_melted_df, replicate_id, group_id, replicate_col_name,
    # control_perts, and column_id_info are module-level fixtures (not shown)
    result = assign_replicates(
        similarity_melted_df=similarity_melted_df,
        replicate_groups=[replicate_id, group_id],
    )

    assert_melt(result, eval_metric="grit")

    example_group = result.groupby(replicate_col_name).get_group("MTOR-2")

    # Perform the calculation!
    grit_result = pd.DataFrame(
        calculate_grit(example_group,
                       control_perts=control_perts,
                       column_id_info=column_id_info),
        columns=["result"],
    )

    expected_result = {
        "perturbation": "MTOR-2",
        "group": "MTOR",
        "grit": 1.55075
    }
    expected_result = pd.DataFrame(expected_result,
                                   index=["result"]).transpose()

    assert_frame_equal(grit_result, expected_result, check_less_precise=True)

    # Grit is NaN for singleton perturbations
    # (no other perturbations in the same group)
    example_group = result.groupby(replicate_col_name).get_group("AURKB-2")

    grit_result = pd.DataFrame(
        calculate_grit(example_group,
                       control_perts=control_perts,
                       column_id_info=column_id_info),
        columns=["result"],
    )

    expected_result = {
        "perturbation": "AURKB-2",
        "group": "AURKB",
        "grit": np.nan
    }
    expected_result = pd.DataFrame(expected_result,
                                   index=["result"]).transpose()

    assert_frame_equal(grit_result, expected_result, check_less_precise=True)

    # calculate_grit raises when given the full dataframe;
    # grit is computed for each perturbation independently
    with pytest.raises(AssertionError) as ae:
        result = calculate_grit(
            similarity_melted_df,
            control_perts=control_perts,
            column_id_info=column_id_info,
        )
    assert "grit is calculated for each perturbation independently" in str(
        ae.value)

    # calculate_grit raises when the control perturbations are missing
    with pytest.raises(AssertionError) as ae:
        result = calculate_grit(
            example_group,
            control_perts=["DOES NOT EXIST", "THIS ONE NEITHER"],
            column_id_info=column_id_info,
        )
    assert "Error! No control perturbations found." in str(ae.value)