Esempio n. 1
0
def test_iv():
    """Lower-sided "mean" test on single-element samples should bracket 1.

    The observed difference is 1 - 0 = 1. Shuffling one value per group can
    only produce two arrangements:
      - [1] & [0] (half of the time, difference = 1)
      - [0] & [1] (half of the time, difference = -1)
    so every permutation is at least as low as the observed difference and
    the expected value is 1.0.
    """
    group_a, group_b = [1], [0]

    outcome = mcpt.permutation_test(
        group_a, group_b, "mean", "lower", n=10000, seed=3919
    )

    # The expected value must fall inside the reported interval.
    expected = 1
    assert expected >= outcome.lower
    assert expected <= outcome.upper
Esempio n. 2
0
def test_vii():
    """Same lower-sided "mean" check as the single-core case, but with cores=2."""
    # Reference copies used afterwards to detect in-place modification.
    x0 = [10, 9, 11]
    y0 = [12, 11, 13]

    x = list(x0)
    y = list(y0)

    outcome = mcpt.permutation_test(
        x, y, "mean", "lower", n=10000, cores=2, seed=4919
    )

    # The expected value must fall inside the reported interval.
    expected = 0.1
    assert expected >= outcome.lower
    assert expected <= outcome.upper

    # The call must leave both inputs exactly as they were.
    assert y0 == y
    assert x0 == x
Esempio n. 3
0
def test_i():
    """Worked example with a known one-sided (lower) answer of 0.1.

    Data taken from
    https://www.thoughtco.com/example-of-a-permutation-test-3997741
    """
    # Reference copies used afterwards to detect in-place modification.
    x0 = [10, 9, 11]
    y0 = [12, 11, 13]

    x = list(x0)
    y = list(y0)

    outcome = mcpt.permutation_test(x, y, "mean", "lower", n=10000, seed=3919)

    # The expected value must fall inside the reported interval.
    expected = 0.1
    assert expected >= outcome.lower
    assert expected <= outcome.upper

    # The call must leave both inputs exactly as they were.
    assert x0 == x
    assert y0 == y
Esempio n. 4
0
def test_xi():
    """Check that permutation_test accepts pandas Series and leaves them intact.

    The samples are passed as the "Change" column of two data frames; the
    expected value 0.1 must fall inside the reported interval, and the
    frames must compare equal to untouched reference copies afterwards.
    """
    import pandas as pd

    a = [10, 9, 11]
    b = [12, 11, 13]
    side = "lower"

    # Reference frames that are never handed to the function under test.
    a_df0 = pd.DataFrame(columns=["Change"], data=a)
    b_df0 = pd.DataFrame(columns=["Change"], data=b)

    a_df = pd.DataFrame(columns=["Change"], data=a)
    b_df = pd.DataFrame(columns=["Change"], data=b)

    result = mcpt.permutation_test(
        a_df["Change"], b_df["Change"], f="mean", side=side, seed=6919
    )
    # Check the result is correct.
    assert 0.1 >= result.lower
    assert 0.1 <= result.upper
    # Check the inputs haven't changed as a result of function.
    assert a_df0.equals(a_df)
    assert b_df0.equals(b_df)
Esempio n. 5
0
def _save_figure(infile, suffix):
    """Write the current pyplot figure to ``<infile stem>_<suffix>.png`` and close it.

    Closing after every save guarantees the next plot starts on a fresh
    figure instead of being drawn over the previous one.
    """
    plt.savefig(f"{infile.rsplit('.', 1)[0]}_{suffix}.png", dpi=1000)
    plt.close()


def _finish_histogram(infile, suffix):
    """Label, lay out, save and close the recovery-density histogram in progress."""
    plt.xlabel("$log_{10}$ recovery density")
    plt.ylabel("Frequency")
    plt.title(
        "Histogram showing distributions\nof recovery density by edge type\n"
    )
    plt.tight_layout()
    plt.legend()
    _save_figure(infile, suffix)


def _plot_degree_violin(true_df, infile, kind, other, symbol, suffix):
    """Violin plot of recovery density grouped by the ``"<kind> degree > 1"`` column."""
    sns.violinplot(
        x=f"{kind} degree > 1",
        y="log10 recovery density",
        data=true_df,
        bw=0.1,
    )
    plt.title(
        f"Violin plot of recovery densities by number of {other}\n"
        f"neighbours of {kind}, {symbol}, in known pairs, $C_n-P_n$\n"
    )
    plt.xlabel(f"Number of {other} neighbours of {kind}, {symbol}")
    plt.ylabel("$log_{10}$ recovery density")
    plt.tight_layout()
    _save_figure(infile, suffix)


def _analyse_degree(
    true_df,
    neg_dens,
    min_density,
    infile,
    kind,
    other,
    symbol,
    scatter_suffix,
    hist_suffix,
    scatter_kwargs=None,
):
    """Plot and test recovery density against the ``"<kind> degree"`` column.

    Produces a scatter plot annotated with the Spearman correlation
    (``<scatter_suffix>``), a histogram of the negative-edge densities
    (``<hist_suffix>a``), a histogram of the known-edge densities split into
    degree == 1 and degree > 1 (``<hist_suffix>b``), then logs the group
    medians and three median permutation tests.

    Args:
        true_df: data frame of known edges; must hold the columns
            ``"<kind> degree"`` and ``"log10 recovery density"``.
        neg_dens: mapping whose values are the negative-edge densities.
        min_density: lower bound of the histogram bins.
        infile: input path; figures are written next to it.
        kind: ``"compound"`` or ``"protein"`` - selects the degree column.
        other: the opposite entity name, used in axis labels and titles.
        symbol: mathtext symbol for *kind* (e.g. ``"$C_n$"``).
        scatter_suffix: file-name suffix of the scatter figure.
        hist_suffix: file-name stem of the two histogram figures.
        scatter_kwargs: extra keyword arguments for ``sns.scatterplot``.
    """
    degree_col = f"{kind} degree"
    density_col = "log10 recovery density"
    neighbours = f"{other} neighbours of {kind}, {symbol}"
    bins = np.linspace(min_density, 0, 150)

    # Scatter plot with the Spearman correlation shown in a text box.
    coef, p = spearmanr(true_df[degree_col], true_df[density_col])
    sns.scatterplot(
        x=degree_col, y=density_col, data=true_df, **(scatter_kwargs or {})
    )
    log10_str = "$log_{10}$"
    plt.xlabel(f"Number of {neighbours}")
    plt.ylabel("$log_{10}$ recovery density")
    plt.title(
        f"Scatter plot of {log10_str} recovery density\n"
        f"by the number of {neighbours},\n"
        "in known pairs, $C_n-P_n$"
    )
    props = dict(boxstyle="round", alpha=0.5, facecolor="white")
    ax = plt.gca()
    # NOTE(review): the first line shows the correlation coefficient and the
    # second the p-value, yet both are labelled "p" - presumably the first
    # was meant to be rho; confirm before changing the figure text.
    ax.text(
        0.8,
        0.95,
        f"$p$ = {coef:.2f}\np = {p:.2f}",
        bbox=props,
        transform=ax.transAxes,
        verticalalignment="top",
        fontsize=10,
    )
    plt.tight_layout()
    _save_figure(infile, scatter_suffix)

    # Histogram of the negative ("unknown") edges.
    sns.distplot(
        list(neg_dens.values()),
        label="Unknown edges",
        kde=False,
        bins=bins,
        color="red",
    )
    _finish_histogram(infile, f"{hist_suffix}a")

    # Histogram of the known edges, split by degree; NaN densities are dropped.
    has_density = true_df[density_col].notna()
    groups = (
        (true_df[degree_col] == 1, f"Known, {kind} degree = 1", "blue"),
        (true_df[degree_col] > 1, f"Known, {kind} degree > 1", "green"),
    )
    for mask, label, colour in groups:
        sns.distplot(
            true_df.loc[has_density & mask, density_col].tolist(),
            label=label,
            color=colour,
            kde=False,
            bins=bins,
        )
    _finish_histogram(infile, f"{hist_suffix}b")

    logger.info(f"Recovery density comparisons of edges by {kind} degree:")

    degree_one = true_df[true_df[degree_col] == 1][density_col]
    degree_many = true_df[true_df[degree_col] > 1][density_col]
    negatives = list(neg_dens.values())

    logger.debug(f"Median recovery density for degree = 1: {np.median(degree_one)}")
    logger.debug(f"Median recovery density for degree > 1: {np.median(degree_many)}")
    logger.debug(f"Median recovery density for false edges: {np.median(negatives)}")

    comparisons = (
        ("Degree == 1 vs negative edges", negatives, degree_one),
        ("Degree > 1 vs negative edges", negatives, degree_many),
        ("Degree > 1 vs Degree == 1", degree_one, degree_many),
    )
    for description, first, second in comparisons:
        logger.debug(description)
        # Fresh list copies per call, so no sample object is shared between tests.
        r = mcpt.permutation_test(
            list(first),
            list(second),
            "median",
            side="greater",
            n=1_000,
        )
        logger.debug(f"{r}")


def main():
    """Evaluate recovery density on each results graph and write the figures.

    For every GML file (processed smallest first) this:
      1. reads the graph and obtains its true edges;
      2. adds self-loops to every ``SEQ*`` / ``CPD*`` node and clusters both
         node sets with ``run_wpgma`` over a range of thresholds;
      3. computes recovery densities for negative and true edges;
      4. draws a ROC curve, a similarity scatter plot, violin plots, and
         per-degree scatter plots and histograms, each saved as a PNG next to
         the input file;
      5. logs medians and median permutation tests comparing the groups.
    """
    files = [
        f"results/{i}"
        for i in (
            "2020_05_11_icrc.msgpack.gml",
            "2020_05_11_ecrc.msgpack.gml",
            "2020_05_11_phenol_hydrox.msgpack.gml",
            "2020_05_11_o2a.msgpack.gml",
        )
    ]
    files.sort(key=os.path.getsize)

    for infile in files:
        logger.info(f"Reading GML file: {infile}")

        G = nx.read_gml(infile)
        true_edges = obtain_true_edges(G)
        print(f"There are {len(true_edges)} true edges")

        cpds = [node for node in G.nodes if node.startswith("CPD")]
        seqs = [node for node in G.nodes if node.startswith("SEQ")]
        print(f"There are {len(cpds)} compounds")
        print(f"There are {len(seqs)} proteins")

        # Add a self-loop with weight 1 under the relevant key to every node.
        for seq in seqs:
            G.add_edge(seq, seq, identity=1)
        for cpd in cpds:
            G.add_edge(cpd, cpd, similarity=1)

        logger.info("Generating compound clusters")
        cpd_clusters = {
            round(t, 1): run_wpgma(
                G,
                cpds,
                similarity_key="similarity",
                threshold=t,
                cluster_label="CPD_CLUSTER",
            )
            for t in np.linspace(0, 1, 11)
        }

        logger.info("Generating protein clusters")
        seq_clusters = {
            round(t, 1): run_wpgma(
                G,
                seqs,
                similarity_key="identity",
                threshold=t,
                cluster_label="SEQ_CLUSTER",
            )
            for t in np.linspace(0, 0.7, 8)
        }

        neg_dens = test_recovery_negative_edges(G, cpd_clusters, seq_clusters)
        pos_dens = test_recovery_true_edges(
            G, cpd_clusters, seq_clusters, true_edges=true_edges
        )

        # Smallest density over both edge types; lower edge of all histogram bins.
        min_density = min(
            list(neg_dens.values())
            + [i["log10 recovery density"] for i in pos_dens.values()]
        )

        # Create a data frame.
        true_df = pd.DataFrame(pos_dens.values())
        n_irrecoverable = len(
            true_df[true_df["log10 recovery density"] == np.log10(2)]
        )
        logger.warning(f"There are {n_irrecoverable} irrecoverable edges.")

        # ROC curve: every observed density is tried as a decision threshold.
        thresholds = sorted(
            set(neg_dens.values()) | set(true_df["log10 recovery density"])
        )

        x, y = [0], [0]
        vline = None  # FPR at the largest threshold that is still <= -1, if any.

        for threshold in thresholds:
            # True-positive rate is TP / (TP + FN) -- the proportion of truths correctly called true.
            tpr = len(
                true_df[true_df["log10 recovery density"] <= threshold]
            ) / len(true_df)
            # False-positive rate is FP / (TN + FP) -- the proportion of falsehoods incorrectly called true.
            fpr = sum(1 for v in neg_dens.values() if v <= threshold) / len(neg_dens)
            if threshold <= -1:
                vline = fpr
            y.append(tpr)
            x.append(fpr)
        x.append(1)
        y.append(1)

        auc = np.trapz(y, x=x)
        print(auc)

        plt.plot(x, y, marker=".", label="Recovery density")
        plt.plot([0, 1], [0, 1], linestyle="--", label="Random")
        if vline is not None:
            plt.axvline(vline, color="gray", linestyle="--", alpha=0.3)
        plt.xlabel("False positive rate")
        plt.ylabel("True positive rate")
        plt.title(f"Reciever operating characteristic curve comparing\nrecovery density to a random search\n(AUC={auc:.2f})\n")
        plt.legend()
        plt.tight_layout()
        # Saving also closes the figure; without the close, the scatter plot
        # below would be drawn on top of the ROC curve.
        _save_figure(infile, "ROC")

        one_degree_protein = len(true_df[true_df["protein degree"] == 1])
        one_degree_compound = len(true_df[true_df["compound degree"] == 1])
        one_degree_both = len(
            true_df[
                (true_df["compound degree"] == 1) & (true_df["protein degree"] == 1)
            ]
        )
        logger.info(f"There are {one_degree_protein} edges with protein degree == 1")
        logger.info(f"There are {one_degree_compound} edges with compound degree == 1")
        logger.info(f"There are {one_degree_both} edges with both compound and protein degree == 1")

        # There is no such thing as an "irrecoverable edge", because at worst, the
        # recovery density should be 0 (when thresholds are 0, and everything is
        # lumped together in one cluster on each side).

        # Make scatter plot of recovery density vs the closest compound of a pair.
        points = plt.scatter(
            true_df["closest compound similarity"],
            true_df["closest sequence similarity"],
            c=true_df["log10 recovery density"],
            cmap="Spectral",
            edgecolor="black",
            linewidth=0.1,
            marker=".",
            s=50,
        )
        plt.xlim(-0.05, 1.05)
        plt.ylim(-0.05, 1.05)

        cbar = plt.colorbar(points)
        cbar.ax.set_ylabel("$log_{10}$ recovery density")

        plt.title(
            "Distribution of known edges, $C_n-P_n$,\n"
            "according to protein identity and compound similarity\n"
        )
        plt.xlabel("Similarity with most similar compound to $C_n$")
        plt.ylabel("Identity with most similar protein to $P_n$")
        plt.tight_layout()
        _save_figure(infile, "Fig1")

        # Add extra columns that mark whether each degree is greater than one.
        true_df["compound degree > 1"] = np.where(
            true_df["compound degree"] > 1, "> 1", "1"
        )
        true_df["protein degree > 1"] = np.where(
            true_df["protein degree"] > 1, "> 1", "1"
        )

        # Violin plots of the recovery density, one per entity type.
        _plot_degree_violin(true_df, infile, "compound", "protein", "$C_n$", "Fig2")
        _plot_degree_violin(true_df, infile, "protein", "compound", "$P_n$", "Fig3")

        # Scatter plots, histograms and permutation tests, one pass per entity type.
        _analyse_degree(
            true_df,
            neg_dens,
            min_density,
            infile,
            kind="compound",
            other="protein",
            symbol="$C_n$",
            scatter_suffix="Fig4",
            hist_suffix="Fig5",
            scatter_kwargs={"marker": ".", "s": 6},
        )
        _analyse_degree(
            true_df,
            neg_dens,
            min_density,
            infile,
            kind="protein",
            other="compound",
            symbol="$P_n$",
            scatter_suffix="Fig6",
            hist_suffix="Fig7",
        )
        logger.info("Done!")
Esempio n. 6
0
def test_ix():
    """Seeded runs must be reproducible and unseeded runs must vary.

    For every combination of side and core count, and for both the plain and
    the correlation permutation test, this checks that:
      - two runs with the same seed compare equal;
      - within ten attempts an unseeded run differs from the seeded one;
      - within ten attempts a further unseeded run differs from the last
        unseeded one;
      - the input lists are left unmodified.
    """
    # Reference copies used afterwards to detect in-place modification.
    x0 = [-2.31, 1.06, 0.76, 1.38, -0.26, 1.29, -1.31, 0.41, -0.67, -0.58]
    y0 = [-1.08, 1.03, 0.90, 0.24, -0.24, 0.76, -0.57, -0.05, -1.28, 1.04]

    x = list(x0)
    y = list(y0)

    seed = 4919
    n = 10000
    # Each entry pairs a test entry point with the statistic it is run with.
    cases = [
        (mcpt.permutation_test, "mean"),
        (mcpt.correlation_permutation_test, "pearsonr"),
    ]

    for side in ["greater", "lower", "both"]:
        for cores in [1, 2]:
            for run_test, statistic in cases:
                # Two runs sharing a seed must give equivalent results.
                seeded_first = run_test(
                    x, y, statistic, side, n=n, cores=cores, seed=seed
                )
                seeded_second = run_test(
                    x, y, statistic, side, n=n, cores=cores, seed=seed
                )
                assert seeded_first == seeded_second

                # Up to ten unseeded runs: one of them must differ from the seeded result.
                found_difference = False
                for _ in range(10):
                    unseeded = run_test(x, y, statistic, side, n=n, cores=cores)
                    if seeded_first != unseeded:
                        found_difference = True
                        break
                if not found_difference:
                    raise Exception("result_a always identical to result_c")

                # Up to ten more unseeded runs: one must differ from the previous unseeded result.
                found_difference = False
                for _ in range(10):
                    unseeded_again = run_test(
                        x, y, statistic, side, n=n, cores=cores
                    )
                    if unseeded != unseeded_again:
                        found_difference = True
                        break
                if not found_difference:
                    raise Exception("result_c always identical to result_d")

                # The calls must leave both inputs exactly as they were.
                assert x0 == x
                assert y0 == y