def test_iv():
    """One-element samples: a 'lower' test must report a p-value of 1.0.

    The observed difference of means is 1 - 0 = 1. Randomisation yields
    [1] & [0] (diff = 1) half the time and [0] & [1] (diff = -1) the other
    half, so 100% of permutations are at least as extreme or lower than
    the observed difference.
    """
    sample_a = [1]
    sample_b = [0]
    result = mcpt.permutation_test(
        sample_a, sample_b, "mean", "lower", n=10000, seed=3919
    )
    # The expected p-value (1.0) must fall within the reported interval.
    assert result.lower <= 1
    assert result.upper >= 1
def test_vii():
    """permutation_test returns the correct result when cores > 1."""
    sample_a = [10, 9, 11]
    sample_b = [12, 11, 13]
    # Snapshots to verify the inputs are not mutated.
    snapshot_a = list(sample_a)
    snapshot_b = list(sample_b)
    result = mcpt.permutation_test(
        sample_a, sample_b, "mean", "lower", n=10000, cores=2, seed=4919
    )
    # The expected p-value (0.1) must fall within the reported interval.
    assert result.lower <= 0.1
    assert result.upper >= 0.1
    # The function must leave its inputs untouched.
    assert sample_b == snapshot_b
    assert sample_a == snapshot_a
def test_i():
    """One-sided (lower) permutation test on a worked example.

    From https://www.thoughtco.com/example-of-a-permutation-test-3997741
    the one-sided (lower) p-value should be 0.1.
    """
    sample_a = [10, 9, 11]
    sample_b = [12, 11, 13]
    # Snapshots to verify the inputs are not mutated.
    snapshot_a = list(sample_a)
    snapshot_b = list(sample_b)
    result = mcpt.permutation_test(
        sample_a, sample_b, "mean", "lower", n=10000, seed=3919
    )
    # The expected p-value (0.1) must fall within the reported interval.
    assert result.lower <= 0.1
    assert result.upper >= 0.1
    # The function must leave its inputs untouched.
    assert sample_a == snapshot_a
    assert sample_b == snapshot_b
def test_xi():
    """permutation_test accepts pandas Series (DataFrame column) input."""
    import pandas as pd
    import numpy as np

    values_a = [10, 9, 11]
    values_b = [12, 11, 13]
    frame_a = pd.DataFrame(columns=["Change"], data=values_a)
    frame_b = pd.DataFrame(columns=["Change"], data=values_b)
    # Independent frames with identical contents, to verify no mutation.
    snapshot_a = pd.DataFrame(columns=["Change"], data=values_a)
    snapshot_b = pd.DataFrame(columns=["Change"], data=values_b)
    result = mcpt.permutation_test(
        frame_a["Change"], frame_b["Change"], f="mean", side="lower", seed=6919
    )
    # The expected p-value (0.1) must fall within the reported interval.
    assert result.lower <= 0.1
    assert result.upper >= 0.1
    # The function must leave the frames untouched.
    assert snapshot_a.equals(frame_a)
    assert snapshot_b.equals(frame_b)
def main():
    """Benchmark recovery density as a predictor of true compound-protein
    edges.

    For each input GML graph: cluster compounds and proteins with WPGMA at a
    range of thresholds, compute recovery densities for known (positive) and
    unknown (negative) edges, then emit an ROC curve, scatter/violin/histogram
    figures, and permutation-test comparisons of the density distributions.

    Side effects only: reads `results/*.gml`, writes PNG figures next to the
    input names, and logs summaries.
    """
    files = [
        f"results/{i}"
        for i in (
            "2020_05_11_icrc.msgpack.gml",
            "2020_05_11_ecrc.msgpack.gml",
            "2020_05_11_phenol_hydrox.msgpack.gml",
            "2020_05_11_o2a.msgpack.gml",
        )
    ]
    # Process the smallest graphs first (fast feedback on cheap inputs).
    files.sort(key=os.path.getsize)
    for infile in files:
        logger.info(f"Reading GML file: {infile}")
        G = nx.read_gml(infile)
        true_edges = obtain_true_edges(G)
        print(f"There are {len(true_edges)} true edges")
        # G = shuffle_labels(G)
        cpds = [node for node in G.nodes if node.startswith("CPD")]
        seqs = [node for node in G.nodes if node.startswith("SEQ")]
        print(f"There are {len(cpds)} compounds")
        print(f"There are {len(seqs)} proteins")
        # Self-loops with maximal similarity/identity so singleton nodes are
        # handled consistently by the clustering. (Was a pair of side-effect
        # list comprehensions; plain loops are the idiomatic form.)
        for i in seqs:
            G.add_edge(i, i, identity=1)
        for i in cpds:
            G.add_edge(i, i, similarity=1)
        logger.info("Generating compound clusters")
        cpd_clusters = {
            round(t, 1): run_wpgma(
                G,
                cpds,
                similarity_key="similarity",
                threshold=t,
                cluster_label="CPD_CLUSTER",
            )
            for t in np.linspace(0, 1, 11)
        }
        logger.info("Generating protein clusters")
        seq_clusters = {
            round(t, 1): run_wpgma(
                G,
                seqs,
                similarity_key="identity",
                threshold=t,
                cluster_label="SEQ_CLUSTER",
            )
            for t in np.linspace(0, 0.7, 8)
        }
        neg_dens = test_recovery_negative_edges(G, cpd_clusters, seq_clusters)
        pos_dens = test_recovery_true_edges(
            G, cpd_clusters, seq_clusters, true_edges=true_edges
        )
        min_density = min(
            list(neg_dens.values())
            + [i["log10 recovery density"] for i in pos_dens.values()]
        )
        # Create a data frame.
        true_df = pd.DataFrame(pos_dens.values())
        logger.warning(f"There are {len(true_df[true_df['log10 recovery density'] == np.log10(2)])} irrecoverable edges.")
        # ROC curve: sweep every observed density as a decision threshold.
        thresholds = set(neg_dens.values())
        thresholds = thresholds | set(true_df["log10 recovery density"])
        thresholds = sorted(thresholds)
        x, y = [0], [0]
        vline = -float("inf")
        for threshold in thresholds:
            # True-positive rate is TP / (TP + FN) -- the proportion of
            # truths correctly called true.
            tpr = len(
                true_df[true_df["log10 recovery density"] <= threshold]
            ) / len(true_df)
            # False-positive rate is FP / (TN + FP) -- the proportion of
            # falsehoods incorrectly called true.
            fpr = sum(1 for v in neg_dens.values() if v <= threshold) / len(neg_dens)
            if threshold <= -1:
                vline = fpr
            y.append(tpr)
            x.append(fpr)
        x.append(1)
        y.append(1)
        auc = np.trapz(y, x=x)
        print(auc)
        plt.plot(x, y, marker=".", label="Recovery density")
        plt.plot([0, 1], [0, 1], linestyle="--", label="Random")
        if vline != -float("inf"):
            plt.axvline(vline, color="gray", linestyle="--", alpha=0.3)
        plt.xlabel("False positive rate")
        plt.ylabel("True positive rate")
        # FIX: "Reciever" -> "Receiver" in the user-facing title.
        plt.title(f"Receiver operating characteristic curve comparing\nrecovery density to a random search\n(AUC={auc:.2f})\n")
        plt.legend()
        plt.tight_layout()
        plt.savefig(f"{infile.rsplit('.', 1)[0]}_ROC.png", dpi=1000)
        # FIX: close the ROC figure; previously it was left open, so the
        # Fig1 scatter below was drawn on top of the ROC axes.
        plt.close()
        one_degree_protein = len(true_df[true_df["protein degree"] == 1])
        one_degree_compound = len(true_df[true_df["compound degree"] == 1])
        one_degree_both = len(
            true_df[
                (true_df["compound degree"] == 1)
                & (true_df["protein degree"] == 1)
            ]
        )
        logger.info(f"There are {one_degree_protein} edges with protein degree == 1")
        logger.info(f"There are {one_degree_compound} edges with compound degree == 1")
        logger.info(f"There are {one_degree_both} edges with both compound and protein degree == 1")
        # There is no such thing as an "irrecoverable edge", because at worst,
        # the recovery density should be 0 (when thresholds are 0, and
        # everything is lumped together in one cluster on each side).
        # Make scatter plot of recovery density vs the closest compound of a
        # pair.
        points = plt.scatter(
            true_df["closest compound similarity"],
            true_df["closest sequence similarity"],
            c=true_df["log10 recovery density"],
            cmap="Spectral",
            edgecolor='black',
            linewidth=0.1,
            marker=".",
            s=50,
        )
        plt.xlim(-0.05, 1.05)
        plt.ylim(-0.05, 1.05)
        cbar = plt.colorbar(points)
        cbar.ax.set_ylabel("$log_{10}$ recovery density")
        plt.title(
            "Distribution of known edges, $C_n-P_n$,\n"
            "according to protein identity and compound similarity\n"
        )
        plt.xlabel("Similarity with most similar compound to $C_n$")
        plt.ylabel("Identity with most similar protein to $P_n$")
        plt.tight_layout()
        plt.savefig(f"{infile.rsplit('.', 1)[0]}_Fig1.png", dpi=1000)
        plt.close()
        # Add an extra column that marks whether the compound degree is
        # greater than one (categorical grouping for the violin plots).
        true_df["compound degree > 1"] = np.where(
            true_df["compound degree"] > 1, "> 1", "1"
        )
        true_df["protein degree > 1"] = np.where(
            true_df["protein degree"] > 1, "> 1", "1"
        )
        # Make violin plot of the recovery density.
        sns.violinplot(
            x="compound degree > 1",
            y="log10 recovery density",
            data=true_df,
            bw=0.1,
        )
        plt.title(
            "Violin plot of recovery densities by number of protein\n"
            "neighbours of compound, $C_n$, in known pairs, $C_n-P_n$\n"
        )
        plt.xlabel("Number of protein neighbours of compound, $C_n$")
        plt.ylabel("$log_{10}$ recovery density")
        plt.tight_layout()
        plt.savefig(f"{infile.rsplit('.', 1)[0]}_Fig2.png", dpi=1000)
        plt.close()
        # Make violin plot of the recovery density.
        sns.violinplot(
            x="protein degree > 1",
            y="log10 recovery density",
            data=true_df,
            bw=0.1,
        )
        plt.title(
            "Violin plot of recovery densities by number of compound\n"
            "neighbours of protein, $P_n$, in known pairs, $C_n-P_n$\n"
        )
        plt.xlabel("Number of compound neighbours of protein, $P_n$")
        plt.ylabel("$log_{10}$ recovery density")
        plt.tight_layout()
        plt.savefig(f"{infile.rsplit('.', 1)[0]}_Fig3.png", dpi=1000)
        plt.close()
        # Make scatter plot (recovery density vs compound degree, with
        # Spearman correlation annotated).
        coef, p = spearmanr(
            true_df["compound degree"], true_df["log10 recovery density"]
        )
        sns.scatterplot(
            x="compound degree",
            y="log10 recovery density",
            data=true_df,
            marker=".",
            s=6,
        )
        log10_str = "$log_{10}$"
        plt.xlabel("Number of protein neighbours of compound, $C_n$")
        plt.ylabel("$log_{10}$ recovery density")
        plt.title(
            f"Scatter plot of {log10_str} recovery density\n"
            "by the number of protein neighbours of compound, $C_n$,\n"
            "in known pairs, $C_n-P_n$"
        )
        props = dict(boxstyle="round", alpha=0.5, facecolor="white")
        ax = plt.gca()
        ax.text(
            0.8,
            0.95,
            f"$p$ = {coef:.2f}\np = {p:.2f}",
            bbox=props,
            transform=ax.transAxes,
            verticalalignment="top",
            fontsize=10,
        )
        plt.tight_layout()
        plt.savefig(f"{infile.rsplit('.', 1)[0]}_Fig4.png", dpi=1000)
        plt.close()
        # Distribution plots.
        sns.distplot(
            [i for i in neg_dens.values()],
            label="Unknown edges",
            kde=False,
            bins=np.linspace(min_density, 0, 150),
            color="red",
        )
        plt.xlabel("$log_{10}$ recovery density")
        plt.ylabel("Frequency")
        plt.title(
            "Histogram showing distributions\nof recovery density by edge type\n"
        )
        plt.tight_layout()
        plt.legend()
        plt.savefig(f"{infile.rsplit('.', 1)[0]}_Fig5a.png", dpi=1000)
        plt.close()
        sns.distplot(
            [
                i["log10 recovery density"]
                for _, i in true_df.iterrows()
                if not np.isnan(i["log10 recovery density"])
                and i["compound degree"] == 1
            ],
            label="Known, compound degree = 1",
            color="blue",
            kde=False,
            bins=np.linspace(min_density, 0, 150),
        )
        sns.distplot(
            [
                i["log10 recovery density"]
                for _, i in true_df.iterrows()
                if not np.isnan(i["log10 recovery density"])
                and i["compound degree"] > 1
            ],
            label="Known, compound degree > 1",
            color="green",
            kde=False,
            bins=np.linspace(min_density, 0, 150),
        )
        plt.xlabel("$log_{10}$ recovery density")
        plt.ylabel("Frequency")
        plt.title(
            "Histogram showing distributions\nof recovery density by edge type\n"
        )
        plt.tight_layout()
        plt.legend()
        plt.savefig(f"{infile.rsplit('.', 1)[0]}_Fig5b.png", dpi=1000)
        plt.close()
        logger.info("Recovery density comparisons of edges by compound degree:")
        logger.debug(f'Median recovery density for degree = 1: {np.median(true_df[true_df["compound degree"] == 1]["log10 recovery density"])}')
        logger.debug(f'Median recovery density for degree > 1: {np.median(true_df[true_df["compound degree"] > 1]["log10 recovery density"])}')
        logger.debug(f"Median recovery density for false edges: {np.median(list(neg_dens.values()))}")
        logger.debug("Degree == 1 vs negative edges")
        r = mcpt.permutation_test(
            list(neg_dens.values()),
            list(
                true_df[true_df["compound degree"] == 1]["log10 recovery density"]
            ),
            "median",
            side="greater",
            n=1_000,
        )
        logger.debug(f"{r}")
        logger.debug("Degree > 1 vs negative edges")
        r = mcpt.permutation_test(
            list(neg_dens.values()),
            list(
                true_df[true_df["compound degree"] > 1]["log10 recovery density"]
            ),
            "median",
            side="greater",
            n=1_000,
        )
        logger.debug(f"{r}")
        logger.debug("Degree > 1 vs Degree == 1")
        r = mcpt.permutation_test(
            list(
                true_df[true_df["compound degree"] == 1]["log10 recovery density"]
            ),
            list(
                true_df[true_df["compound degree"] > 1]["log10 recovery density"]
            ),
            "median",
            side="greater",
            n=1_000,
        )
        logger.debug(f"{r}")
        # Make scatter plot (recovery density vs protein degree).
        coef, p = spearmanr(
            true_df["protein degree"], true_df["log10 recovery density"]
        )
        sns.scatterplot(
            x="protein degree", y="log10 recovery density", data=true_df
        )
        log10_str = "$log_{10}$"
        plt.xlabel("Number of compound neighbours of protein, $P_n$")
        plt.ylabel("$log_{10}$ recovery density")
        plt.title(
            f"Scatter plot of {log10_str} recovery density\n"
            "by the number of compound neighbours of protein, $P_n$,\n"
            "in known pairs, $C_n-P_n$"
        )
        props = dict(boxstyle="round", alpha=0.5, facecolor="white")
        ax = plt.gca()
        ax.text(
            0.8,
            0.95,
            f"$p$ = {coef:.2f}\np = {p:.2f}",
            bbox=props,
            transform=ax.transAxes,
            verticalalignment="top",
            fontsize=10,
        )
        plt.tight_layout()
        plt.savefig(f"{infile.rsplit('.', 1)[0]}_Fig6.png", dpi=1000)
        plt.close()
        # Distribution plots.
        sns.distplot(
            [i for i in neg_dens.values()],
            label="Unknown edges",
            kde=False,
            bins=np.linspace(min_density, 0, 150),
            color="red",
        )
        plt.xlabel("$log_{10}$ recovery density")
        plt.ylabel("Frequency")
        plt.title(
            "Histogram showing distributions\nof recovery density by edge type\n"
        )
        plt.tight_layout()
        plt.legend()
        plt.savefig(f"{infile.rsplit('.', 1)[0]}_Fig7a.png", dpi=1000)
        plt.close()
        sns.distplot(
            [
                i["log10 recovery density"]
                for _, i in true_df.iterrows()
                if not np.isnan(i["log10 recovery density"])
                and i["protein degree"] == 1
            ],
            label="Known, protein degree = 1",
            kde=False,
            bins=np.linspace(min_density, 0, 150),
            color="blue",
        )
        sns.distplot(
            [
                i["log10 recovery density"]
                for _, i in true_df.iterrows()
                if not np.isnan(i["log10 recovery density"])
                and i["protein degree"] > 1
            ],
            label="Known, protein degree > 1",
            kde=False,
            bins=np.linspace(min_density, 0, 150),
            color="green",
        )
        plt.xlabel("$log_{10}$ recovery density")
        plt.ylabel("Frequency")
        plt.title(
            "Histogram showing distributions\nof recovery density by edge type\n"
        )
        plt.tight_layout()
        plt.legend()
        plt.savefig(f"{infile.rsplit('.', 1)[0]}_Fig7b.png", dpi=1000)
        plt.close()
        logger.info("Recovery density comparisons of edges by protein degree:")
        logger.debug(f'Median recovery density for degree = 1: {np.median(true_df[true_df["protein degree"] == 1]["log10 recovery density"])}')
        logger.debug(f'Median recovery density for degree > 1: {np.median(true_df[true_df["protein degree"] > 1]["log10 recovery density"])}')
        logger.debug(f"Median recovery density for false edges: {np.median(list(neg_dens.values()))}")
        logger.debug("Degree == 1 vs negative edges")
        r = mcpt.permutation_test(
            list(neg_dens.values()),
            list(
                true_df[true_df["protein degree"] == 1]["log10 recovery density"]
            ),
            "median",
            side="greater",
            n=1_000,
        )
        logger.debug(f"{r}")
        logger.debug("Degree > 1 vs negative edges")
        r = mcpt.permutation_test(
            list(neg_dens.values()),
            list(true_df[true_df["protein degree"] > 1]["log10 recovery density"]),
            "median",
            side="greater",
            n=1_000,
        )
        logger.debug(f"{r}")
        logger.debug("Degree > 1 vs Degree == 1")
        r = mcpt.permutation_test(
            list(
                true_df[true_df["protein degree"] == 1]["log10 recovery density"]
            ),
            list(true_df[true_df["protein degree"] > 1]["log10 recovery density"]),
            "median",
            side="greater",
            n=1_000,
        )
        logger.debug(f"{r}")
    # NOTE(review): placement of this final message relative to the file loop
    # could not be recovered from the mangled source; logged once at the end.
    logger.info("Done!")
def test_ix():
    """Seeding produces reproducible results, and unseeded runs vary.

    For every side and core count, and for both ``permutation_test`` and
    ``correlation_permutation_test``: two runs with the same seed must be
    identical, while repeated unseeded runs must eventually differ both
    from the seeded result and from each other.
    """
    x0 = [-2.31, 1.06, 0.76, 1.38, -0.26, 1.29, -1.31, 0.41, -0.67, -0.58]
    y0 = [-1.08, 1.03, 0.90, 0.24, -0.24, 0.76, -0.57, -0.05, -1.28, 1.04]
    x = list(x0)
    y = list(y0)
    seed = 4919
    n = 10000

    for side in ("greater", "lower", "both"):
        for cores in (1, 2):
            # The same checks apply to both test functions; the original
            # duplicated this section per function.
            for test_fn, statistic in (
                (mcpt.permutation_test, "mean"),
                (mcpt.correlation_permutation_test, "pearsonr"),
            ):
                # Two runs with the same seed must be equivalent.
                result_a = test_fn(
                    x, y, statistic, side, n=n, cores=cores, seed=seed
                )
                result_b = test_fn(
                    x, y, statistic, side, n=n, cores=cores, seed=seed
                )
                assert result_a == result_b

                # Up to ten unseeded runs: at least one must differ from
                # the seeded result.
                for _ in range(10):
                    result_c = test_fn(x, y, statistic, side, n=n, cores=cores)
                    if result_c != result_a:
                        break
                else:
                    raise Exception("result_a always identical to result_c")

                # Up to ten further unseeded runs: at least one must differ
                # from the previous unseeded result.
                for _ in range(10):
                    result_d = test_fn(x, y, statistic, side, n=n, cores=cores)
                    if result_d != result_c:
                        break
                else:
                    raise Exception("result_c always identical to result_d")

                # The inputs must not have been mutated.
                assert x == x0
                assert y == y0