def test_paired_two_sample_transform():
    """Smoke-test paired_two_sample_transform on paired and on unpaired data."""
    np.random.seed(1234)
    shift = 0.3

    # case 1: paired data (Y is X plus a constant shift)
    X_paired = np.random.normal(0, 1, 1000).reshape(-1, 1)
    Y_paired = X_paired + shift

    # transform to an independence-test problem and run fast MGC
    u, v = paired_two_sample_transform(X_paired, Y_paired)
    mgc = MGC()
    p_value, p_value_metadata = mgc.p_value(u, v, is_fast=True)
    print(p_value, p_value_metadata)
    # assert np.allclose(p_value, 1.0, atol=0.1)

    # case 2: unpaired data (independent draws, means differ by the shift)
    X_unpaired = np.random.normal(0, 1, 1000).reshape(-1, 1)
    Y_unpaired = np.random.normal(shift, 1, 1000).reshape(-1, 1)

    # transform to an independence-test problem and run fast MGC
    u, v = paired_two_sample_transform(X_unpaired, Y_unpaired)
    mgc = MGC()
    p_value, p_value_metadata = mgc.p_value(u, v, is_fast=True)
    print(p_value, p_value_metadata)
def test_mgc_test_all():
    """Check MGC (full permutation test) against precomputed reference results
    for every simulation in the data directory."""
    data_dir = "./mgcpy/independence_tests/unit_tests/mgc/data/"
    simulations = ["linear_sim", "exp_sim", "cub_sim", "joint_sim", "step_sim",
                   "quad_sim", "w_sim", "spiral_sim", "ubern_sim", "log_sim",
                   "root_sim", "sin_sim", "sin_sim_16", "square_sim",
                   "two_parab_sim", "circle_sim", "ellipsis_sim", "square_sim_",
                   "multi_noise_sim", "multi_indep_sim"]

    print("\nSimulations being used to test MGC: ")
    for sim_name in simulations:
        print(sim_name)
        X = np.genfromtxt(data_dir + "input/" + sim_name + "_x.csv",
                          delimiter=',').reshape(-1, 1)
        Y = np.genfromtxt(data_dir + "input/" + sim_name + "_y.csv",
                          delimiter=',').reshape(-1, 1)

        if sim_name == "step_sim":
            # step_sim's result file has a header row and a column layout
            # that load_results does not handle, so parse it directly.
            raw = np.genfromtxt(data_dir + "mgc/" + sim_name + "_res.csv",
                                delimiter=',')[1:]
            pMGC = raw[:, 0][0]
            statMGC = raw[:, 1][0]
            # pLocalCorr = raw[:, 2:4]
            localCorr = raw[:, 4:6]
            optimalScale = raw[:, 6:8][0]
        else:
            pMGC, statMGC, _, localCorr, optimalScale = load_results(sim_name + "_res.csv")

        mgc = MGC()
        p_value, metadata = mgc.p_value(X, Y)

        assert np.allclose(statMGC, metadata["test_statistic"])
        assert np.allclose(localCorr, metadata["local_correlation_matrix"])
        assert np.allclose(optimalScale, metadata["optimal_scale"])
        # the p-value comes from random permutations, so only compare loosely
        assert np.allclose(pMGC, p_value, atol=0.1)
def test_fast_mgc_test_all():
    """Check fast MGC (is_fast=True) against precomputed reference results
    for every simulation in the data directory.

    NOTE(review): this function was originally also named
    ``test_mgc_test_all``, which shadowed the slow-MGC test of the same name
    defined earlier in this file — pytest then collected only one of the two.
    Renamed so both tests run.
    """
    data_dir = "./mgcpy/independence_tests/unit_tests/mgc/data/"
    simulations = ["linear_sim", "exp_sim", "cub_sim", "joint_sim", "step_sim",
                   "quad_sim", "w_sim", "spiral_sim", "ubern_sim", "log_sim",
                   "root_sim", "sin_sim", "sin_sim_16", "square_sim",
                   "two_parab_sim", "circle_sim", "ellipsis_sim", "square_sim_",
                   "multi_noise_sim", "multi_indep_sim"]

    print("\nSimulations being used to test MGC: ")
    for simulation in simulations:
        np.random.seed(1234)  # to ensure same permutation results are produced
        print(simulation)
        X = np.genfromtxt(data_dir + "input/" + simulation + "_x.csv",
                          delimiter=',').reshape(-1, 1)
        Y = np.genfromtxt(data_dir + "input/" + simulation + "_y.csv",
                          delimiter=',').reshape(-1, 1)

        if simulation == "step_sim":
            # step_sim's fast-result file has a layout load_results does not
            # handle, so parse it directly.
            mgc_results = np.genfromtxt(data_dir + "fast_mgc/" + simulation + "_fast_res.csv",
                                        delimiter=',')
            pMGC = mgc_results[:, 0][0]
            statMGC = mgc_results[:, 1][0]
            localCorr = mgc_results[:, 2:4]
            # the optimal scale is stored as a flat index into a 50x50 grid;
            # unravel it to (row, col) and add 1 to match Matlab indexing
            optimalScale = np.array(np.unravel_index(int(mgc_results[:, 4][0]) - 1,
                                                     (50, 50))) + 1
        else:
            pMGC, statMGC, localCorr, optimalScale = load_results(simulation + "_fast_res.csv")

        mgc = MGC()
        p_value, metadata = mgc.p_value(X, Y, is_fast=True)

        assert np.allclose(pMGC, p_value, atol=1.e-2)
        assert np.allclose(statMGC, metadata["test_statistic"], rtol=1.e-4)
        assert np.allclose(localCorr, metadata["local_correlation_matrix"], rtol=1.e-4)
        assert np.allclose(optimalScale, metadata["optimal_scale"])
def test_fast_mgc_test_linear():
    # Fast MGC on strongly linear data: the test should be highly significant,
    # so the expected p-value is the permutation-test floor of 1/1000.
    # linear (mgc.sims.linear(50, 1, 0.1))
    X = np.array([
        0.45233912, 0.41776082, 0.08992314, -0.68255391, -0.65492209,
        0.24839759, -0.87725133, 0.32595345, -0.08646498, -0.16638085,
        0.26394850, 0.72925148, 0.26028888, -0.59854218, -0.80068479,
        -0.69199885, 0.14915159, 0.37115868, 0.96039213, 0.27498675,
        -0.01372958, -0.89370963, 0.78395670, -0.42157105, -0.13893970,
        0.50943310, -0.12623322, -0.20255325, 0.18437355, -0.02945578,
        0.78082317, 0.39372362, -0.37730187, -0.17078540, 0.70129955,
        0.83651364, 0.73375401, -0.34883304, 0.15323405, 0.51493599,
        -0.24317493, 0.83948953, 0.77216592, 0.90045095, -0.53736592,
        -0.88430486, 0.31447365, 0.66595322, -0.15917153, -0.38190466
    ]).reshape(-1, 1)
    Y = np.array([
        0.665986696, 0.402397835, 0.134445492, -0.796653997, -0.636592886,
        0.277283128, -0.636847542, 0.249515282, -0.149871134, -0.147567403,
        0.369251601, 0.687118553, 0.524448340, -0.585999355, -0.858549573,
        -0.756081985, 0.129307735, 0.180976113, 0.874637167, 0.458794276,
        -0.003339139, -0.967879037, 0.758180626, -0.392856219, -0.114772505,
        0.425345845, -0.069794980, -0.330857932, 0.229331072, 0.058739766,
        0.777801029, 0.580715974, -0.231521102, -0.233366160, 0.669360658,
        0.999785556, 0.648315305, -0.321119155, 0.156810807, 0.451349979,
        -0.393285002, 0.720164611, 0.811149183, 0.936183880, -0.587798720,
        -0.721394055, 0.233671350, 0.625407903, -0.154576153, -0.451475001
    ]).reshape(-1, 1)
    # smallest p-value achievable with 1000 permutations
    p_value = 1 / 1000
    mgc = MGC()
    p_value_res, _ = mgc.p_value(X, Y, is_fast=True)
    assert np.allclose(p_value, p_value_res, rtol=0.1)
def test_fast_mgc_test_non_linear():
    # Fast MGC on noisy spiral data: the expected p-value is a fixed
    # reference value (0.9742), checked to tight tolerance.
    # spiral data (mgc.sims.spiral(50, 1, 0.5))
    np.random.seed(1234)  # to ensure same permutation results are produced
    X = np.array([
        -0.915363905, 2.134736725, 1.591825890, -0.947720469, -0.629203447,
        0.157367412, -3.009624669, 0.342083914, 0.126834696, 2.009228424,
        0.137638139, -4.168139174, 1.854371040, 1.696600346, -2.454855196,
        1.770009913, -0.080973938, 1.985722698, 0.671279564, 1.521294941,
        -0.905490998, -1.043388333, 0.006493876, 4.007326886, 1.755316427,
        -0.905436337, 0.497332481, 0.819071238, 3.561837453, 3.713293152,
        0.487967353, 1.233385955, -2.985033861, 0.146394829, -2.231330093,
        -0.138580101, -2.390685794, -2.798259311, 0.647199716, -0.626705094,
        -0.254107788, 2.017131291, -2.871050739, -0.369874190, 0.198565130,
        2.021387946, -2.877629992, -1.855015175, -0.201316471, 3.886001079
    ]).reshape(-1, 1)
    Y = np.array([
        0.12441532, -2.63498763, 2.18349959, -0.58779997, -1.58602656,
        0.35894756, -0.73954299, 1.76585591, -0.35002851, 0.48618590,
        0.95628300, 1.99038991, 1.92277498, 1.34861841, 1.42509605,
        0.65982368, -1.56731299, -0.17000082, 1.81187432, -0.73726241,
        0.44491111, 0.19177688, 2.28190181, 0.45509215, -0.16777206,
        0.06918430, -1.49570722, 2.23337087, -1.01335025, -0.60394315,
        -0.56653502, -3.12571299, -1.56146565, 0.52487563, 2.35561329,
        -1.79300788, -2.40650123, 0.53680541, 2.04171052, 0.09821259,
        -0.42712911, 0.52453433, -1.44426759, -2.22697039, 1.26906442,
        -0.13549404, 0.36776719, -2.44674330, 1.34647206, 2.14525574
    ]).reshape(-1, 1)
    # reference p-value for this data/seed combination
    p_value = 0.9742
    mgc = MGC()
    p_value_res, _ = mgc.p_value(X, Y, is_fast=True)
    assert np.allclose(p_value, p_value_res, atol=1.e-4)
def test_k_sample():
    """Exercise k_sample_transform in the 2-sample and k-sample settings,
    both where H_0 should be rejected (salary data) and where it holds
    (two halves of one distribution)."""
    np.random.seed(1234)

    # prepare data
    salary_data = pd.read_csv("./mgcpy/hypothesis_tests/salary_data.csv")

    # 2 sample case: men's vs. women's salaries should differ (tiny p-value)
    men_salaries = salary_data.loc[salary_data['Gender'] == "M"]["Current Annual Salary"].values
    women_salaries = salary_data.loc[salary_data['Gender'] == "F"]["Current Annual Salary"].values
    u, v = k_sample_transform(np.random.choice(men_salaries, 1000),
                              np.random.choice(women_salaries, 1000))
    mgc = MGC()
    p_value, p_value_metadata = mgc.p_value(u, v, is_fast=True)
    assert np.allclose(p_value, 0.0, atol=0.01)

    # k sample case: salary should depend on department (tiny p-value)
    salaries = salary_data["Current Annual Salary"].values
    department_labels = salary_data["Department"].values.reshape(-1, 1)
    u, v = k_sample_transform(salaries[:100], department_labels[:100],
                              is_y_categorical=True)
    mgc = MGC()
    p_value, p_value_metadata = mgc.p_value(u, v)
    assert np.allclose(p_value, 0.0, atol=0.01)

    # 2 sample case (H_0 is valid)
    # generate 100 samples from the same distribution (x = np.random.randn(100))
    x = np.array([0.34270011, 1.30064541, -0.41888945, 1.40367111, 0.31901975,
                  -1.83695735, -0.70370144, 0.89338428, 0.86047303, -0.98841287,
                  0.78325279, 0.55864254, 0.33317265, 2.22286831, -0.22349382,
                  0.40376754, -1.05356267, 0.54994568, -1.39765046, 0.41427267,
                  -0.24457334, 0.2464725, -0.32179342, -1.77106008, -0.52824522,
                  1.57839019, -1.66455582, -0.97663735, -0.55176702, -1.95347702,
                  1.01934119, 1.05765468, -0.69941067, -1.12479123, 0.85236935,
                  -0.77356459, 0.30217738, 0.95246919, -0.61210025, 1.09253269,
                  0.13576324, 0.62642456, 0.1859519, 0.32209166, 1.98633424,
                  -0.57271182, 1.18247811, 2.05352048, -0.28297455, 0.25754106,
                  0.80790087, -0.26995007, 1.8223331, -1.80151834, 0.71496981,
                  -0.5119113, -1.45558062, 1.24115387, 1.44295579, -0.24726018,
                  -2.07078337, 1.90810404, -1.36892494, -0.39004086, 1.35998082,
                  1.50891149, -1.29257757, 0.05513461, -1.58889596, 0.48703248,
                  0.83443891, 0.46234541, 2.20457643, 1.47884097, -0.05316384,
                  0.72591566, 0.14339927, -1.29137912, 0.07908333, 0.80684167,
                  0.22417797, 0.45241074, -1.03024521, 0.6615743, 0.27216365,
                  2.4188678, 0.20561134, 0.71095061, -1.02478312, 0.54512964,
                  0.16582386, -0.39648338, -0.77905918, -0.33196771, 0.69407125,
                  -0.81484451, 3.01568098, -0.49053868, -0.60987204, 1.72967348])
    # assign half of them as samples from 1 and the other half as samples from 2
    y = np.concatenate([np.repeat(1, 50), np.repeat(2, 50)], axis=0).reshape(-1, 1)
    u, v = k_sample_transform(x, y, is_y_categorical=True)
    mgc = MGC()
    p_value, p_value_metadata = mgc.p_value(u, v)
    # both "samples" come from the same distribution, so p should be large
    assert np.allclose(p_value, 0.819, atol=0.1)
def compute_mgc(X, Y):
    """Run the MGC independence test on (X, Y) and print a short summary.

    Returns a tuple ``(mgc_statistic, p_value, independence_test_metadata)``:
    the MGC test statistic, the p-value of the test, and the metadata dict
    produced by the test-statistic computation.
    """
    mgc = MGC()
    # A statistic far from 0 (toward -1 or 1) suggests dependence between the
    # variables; a p-value below the chosen significance level (e.g. 0.05)
    # indicates the observed dependence is unlikely under independence.
    mgc_statistic, independence_test_metadata = mgc.test_statistic(X, Y)
    p_value, metadata = mgc.p_value(X, Y)

    print("MGC test statistic:", mgc_statistic)
    print("P Value:", p_value)
    # print("Optimal Scale:", independence_test_metadata["optimal_scale"])

    return mgc_statistic, p_value, independence_test_metadata
# -*- coding: utf-8 -*- """ Created on Fri Feb 15 12:13:46 2019 @author: sandhya """ test = 'ACE' import numpy as np from mgcpy.independence_tests.mgc import MGC import matplotlib.pyplot as plt; plt.style.use('classic') import seaborn as sns; sns.set() atlases = ('JHU','aal','brodmann','CPAC200','desikan','DK','HarOxCort','HarOxSub','hemispheric','pp264','tissue') results = np.zeros([1,11]) i = 0 for atlas in atlases: #atlas = 'tissue' X = np.load('Xintersect_c_'+atlas+'.npy') Y = np.load('Yintersect_c_'+atlas+'.npy') mgc = MGC() mgc_statistic, independence_test_metadata = mgc.test_statistic(X, Y) p_value, metadata = mgc.p_value(X, Y) print(atlas) print("MGC test statistic:", mgc_statistic) print("P Value:", p_value) results[0,i]=p_value i+=1 np.save(test+'_results',results)