Exemple #1
0
def test_paired_two_sample_transform():
    """Run fast MGC on the output of ``paired_two_sample_transform``.

    Two scenarios with 1000 samples each are exercised:

    1. paired data, where Y is X shifted by a constant;
    2. unpaired data, where X and Y are drawn separately from normal
       distributions whose means differ by that same constant.

    The p-value and its metadata are only printed for each scenario;
    this test currently asserts nothing about them.
    """
    shift = 0.3
    n_samples = 1000
    np.random.seed(1234)  # fixed seed so the random draws are repeatable

    # Scenario 1: paired data (Y is an exact shifted copy of X).
    x_paired = np.random.normal(loc=0, scale=1, size=n_samples).reshape(-1, 1)
    y_paired = x_paired + shift

    # Transform the two samples, then run the fast MGC independence test.
    u, v = paired_two_sample_transform(x_paired, y_paired)
    paired_p_value, paired_metadata = MGC().p_value(u, v, is_fast=True)
    print(paired_p_value, paired_metadata)
    # NOTE: the check below was disabled in the original test and stays so.
    # assert np.allclose(p_value, 1.0, atol=0.1)

    # Scenario 2: unpaired data (X and Y are drawn independently).
    x_unpaired = np.random.normal(loc=0, scale=1, size=n_samples).reshape(-1, 1)
    y_unpaired = np.random.normal(loc=shift, scale=1, size=n_samples).reshape(-1, 1)

    # Same transform + fast MGC pipeline as in scenario 1.
    u, v = paired_two_sample_transform(x_unpaired, y_unpaired)
    unpaired_p_value, unpaired_metadata = MGC().p_value(u, v, is_fast=True)
    print(unpaired_p_value, unpaired_metadata)
Exemple #2
0
def test_mgc_test_all():
    """Compare MGC output against stored reference results per simulation.

    For every simulation name, the X and Y inputs are read from CSV files,
    MGC is run on them, and the test statistic, local correlation matrix,
    optimal scale and p-value are compared with the stored reference values.
    """
    data_dir = "./mgcpy/independence_tests/unit_tests/mgc/data/"
    input_dir = data_dir + "input/"
    simulations = [
        "linear_sim", "exp_sim", "cub_sim", "joint_sim",
        "step_sim", "quad_sim", "w_sim", "spiral_sim",
        "ubern_sim", "log_sim", "root_sim", "sin_sim",
        "sin_sim_16", "square_sim", "two_parab_sim", "circle_sim",
        "ellipsis_sim", "square_sim_", "multi_noise_sim", "multi_indep_sim",
    ]

    print("\nSimulations being used to test MGC: ")
    for simulation in simulations:
        print(simulation)

        # Each input file is loaded as a single column vector.
        X = np.genfromtxt(input_dir + simulation + "_x.csv", delimiter=',').reshape(-1, 1)
        Y = np.genfromtxt(input_dir + simulation + "_y.csv", delimiter=',').reshape(-1, 1)

        if simulation != "step_sim":
            (expected_p_value, expected_statistic, _,
             expected_local_corr, expected_scale) = load_results(simulation + "_res.csv")
        else:
            # "step_sim" is parsed by hand instead of through load_results;
            # the first row of the file is dropped (presumably a header -
            # TODO confirm).
            reference = np.genfromtxt(data_dir + "mgc/" + simulation + "_res.csv", delimiter=',')[1:]
            expected_p_value = reference[0, 0]
            expected_statistic = reference[0, 1]
            # columns 2:4 (pLocalCorr in the original) are not used here
            expected_local_corr = reference[:, 4:6]
            expected_scale = reference[0, 6:8]

        p_value, metadata = MGC().p_value(X, Y)

        assert np.allclose(expected_statistic, metadata["test_statistic"])
        assert np.allclose(expected_local_corr, metadata["local_correlation_matrix"])
        assert np.allclose(expected_scale, metadata["optimal_scale"])
        # The p-value is compared with a loose absolute tolerance.
        assert np.allclose(expected_p_value, p_value, atol=0.1)
Exemple #3
0
def test_mgc_test_all():
    """Compare fast-MGC output against stored reference results per simulation.

    For every simulation name, the X and Y inputs are read from CSV files,
    MGC is run with ``is_fast=True``, and the p-value, test statistic, local
    correlation matrix and optimal scale are compared with the stored
    reference values.
    """
    data_dir = "./mgcpy/independence_tests/unit_tests/mgc/data/"
    input_dir = data_dir + "input/"
    simulations = [
        "linear_sim", "exp_sim", "cub_sim", "joint_sim",
        "step_sim", "quad_sim", "w_sim", "spiral_sim",
        "ubern_sim", "log_sim", "root_sim", "sin_sim",
        "sin_sim_16", "square_sim", "two_parab_sim", "circle_sim",
        "ellipsis_sim", "square_sim_", "multi_noise_sim", "multi_indep_sim",
    ]

    print("\nSimulations being used to test MGC: ")
    for simulation in simulations:
        # Reseed on every iteration to ensure same permutation results are produced.
        np.random.seed(1234)
        print(simulation)

        # Each input file is loaded as a single column vector.
        X = np.genfromtxt(input_dir + simulation + "_x.csv", delimiter=',').reshape(-1, 1)
        Y = np.genfromtxt(input_dir + simulation + "_y.csv", delimiter=',').reshape(-1, 1)

        if simulation != "step_sim":
            (expected_p_value, expected_statistic,
             expected_local_corr, expected_scale) = load_results(simulation + "_fast_res.csv")
        else:
            # "step_sim" is parsed by hand instead of through load_results.
            reference = np.genfromtxt(data_dir + "fast_mgc/" + simulation + "_fast_res.csv", delimiter=',')
            expected_p_value = reference[0, 0]
            expected_statistic = reference[0, 1]
            expected_local_corr = reference[:, 2:4]
            # Column 4 stores the optimal scale as a flat index into a 50x50
            # grid: subtract 1 before unravelling, then add 1 to each
            # coordinate to match Matlab indexing.
            flat_index = int(reference[0, 4]) - 1
            expected_scale = np.array(np.unravel_index(flat_index, (50, 50))) + 1

        p_value, metadata = MGC().p_value(X, Y, is_fast=True)

        assert np.allclose(expected_p_value, p_value, atol=1.e-2)
        assert np.allclose(expected_statistic, metadata["test_statistic"], rtol=1.e-4)
        assert np.allclose(expected_local_corr, metadata["local_correlation_matrix"], rtol=1.e-4)
        assert np.allclose(expected_scale, metadata["optimal_scale"])
Exemple #4
0
def test_fast_mgc_test_linear():
    """Check the fast MGC p-value on a fixed 50-point linear sample.

    The hard-coded data were generated with ``mgc.sims.linear(50, 1, 0.1)``;
    the fast MGC p-value is expected to be within 10% (relative) of 1/1000.
    """
    expected_p_value = 1 / 1000

    x_values = [
        0.45233912, 0.41776082, 0.08992314, -0.68255391, -0.65492209,
        0.24839759, -0.87725133, 0.32595345, -0.08646498, -0.16638085,
        0.26394850, 0.72925148, 0.26028888, -0.59854218, -0.80068479,
        -0.69199885, 0.14915159, 0.37115868, 0.96039213, 0.27498675,
        -0.01372958, -0.89370963, 0.78395670, -0.42157105, -0.13893970,
        0.50943310, -0.12623322, -0.20255325, 0.18437355, -0.02945578,
        0.78082317, 0.39372362, -0.37730187, -0.17078540, 0.70129955,
        0.83651364, 0.73375401, -0.34883304, 0.15323405, 0.51493599,
        -0.24317493, 0.83948953, 0.77216592, 0.90045095, -0.53736592,
        -0.88430486, 0.31447365, 0.66595322, -0.15917153, -0.38190466
    ]
    y_values = [
        0.665986696, 0.402397835, 0.134445492, -0.796653997, -0.636592886,
        0.277283128, -0.636847542, 0.249515282, -0.149871134, -0.147567403,
        0.369251601, 0.687118553, 0.524448340, -0.585999355, -0.858549573,
        -0.756081985, 0.129307735, 0.180976113, 0.874637167, 0.458794276,
        -0.003339139, -0.967879037, 0.758180626, -0.392856219, -0.114772505,
        0.425345845, -0.069794980, -0.330857932, 0.229331072, 0.058739766,
        0.777801029, 0.580715974, -0.231521102, -0.233366160, 0.669360658,
        0.999785556, 0.648315305, -0.321119155, 0.156810807, 0.451349979,
        -0.393285002, 0.720164611, 0.811149183, 0.936183880, -0.587798720,
        -0.721394055, 0.233671350, 0.625407903, -0.154576153, -0.451475001
    ]

    # MGC is given each sample as a single column.
    X = np.array(x_values).reshape(-1, 1)
    Y = np.array(y_values).reshape(-1, 1)

    observed_p_value, _ = MGC().p_value(X, Y, is_fast=True)
    assert np.allclose(expected_p_value, observed_p_value, rtol=0.1)
Exemple #5
0
def test_fast_mgc_test_non_linear():
    """Check the fast MGC p-value on a fixed 50-point spiral sample.

    The hard-coded data were generated with ``mgc.sims.spiral(50, 1, 0.5)``;
    with the random seed fixed, the fast MGC p-value is expected to equal
    0.9742 to within an absolute tolerance of 1e-4.
    """
    expected_p_value = 0.9742

    x_values = [
        -0.915363905, 2.134736725, 1.591825890, -0.947720469, -0.629203447,
        0.157367412, -3.009624669, 0.342083914, 0.126834696, 2.009228424,
        0.137638139, -4.168139174, 1.854371040, 1.696600346, -2.454855196,
        1.770009913, -0.080973938, 1.985722698, 0.671279564, 1.521294941,
        -0.905490998, -1.043388333, 0.006493876, 4.007326886, 1.755316427,
        -0.905436337, 0.497332481, 0.819071238, 3.561837453, 3.713293152,
        0.487967353, 1.233385955, -2.985033861, 0.146394829, -2.231330093,
        -0.138580101, -2.390685794, -2.798259311, 0.647199716, -0.626705094,
        -0.254107788, 2.017131291, -2.871050739, -0.369874190, 0.198565130,
        2.021387946, -2.877629992, -1.855015175, -0.201316471, 3.886001079
    ]
    y_values = [
        0.12441532, -2.63498763, 2.18349959, -0.58779997, -1.58602656,
        0.35894756, -0.73954299, 1.76585591, -0.35002851, 0.48618590,
        0.95628300, 1.99038991, 1.92277498, 1.34861841, 1.42509605, 0.65982368,
        -1.56731299, -0.17000082, 1.81187432, -0.73726241, 0.44491111,
        0.19177688, 2.28190181, 0.45509215, -0.16777206, 0.06918430,
        -1.49570722, 2.23337087, -1.01335025, -0.60394315, -0.56653502,
        -3.12571299, -1.56146565, 0.52487563, 2.35561329, -1.79300788,
        -2.40650123, 0.53680541, 2.04171052, 0.09821259, -0.42712911,
        0.52453433, -1.44426759, -2.22697039, 1.26906442, -0.13549404,
        0.36776719, -2.44674330, 1.34647206, 2.14525574
    ]

    # MGC is given each sample as a single column.
    X = np.array(x_values).reshape(-1, 1)
    Y = np.array(y_values).reshape(-1, 1)

    # Seed before the p-value call to ensure same permutation results are produced.
    np.random.seed(1234)
    observed_p_value, _ = MGC().p_value(X, Y, is_fast=True)
    assert np.allclose(expected_p_value, observed_p_value, atol=1.e-4)
Exemple #6
0
def test_k_sample():
    """Exercise ``k_sample_transform`` followed by MGC on three scenarios.

    1. Two samples: salaries resampled (1000 draws each) from the "M" and
       "F" gender groups; fast MGC p-value expected near 0.
    2. K samples: the first 100 salaries labelled by department; MGC
       p-value expected near 0.
    3. Two samples with a valid null hypothesis: 100 fixed values split in
       half by label; MGC p-value expected near 0.819.
    """
    np.random.seed(1234)
    salary_column = "Current Annual Salary"

    # prepare data
    salary_data = pd.read_csv("./mgcpy/hypothesis_tests/salary_data.csv")

    # --- 2 sample case ---
    is_male = salary_data["Gender"] == "M"
    is_female = salary_data["Gender"] == "F"
    men_salaries = salary_data.loc[is_male, salary_column].values
    women_salaries = salary_data.loc[is_female, salary_column].values
    # Men are resampled first, then women, so the random stream is consumed
    # in a fixed order.
    men_sample = np.random.choice(men_salaries, 1000)
    women_sample = np.random.choice(women_salaries, 1000)
    u, v = k_sample_transform(men_sample, women_sample)
    p_value, _ = MGC().p_value(u, v, is_fast=True)
    assert np.allclose(p_value, 0.0, atol=0.01)

    # --- k sample case ---
    salaries = salary_data[salary_column].values
    department_labels = salary_data["Department"].values.reshape(-1, 1)
    u, v = k_sample_transform(salaries[:100], department_labels[:100], is_y_categorical=True)
    p_value, _ = MGC().p_value(u, v)
    assert np.allclose(p_value, 0.0, atol=0.01)

    # --- 2 sample case (H_0 is valid) ---
    # 100 samples from the same distribution (x = np.random.randn(100))
    x_values = [
        0.34270011,  1.30064541, -0.41888945,  1.40367111,  0.31901975, -1.83695735, -0.70370144,  0.89338428,  0.86047303, -0.98841287,
        0.78325279,  0.55864254,  0.33317265,  2.22286831, -0.22349382, 0.40376754, -1.05356267,  0.54994568, -1.39765046,  0.41427267,
        -0.24457334,  0.2464725, -0.32179342, -1.77106008, -0.52824522, 1.57839019, -1.66455582, -0.97663735, -0.55176702, -1.95347702,
        1.01934119,  1.05765468, -0.69941067, -1.12479123,  0.85236935, -0.77356459,  0.30217738,  0.95246919, -0.61210025,  1.09253269,
        0.13576324,  0.62642456,  0.1859519,  0.32209166,  1.98633424, -0.57271182,  1.18247811,  2.05352048, -0.28297455,  0.25754106,
        0.80790087, -0.26995007,  1.8223331, -1.80151834,  0.71496981, -0.5119113, -1.45558062,  1.24115387,  1.44295579, -0.24726018,
        -2.07078337,  1.90810404, -1.36892494, -0.39004086,  1.35998082, 1.50891149, -1.29257757,  0.05513461, -1.58889596,  0.48703248,
        0.83443891,  0.46234541,  2.20457643,  1.47884097, -0.05316384, 0.72591566,  0.14339927, -1.29137912,  0.07908333,  0.80684167,
        0.22417797,  0.45241074, -1.03024521,  0.6615743,  0.27216365, 2.4188678,  0.20561134,  0.71095061, -1.02478312,  0.54512964,
        0.16582386, -0.39648338, -0.77905918, -0.33196771,  0.69407125, -0.81484451,  3.01568098, -0.49053868, -0.60987204,  1.72967348,
    ]
    x = np.array(x_values)
    # Label the first 50 values as sample 1 and the remaining 50 as sample 2.
    y = np.repeat([1, 2], 50).reshape(-1, 1)

    u, v = k_sample_transform(x, y, is_y_categorical=True)
    p_value, _ = MGC().p_value(u, v)
    assert np.allclose(p_value, 0.819, atol=0.1)
Exemple #7
0
def compute_mgc(X, Y):
    """Compute, print and return the MGC test statistic and p-value.

    Args:
        X: first data set, passed unchanged to ``MGC.test_statistic`` and
            ``MGC.p_value``.
        Y: second data set, passed the same way.

    Returns:
        A 3-tuple ``(mgc_statistic, p_value, independence_test_metadata)``,
        where the metadata is the one returned together with the test
        statistic (the metadata returned by ``p_value`` is discarded).
    """
    tester = MGC()
    mgc_statistic, independence_test_metadata = tester.test_statistic(X, Y)
    p_value, _ = tester.p_value(X, Y)

    # Reading the output (original author's guidance): a statistic close to
    # -1 or 1, rather than extremely close to 0, suggests the variables may
    # be correlated, and so does a p-value below 0.05 (5%). Make sure the
    # unbiased version of the method is being run, otherwise you might be
    # led to false results.
    print("MGC test statistic:", mgc_statistic)
    print("P Value:", p_value)
    # The optimal scale is available as
    # independence_test_metadata["optimal_scale"] but is not printed.
    return mgc_statistic, p_value, independence_test_metadata
Exemple #8
0
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 15 12:13:46 2019

@author: sandhya
"""
# Label for this run; used at the end of the script as the prefix of the
# saved results file name.
test = 'ACE'
import numpy as np
from mgcpy.independence_tests.mgc import MGC
import matplotlib.pyplot as plt; plt.style.use('classic')
import seaborn as sns; sns.set()
atlases = ('JHU','aal','brodmann','CPAC200','desikan','DK','HarOxCort','HarOxSub','hemispheric','pp264','tissue')
results = np.zeros([1,11])
i = 0

for atlas in atlases:
    
        #atlas = 'tissue'
    X = np.load('Xintersect_c_'+atlas+'.npy')
    Y = np.load('Yintersect_c_'+atlas+'.npy')
    mgc = MGC()
    mgc_statistic, independence_test_metadata = mgc.test_statistic(X, Y)
    p_value, metadata = mgc.p_value(X, Y)
    
    print(atlas)
    print("MGC test statistic:", mgc_statistic)
    print("P Value:", p_value)
    results[0,i]=p_value
    i+=1
    
np.save(test+'_results',results)