Exemple #1
1
    def test_mood_3d(self):
        shape = (10, 5, 6)
        np.random.seed(1234)
        x1 = np.random.randn(*shape)
        x2 = np.random.randn(*shape)

        for axis in range(3):
            z_vectest, pval_vectest = stats.mood(x1, x2, axis=axis)
            # Tests that result for 3-D arrays is equal to that for the
            # same calculation on a set of 1-D arrays taken from the
            # 3-D array
            axes_idx = ([1, 2], [0, 2], [0, 1])  # the two axes != axis
            for i in range(shape[axes_idx[axis][0]]):
                for j in range(shape[axes_idx[axis][1]]):
                    if axis == 0:
                        slice1 = x1[:, i, j]
                        slice2 = x2[:, i, j]
                    elif axis == 1:
                        slice1 = x1[i, :, j]
                        slice2 = x2[i, :, j]
                    else:
                        slice1 = x1[i, j, :]
                        slice2 = x2[i, j, :]

                    assert_array_almost_equal([z_vectest[i, j],
                                               pval_vectest[i, j]],
                                              stats.mood(slice1, slice2))
Exemple #2
1
    def test_mood_3d(self):
        shape = (10, 5, 6)
        np.random.seed(1234)
        x1 = np.random.randn(*shape)
        x2 = np.random.randn(*shape)

        for axis in range(3):
            z_vectest, pval_vectest = stats.mood(x1, x2, axis=axis)
            # Tests that result for 3-D arrays is equal to that for the
            # same calculation on a set of 1-D arrays taken from the
            # 3-D array
            axes_idx = ([1, 2], [0, 2], [0, 1])  # the two axes != axis
            for i in range(shape[axes_idx[axis][0]]):
                for j in range(shape[axes_idx[axis][1]]):
                    if axis == 0:
                        slice1 = x1[:, i, j]
                        slice2 = x2[:, i, j]
                    elif axis == 1:
                        slice1 = x1[i, :, j]
                        slice2 = x2[i, :, j]
                    else:
                        slice1 = x1[i, j, :]
                        slice2 = x2[i, j, :]

                    assert_array_almost_equal(
                        [z_vectest[i, j], pval_vectest[i, j]],
                        stats.mood(slice1, slice2))
Exemple #3
0
 def test_mood_order_of_args(self):
     # z should change sign when the order of arguments changes, pvalue
     # should not change
     np.random.seed(1234)
     x1 = np.random.randn(10, 1)
     x2 = np.random.randn(15, 1)
     z1, p1 = stats.mood(x1, x2)
     z2, p2 = stats.mood(x2, x1)
     assert_array_almost_equal([z1, p1], [-z2, p2])
Exemple #4
0
 def test_mood_order_of_args(self):
     # z should change sign when the order of arguments changes, pvalue
     # should not change
     np.random.seed(1234)
     x1 = np.random.randn(10, 1)
     x2 = np.random.randn(15, 1)
     z1, p1 = stats.mood(x1, x2)
     z2, p2 = stats.mood(x2, x1)
     assert_array_almost_equal([z1, p1], [-z2, p2])
Exemple #5
0
    def test_mood_with_axis_none(self):
        #Test with axis = None, compare with results from R
        x1 = [
            -0.626453810742332, 0.183643324222082, -0.835628612410047,
            1.59528080213779, 0.329507771815361, -0.820468384118015,
            0.487429052428485, 0.738324705129217, 0.575781351653492,
            -0.305388387156356, 1.51178116845085, 0.389843236411431,
            -0.621240580541804, -2.2146998871775, 1.12493091814311,
            -0.0449336090152309, -0.0161902630989461, 0.943836210685299,
            0.821221195098089, 0.593901321217509
        ]

        x2 = [
            -0.896914546624981, 0.184849184646742, 1.58784533120882,
            -1.13037567424629, -0.0802517565509893, 0.132420284381094,
            0.707954729271733, -0.23969802417184, 1.98447393665293,
            -0.138787012119665, 0.417650750792556, 0.981752777463662,
            -0.392695355503813, -1.03966897694891, 1.78222896030858,
            -2.31106908460517, 0.878604580921265, 0.035806718015226,
            1.01282869212708, 0.432265154539617, 2.09081920524915,
            -1.19992581964387, 1.58963820029007, 1.95465164222325,
            0.00493777682814261, -2.45170638784613, 0.477237302613617,
            -0.596558168631403, 0.792203270299649, 0.289636710177348
        ]

        x1 = np.array(x1)
        x2 = np.array(x2)
        x1.shape = (10, 2)
        x2.shape = (15, 2)
        assert_array_almost_equal(stats.mood(x1, x2, axis=None),
                                  [-1.31716607555, 0.18778296257])
Exemple #6
0
def nonparametric_check_for_d_similarity(df1, df2, alpha=0.01):
    """Compare the columns shared by two data frames with rank-based tests.

    For every column name present in both ``df1`` and ``df2`` three
    two-sample tests from ``scipy.stats`` are run on the two columns:

    * ``mannwhitneyu`` (H0: same central parameter),
    * ``ansari`` (H0: equal scale parameters),
    * ``mood`` (H0: equal scale parameters).

    A test is reported as "Same ..." when its p-value is strictly greater
    than ``alpha`` and as "Different ..." otherwise.

    :param df1: first data frame.
    :param df2: second data frame.
    :param alpha: significance level used for all three tests.
    :return: data frame with one row per shared column and the columns
        ``col_name``, ``delta_pval``, ``delta_status``, ``scale1_pval``,
        ``scale1_status``, ``scale2_pval``, ``scale2_status``.  Rows follow
        the column order of ``df1``.  When no column is shared the result
        is an empty frame that still carries these seven columns.
    """
    result_columns = ['col_name', 'delta_pval', 'delta_status',
                      'scale1_pval', 'scale1_status',
                      'scale2_pval', 'scale2_status']

    # Walk df1's columns in order rather than iterating a set, so the row
    # order of the report is reproducible from run to run.
    shared = []
    for col in df1.columns:
        if col in df2.columns and col not in shared:
            shared.append(col)

    rows = []
    for col in shared:
        # H0 = same central parameter
        _, delta_pvalue = stats.mannwhitneyu(df1[col], df2[col])
        delta = ('Same central parameter' if delta_pvalue > alpha
                 else 'Different central parameter')

        # H0 = equality of the scale parameters (Ansari-Bradley)
        _, scale1_pval = stats.ansari(df1[col], df2[col])
        scale1 = ('Same scale AnsariTest' if scale1_pval > alpha
                  else 'Different scale AnsariTest')

        # H0 = equality of the scale parameters (Mood)
        _, scale2_pval = stats.mood(df1[col], df2[col])
        scale2 = ('Same scale MoodTest' if scale2_pval > alpha
                  else 'Different scale MoodTest')

        rows.append([col, delta_pvalue, delta,
                     scale1_pval, scale1, scale2_pval, scale2])

    # Passing ``columns=`` to the constructor (instead of assigning
    # ``.columns`` afterwards) keeps the no-shared-column case from raising
    # a length-mismatch ValueError.
    return pd.DataFrame(rows, columns=result_columns)
Exemple #7
0
    def test_mood_with_axis_none(self):
        #Test with axis = None, compare with results from R
        x1 = [-0.626453810742332, 0.183643324222082, -0.835628612410047,
               1.59528080213779, 0.329507771815361, -0.820468384118015,
               0.487429052428485, 0.738324705129217, 0.575781351653492,
              -0.305388387156356, 1.51178116845085, 0.389843236411431,
              -0.621240580541804, -2.2146998871775, 1.12493091814311,
              -0.0449336090152309, -0.0161902630989461, 0.943836210685299,
               0.821221195098089, 0.593901321217509]

        x2 = [-0.896914546624981, 0.184849184646742, 1.58784533120882,
              -1.13037567424629, -0.0802517565509893, 0.132420284381094,
               0.707954729271733, -0.23969802417184, 1.98447393665293,
              -0.138787012119665, 0.417650750792556, 0.981752777463662,
              -0.392695355503813, -1.03966897694891, 1.78222896030858,
              -2.31106908460517, 0.878604580921265, 0.035806718015226,
               1.01282869212708, 0.432265154539617, 2.09081920524915,
              -1.19992581964387, 1.58963820029007, 1.95465164222325,
               0.00493777682814261, -2.45170638784613, 0.477237302613617,
              -0.596558168631403, 0.792203270299649, 0.289636710177348]

        x1 = np.array(x1)
        x2 = np.array(x2)
        x1.shape = (10, 2)
        x2.shape = (15, 2)
        assert_array_almost_equal(stats.mood(x1, x2, axis=None),
                                  [-1.31716607555, 0.18778296257])
    def test_mood_2d(self):
        # Test if the results of mood test in 2-D case are consistent with the
        # R result for the same inputs.  Numbers from R mood.test().
        ny = 5
        np.random.seed(1234)
        x1 = np.random.randn(10, ny)
        x2 = np.random.randn(15, ny)
        z_vectest, pval_vectest = stats.mood(x1, x2)

        for j in range(ny):
            assert_array_almost_equal([z_vectest[j], pval_vectest[j]], stats.mood(x1[:, j], x2[:, j]))

        # inverse order of dimensions
        x1 = x1.transpose()
        x2 = x2.transpose()
        z_vectest, pval_vectest = stats.mood(x1, x2, axis=1)

        for i in range(ny):
            # check axis handling is self consistent
            assert_array_almost_equal([z_vectest[i], pval_vectest[i]], stats.mood(x1[i, :], x2[i, :]))
Exemple #9
0
    def test_mood_2d(self):
        # Test if the results of mood test in 2-D case are consistent with the
        # R result for the same inputs.  Numbers from R mood.test().
        ny = 5
        np.random.seed(1234)
        x1 = np.random.randn(10, ny)
        x2 = np.random.randn(15, ny)
        z_vectest, pval_vectest = stats.mood(x1, x2)

        for j in range(ny):
            assert_array_almost_equal([z_vectest[j], pval_vectest[j]],
                                      stats.mood(x1[:, j], x2[:, j]))

        # inverse order of dimensions
        x1 = x1.transpose()
        x2 = x2.transpose()
        z_vectest, pval_vectest = stats.mood(x1, x2, axis=1)

        for i in range(ny):
            # check axis handling is self consistent
            assert_array_almost_equal([z_vectest[i], pval_vectest[i]],
                                      stats.mood(x1[i, :], x2[i, :]))
Exemple #10
0
def test_significance_tests(normal_obs, normal_obs_control):
    """Each test method of an ``ab`` sample must reproduce the scipy result.

    ``normal_obs`` and ``normal_obs_control`` are the raw observations; the
    same data wrapped by ``ab.sample`` is run through every significance
    test and compared with the corresponding scipy function.
    """
    treatment = ab.sample(normal_obs)
    control = ab.sample(normal_obs_control)

    # Two-sample t-test, with and without the equal-variance assumption.
    for equal_var in (True, False):
        got = treatment.t_test(control, equal_var=equal_var)
        want = ttest_ind(normal_obs, normal_obs_control, equal_var=equal_var)
        assert got.p_value == want.pvalue
        assert got.statistic == want.statistic

    # One-sample t-test against a population mean of 101.
    got = treatment.t_test_1samp(101)
    want = ttest_1samp(normal_obs, 101)
    assert got.p_value == want.pvalue
    assert got.statistic == want.statistic

    # Mann-Whitney U: the reference call receives the control observations
    # first, and the p-value is compared with a relative tolerance of 1e-6.
    got = treatment.mann_whitney_u_test(control)
    want = mannwhitneyu(normal_obs_control, normal_obs,
                        alternative='two-sided')
    assert got.p_value == pytest.approx(want.pvalue, 1e-6)
    assert got.u_statistic == want.statistic

    # Shapiro-Wilk: scipy result is indexed as (statistic, p-value).
    got = treatment.shapiro_test()
    want = shapiro(normal_obs)
    assert got.statistic == want[0]
    assert got.p_value == want[1]

    # Median test: scipy result is indexed as (statistic, p-value, median).
    got = treatment.median_test(control)
    want = median_test(normal_obs, normal_obs_control)
    assert got.p_value == want[1]
    assert got.statistic == want[0]
    assert got.grand_median == want[2]

    # Levene test.
    got = treatment.levene_test(control)
    want = levene(normal_obs, normal_obs_control)
    assert got.p_value == want.pvalue
    assert got.statistic == want.statistic

    # Mood test: scipy result is indexed as (statistic, p-value).
    got = treatment.mood_test(control)
    want = mood(normal_obs, normal_obs_control)
    assert got.p_value == want[1]
    assert got.statistic == want[0]
Exemple #11
0
def vector_hypotheses(a, b):
    """Run a fixed battery of two-sample tests on ``a`` and ``b``.

    Each test is called as ``test(a, b)``; element 0 of its result is stored
    as the statistic and element 1 as the p-value.

    :return: ``(dict_stat, dict_pval)`` - two dicts keyed by test name
        ("pearsonr", "ranksums", "mood", "fligner", "ansari", "bartlett",
        "levene", "mannwhitneyu").
    """
    battery = (
        ("pearsonr", pearsonr),
        ("ranksums", ranksums),
        ("mood", mood),
        ("fligner", fligner),
        ("ansari", ansari),
        ("bartlett", bartlett),
        ("levene", levene),
        ("mannwhitneyu", mannwhitneyu),
    )

    dict_stat = {}
    dict_pval = {}
    for label, run_test in battery:
        outcome = run_test(a, b)
        dict_stat[label] = outcome[0]
        dict_pval[label] = outcome[1]
    return dict_stat, dict_pval
Exemple #12
0
def stream(data, Test, thresholds):
    """Scan ``data`` with a growing window until a change point is declared.

    The window starts at the first 20 observations and grows by one each
    step.  ``Test(window)`` must return ``(arg, D)``; ``D`` is passed to
    ``changedetected`` together with the threshold for the current window
    size (or the last loaded threshold once the window outgrows the table).

    :return: ``(change_location, detection_index, True, change_type)`` when
        a change is declared, where ``change_type`` is "Location", "Scale"
        or 'Needs further investigation' for the Lepage test and '' for any
        other test; ``(None, None, False, None)`` when the data is exhausted
        without a detection.
    """
    # start monitoring at time = 20; the window grows by one per iteration
    for i in range(20, len(data) + 1):
        window = data[:i]
        arg, D = Test(window)

        # test at ith threshold value or at the last loaded threshold value
        threshold = thresholds[-1] if i > len(thresholds) else thresholds[i - 1]
        cp = changedetected(D, threshold)
        # Explicit comparison with True is kept as in the original, since the
        # return type of changedetected is not visible here.
        if not (cp == True):
            continue

        change = ''
        if Test == Lepage:
            # For the Lepage test, work out which kind of change was detected
            # (location or scale).  If the Mann-Whitney p-value is 100 times
            # smaller than the Mood p-value a location shift is declared, and
            # vice versa for scale; otherwise further investigation is needed
            # (the factor of 100 was chosen from tests on synthetic data).
            before = window[:arg + 1]
            after = window[arg + 1:]
            p_val_Mood = mood(before, after)[1]
            p_val_MW = mannwhitneyu(before, after)[1]
            if 0.01 * p_val_Mood > p_val_MW:
                change = "Location"
            elif 0.01 * p_val_MW > p_val_Mood:
                change = "Scale"
            else:
                change = 'Needs further investigation'

        # location of the cp, index at which it was detected, True (a cp was
        # declared), and the type of change
        return arg + 1, i - 1, True, change

    # no cp found and no more datapoints to process
    return None, None, False, None
 def __compute_mood_statistic_ts(self, ts):
     """Return the Mood statistic for every split point of ``ts``.

     For each ``t`` in ``0 .. len(ts) - 2`` the series is cut after position
     ``t`` and ``stat.mood`` is run with the later part as first sample and
     the earlier part as second sample.  Only the statistic (element 0) is
     kept; NaN entries are dropped and the index is renumbered from 0.
     """
     statistics = []
     for t in np.arange(len(ts) - 1):
         cut = t + 1
         outcome = stat.mood(ts[cut:], ts[:cut])
         statistics.append(outcome[0])
     series = pd.Series(statistics)
     return series.dropna().reset_index(drop=True)
def mood(xy):
    """Run ``stats.mood`` on a pair of samples packed into one argument.

    :param xy: two-item sequence ``(x, y)`` holding the two samples.
    :return: whatever ``stats.mood(x, y)`` returns.
    """
    # Tuple parameters in the signature (``def mood((x, y))``) were removed
    # in Python 3 by PEP 3113; unpacking in the body accepts the same calls
    # and is valid on both Python 2 and Python 3.
    x, y = xy
    return stats.mood(x, y)
Exemple #15
0
def mood_test(obs, obs_control) -> MoodTestResult:
    """
    Run ``stats.mood`` on the two sets of observations and wrap the outcome.

    :return: MoodTestResult(statistic, p_value)
    """
    statistic, p_value = stats.mood(obs, obs_control)
    return MoodTestResult(statistic=statistic, p_value=p_value)
Exemple #16
0
 def test_mood(self):
     # numbers from R: mood.test in package stats
     x1 = np.arange(5)
     assert_array_almost_equal(stats.mood(x1, x1**2),
                               (-1.3830857299399906, 0.16663858066771478), 11)
Exemple #17
0
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import ttest_ind
import statsmodels.stats.api as sms
# Price tables.  GE is read from a local whitespace-delimited file without a
# header row (column names are supplied here); the S&P 500 table is read from
# a URL and uses the header row of that file.
# The separator is a raw string: "\s" in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
GE = pd.read_csv('C:/Users/anivia/Desktop/geDJ.txt',
                 sep=r"\s+",
                 header=None,
                 names=['date', 'open', 'high', 'low', 'close', 'vol'])
SP = pd.read_csv(
    'https://www.math.ust.hk/~macwyu/MAFS5110_2018-2019/MAFS5110_2018-2019/Chapter_1/sp500.txt',
    sep=r"\s+")

# Log returns: first difference of the log closing prices.
logreturn_GE = np.diff(np.log(np.array(GE["close"])))
logreturn_sp500 = np.diff(np.log(np.array(SP["close"])))

# Put both return series side by side and draw one box plot per series.
da2 = pd.concat([pd.DataFrame(logreturn_GE),
                 pd.DataFrame(logreturn_sp500)],
                axis=1)
da2.columns = ["logreturn_GE", "logreturn_sp500"]
da2.boxplot(column=['logreturn_GE', 'logreturn_sp500'])
plt.show()

# Mood's test for equal scale parameters of the two return series.
print(stats.mood(logreturn_sp500, logreturn_GE))
# NOTE(review): this conclusion is printed unconditionally; it is not derived
# from the p-value printed above.
print('H0 can be rejected, the variances are significantly different')

# Two-sample t-test assuming equal variances.
print(ttest_ind(logreturn_sp500, logreturn_GE, equal_var=True))
print('')

# Confidence interval for the difference of the two means (statsmodels).
cm = sms.CompareMeans(sms.DescrStatsW(logreturn_sp500),
                      sms.DescrStatsW(logreturn_GE))
print(cm.tconfint_diff())
Exemple #18
0
def main(graph_name):
  
    G = nx.read_gml(graph_name)
    G = nx.connected_component_subgraphs(G)[0] # Giant component 
  
   
   


    list_one_friend_percent_weight_change=[]    
    list_all=[]
    for node in G.nodes():
        if G.node[node]['label'] == "40155" :
            special=node           
            for n in G.neighbors(node):                
                if G.node[n]['time_in_system'] >100:
                    list_all.append(float(G.node[n]['percentage_weight_change']))
                    print len(G.neighbors(n))
                    if len(G.neighbors(n))==1:
                        list_one_friend_percent_weight_change.append(float(G.node[n]['percentage_weight_change']))
           
            break

  
    print "final number of friendless nodes:", len(list_one_friend_percent_weight_change), "av_percent_wc:",numpy.mean(list_one_friend_percent_weight_change),numpy.std(list_one_friend_percent_weight_change)
    print "av_%_wc fo all",len(list_all), "neighbors of 40155:", numpy.mean(list_all), numpy.std(list_all) 


  



    #dir=graph_name.split("fr")[0]
    dir=graph_name.split("mas")[0]
   
   
    original_name=graph_name.split("data/")[1]
    original_name=original_name.split(".gml")[0]

    dir=dir+"roles/"+original_name
            
 
    time_in_system=100 #minimum amount of time in the sytem for a user to be included in the statistics

    iterations=1000  #for boostrap

   
    name00=dir+"bootstrap_statistics_percent_weight_change_one_hops_R6s"+str(time_in_system)+"days_exclude_R6s.dat"
  

    list_R6s=[]     # collect the R6 of the system   
    list_R6s_labels=[] 
    for node in G.nodes() :    
        if str(G.node[node]['role']) == "R6" :
          list_R6s.append(node)
          list_R6s_labels.append(str(G.node[node]['label']))
    


    list_percentage_wc_one_hop=[]    
    list_one_hoppers=[]    
    for node in G.nodes():       
        if (str(G.node[node]['role']) == "R6" ):
            
            for n in  G.neighbors(node):
                if (str(G.node[n]['role']) != "R6" ):
                    if  int(G.node[n]['time_in_system']) > time_in_system :
                        if n not in list_one_hoppers:
                            list_percentage_wc_one_hop.append(float(G.node[n]['percentage_weight_change']))  

                            list_one_hoppers.append(n)
                        
  


    print "all one-hops:",numpy.mean(list_percentage_wc_one_hop),len(list_percentage_wc_one_hop)
           
   
    file0=open(name00, 'wt')
    print >> file0,"Percentage Weight Change for one-hop-from-R6s\n\n\n","original:",numpy.mean(list_percentage_wc_one_hop),"   set size:",len(list_percentage_wc_one_hop)           
    file0.close()
       

    
# R6s en friend_graph_all.gml: 40155, 28688, 45784, 41794, 43020, 47063, 39625, 31954, 40324,40666


#R6s en master_adherent_homo.gml: 40155, 41794, 39625, 46487, 31954, 40324, 28688, 45784, 40666


    for excluding in list_R6s_labels:


        list_percentage_wc_one_hop_excluding_oneR6=[]    
        list_one_hoppers_excluding_oneR6=[]    
        for node in G.nodes():       
            if (str(G.node[node]['role']) == "R6" ) and (str(G.node[node]['label']) != excluding ): 
                for n in  G.neighbors(node):
                    if (str(G.node[n]['role']) != "R6" ):
                        if  int(G.node[n]['time_in_system']) > time_in_system :
                            if n not in list_one_hoppers_excluding_oneR6:                          
                                list_percentage_wc_one_hop_excluding_oneR6.append(float(G.node[n]['percentage_weight_change']))  
                                
                                list_one_hoppers_excluding_oneR6.append(n)
                                  


        actual_diff=numpy.mean(list_percentage_wc_one_hop)-numpy.mean(list_percentage_wc_one_hop_excluding_oneR6)

        print "\n\ndiff. all one-hops & all but one hub",excluding,":",actual_diff



        file0=open(name00, 'at')
        print >> file0,"    excluding",excluding, ":",numpy.mean(list_percentage_wc_one_hop_excluding_oneR6), " (diff:" , actual_diff,")   set size:",len(list_percentage_wc_one_hop_excluding_oneR6)         
        file0.close()





      ###############################################
      # boostrap routine, sampling with replacement:#
      ###############################################
        list_all=[]
        for i in list_percentage_wc_one_hop:
            list_all.append(i)
        for i in list_percentage_wc_one_hop_excluding_oneR6:
            list_all.append(i)


        list_synth_diff=[]


        for iter in range(iterations):
            list_synth_one_hop=sample_with_replacement(list_all,len(list_percentage_wc_one_hop))
            list_synth_one_hop_excluding_oneR6s=sample_with_replacement(list_all,len(list_percentage_wc_one_hop_excluding_oneR6))
                        

            mean1=numpy.mean(list_synth_one_hop)
            mean2=numpy.mean(list_synth_one_hop_excluding_oneR6s)

            list_synth_diff.append(mean1-mean2)
       

        zscore=(actual_diff-numpy.mean(list_synth_diff))/numpy.std(list_synth_diff)

        print "mean_over_synth_realizations (with repl.):",numpy.mean(list_synth_diff),"zscore:",zscore
        

        file0=open(name00, 'at')
        print >> file0,"       z-score (sampling with replacement)",zscore
        file0.close()
        

      # boostrap routine, sampling without replacement:

        list_synth_diff=[]


        for iter in range(iterations):
            list_synth_one_hop=random.sample(list_all,len(list_percentage_wc_one_hop))

            for i in list_all:
                if i not in list_synth_one_hop:
                    list_synth_one_hop_excluding_oneR6s.append(i)
                                   

            mean1=numpy.mean(list_synth_one_hop)
            mean2=numpy.mean(list_synth_one_hop_excluding_oneR6s)

            list_synth_diff.append(mean1-mean2)
       

        zscore=(actual_diff-numpy.mean(list_synth_diff))/numpy.std(list_synth_diff)

        print "mean_over_synth_realizations (without repl.):",numpy.mean(list_synth_diff),"zscore:",zscore

       

       

        # mood test
        mood=stats.mood(list_percentage_wc_one_hop,list_percentage_wc_one_hop_excluding_oneR6) 
        print "mood test:",mood
        
        #t test:
        ttest=stats.ttest_ind(list_percentage_wc_one_hop,list_percentage_wc_one_hop_excluding_oneR6, axis=0)
        print "t-test:",ttest


       #wilcoxon test  SET SIZES MUST BE THE SAME
       # wilcoxon=stats.wilcoxon(list_percentage_wc_one_hop, list_percentage_wc_one_hop_excluding_oneR6)
        #print "wilcoxon test:",wilcoxon
     


        file0=open(name00, 'at')
        print >> file0,"       z-score (sampling without replacement)",zscore,"\n       mood-test:",mood#,"\n       wilcoxon-test:",wilcoxon,"\n"
        file0.close()



    file0=open(name00, 'at')
    print >> file0,"\n\n(number iterations for the bootstrap:",iterations,")"
    file0.close()
def mood(pair):
    """Apply ``stats.mood`` to a single ``(x, y)`` argument.

    :param pair: sequence of exactly two samples, ``(x, y)``.
    :return: the result of ``stats.mood(x, y)``.
    """
    # The original signature unpacked the tuple in the parameter list, which
    # is a syntax error on Python 3 (PEP 3113).  Unpacking here keeps the
    # call form ``mood((x, y))`` working on Python 2 and 3 alike.
    x, y = pair
    return stats.mood(x, y)
def main(graph_name):

    H = nx.read_gml(graph_name)

    for node in H.nodes():  # i remove self loops
        if node in H.neighbors(node):
            if len(H.neighbors(node)) > 1:
                H.remove_edge(node, node)
            else:
                H.remove_node(node)

# for node in H.nodes():
#    if H.node[node]['weigh_ins'] <5: #Adherent filter
#       H.remove_node(node)
# print node, "is going down"

    G = nx.connected_component_subgraphs(H)[0]  # Giant component

    print "size of the GC:", len(
        G.nodes())  #, "after filtering for adherence!!"

    #dir=graph_name.split("full_")[0]
    #dir=graph_name.split("master")[0]
    #dir=graph_name.split("method3_")[0]
    #dir=graph_name.split("method3_adh")[0]
    dir = graph_name.split("friends")[0]

    dir = dir + "roles/"

    time_in_system = 50  #minimum amount of time in the sytem for a user to be included in the statistics

    #name=graph_name.split('data/')[1]
    #name=graph_name.split('method3_50/interim/')[1]
    #name=graph_name.split('network_all_users/')[1]
    name = graph_name.split('5_points_network_2010/data/')[1]

    name = name.split('.gml')[0]

    name0 = dir + name + "_overlap_R6s_averages_" + str(
        time_in_system) + "days_exclude_R6s.dat"
    file0 = open(name0, 'wt')
    file0.close()

    contador = 0
    name12 = dir + name + "_slopes_for_the_fits_average_weight_change.dat"
    file = open(name12, 'wt')
    file.close()

    ####for the Isolated Clusters:
    list_GC_nodes = []
    for n in G.nodes():
        list_GC_nodes.append(n)
    # print G.node[n]['percentage_weight_change']

# print "# users GC:",len(list_GC_nodes),"total:",len(H.nodes())

    list_weight_changes_not_GC = []
    for n in H.nodes():
        if n not in list_GC_nodes:
            #print n,"not in GC"
            list_weight_changes_not_GC.append(
                float(H.node[n]['percentage_weight_change']))

    #print  "# users not in GC:",len(list_weight_changes_not_GC)

# who="not_GC"
#Nbins=18
#histograma(list_weight_changes_not_GC,Nbins,dir,name,who)

###########################

    list_R6s = []  # collect the R6 of the system
    list_R6s_label = []
    list_R6s_percent_weight_change = []
    for node in G.nodes():
        if str(G.node[node]['role']) == "R6":
            list_R6s.append(node)
            list_R6s_label.append(G.node[node]['label'])
            list_R6s_percent_weight_change.append(
                float(G.node[node]['percentage_weight_change']))

    name00 = dir + name + "R6s_and_top_tens_averages_" + str(
        time_in_system) + "days_exclude_R6s.dat"

    file0 = open(name00, 'at')
    print >> file0, "R6s", numpy.mean(
        list_R6s_percent_weight_change), numpy.std(
            list_R6s_percent_weight_change)
    file0.close()

    #  print "\n\n R6s:\n"
    # for i in  list_R6s_label:
    #    print i

    # studying the possible cumulative effect of more than one R6 on the population:
    for node in G.nodes():
        cont = 0
        for n in G.neighbors(node):
            if str(G.node[n]['role']) == "R6":
                cont += 1

        G.node[node]["R6_overlap"] = int(cont)

    ##### weight change for people not connected to any R6s:####

    list_weight_changes_no_neighbors = []
    for node in G.nodes():
        interseccion = list(set(G.neighbors(node)) & set(list_R6s))

        # print node, "intersection:",intersection,len(intersection)
        #   print "because", list_R6s, "and ",G.neighbors(node)
        #  raw_input()
        if len(interseccion) == 0:
            list_weight_changes_no_neighbors.append(
                G.node[node]['percentage_weight_change'])

#  print len(list_weight_changes_no_neighbors),"no_neighbors"

    who = "no_neigbors_R6s"
    Nbins = 18
    histograma(list_weight_changes_no_neighbors, Nbins, dir, name, who)

    # mood test
    mood = stats.mood(list_weight_changes_no_neighbors,
                      list_weight_changes_not_GC)
    print "mood test for", who, "against not_GC:", mood

    ########
    # K-S test:
    ks = stats.ks_2samp(list_weight_changes_no_neighbors,
                        list_weight_changes_not_GC)
    print "KS test for", who, "against not_GC:", ks

    name00 = "ks_results.dat"

    file0 = open(dir + name00, 'at')
    print >> file0, "KS test for", who, "of", graph_name, "against not_GC:", ks
    file0.close()
    #############################################

    #average percentage weight change as a function of the size of the largest CLIQUE the node belongs to:

    absolute_max = 1
    for i in G.nodes():

        maximo = 1
        list2 = nx.cliques_containing_node(G, i)
        # print i, list2

        for elem in list2:
            # print elem,len(elem,)
            if len(elem) > maximo:
                maximo = len(elem)
    # print "\n",maximo
        G.node[i]['max_clique_size'] = maximo

        if absolute_max < maximo:
            absolute_max = maximo

    #print absolute_max

    lista = list(
        nx.find_cliques(G))  # crea una lista de cliques (lista de listas)
    max_clique = nx.graph_clique_number(G)  #finds out max size clique
    num_tot_clique = nx.graph_number_of_cliques(
        G)  #finds out total number of cliques

    # count number of 2, 3, 4, 5, 6  and 7cliques:

    num_2cliques = 0
    num_3cliques = 0
    num_4cliques = 0
    num_5cliques = 0
    num_6cliques = 0
    num_7cliques = 0
    num_8cliques = 0
    num_9cliques = 0

    for element in lista:
        if len(element) == 2:
            num_2cliques = num_2cliques + 1

        elif len(element) == 3:
            num_3cliques = num_3cliques + 1

        elif len(element) == 4:
            num_4cliques = num_4cliques + 1

        elif len(element) == 5:
            num_5cliques = num_5cliques + 1

        elif len(element) == 6:
            num_6cliques = num_6cliques + 1

        elif len(element) == 7:
            num_7cliques = num_7cliques + 1

        elif len(element) == 8:
            num_8cliques = num_8cliques + 1

        elif len(element) == 9:
            num_9cliques = num_9cliques + 1

#   print " 2: ",num_2cliques, "     3: ",num_3cliques, "   4: ",num_4cliques, "     5: ",num_5cliques, "   6: ",num_6cliques, "   7: ",num_7cliques, "   8: ",num_8cliques, "   9: ",num_9cliques, "   max_clique_size:",max_clique, "   num_tot_cliques:", num_tot_clique

    name33 = dir + name + "_percent_weight_change_vs_largest_clique_size.dat"
    file11 = open(name33, 'wt')
    file11.close()

    list_of_lists_for_bootstrap = []

    x_positions_fit = []
    y_positions_fit = []
    cum_size_set = float(len(G.nodes()))

    tot_nodes = []

    for clique_size in range(1, max_clique):

        clique_size = clique_size + 1
        print clique_size

        num_users_set = cum_size_set

        percent_weight_change_that_clique_size = []
        for n in G.nodes():

            if G.node[n]['max_clique_size'] == clique_size:
                percent_weight_change_that_clique_size.append(
                    float(G.node[n]['percentage_weight_change']))

                tot_nodes.append(float(G.node[n]['percentage_weight_change']))

                cum_size_set -= 1.0

        file11 = open(name33, 'at')
        print >> file11, clique_size, len(
            percent_weight_change_that_clique_size), num_users_set / float(
                len(G.nodes())), numpy.mean(
                    percent_weight_change_that_clique_size), numpy.std(
                        percent_weight_change_that_clique_size)
        file11.close()

        if len(x_positions_fit) <= 7:
            x_positions_fit.append(clique_size)
            y_positions_fit.append(
                numpy.mean(percent_weight_change_that_clique_size))

            list_of_lists_for_bootstrap.append(
                percent_weight_change_that_clique_size)

    slope, intercept, Corr_coef, p_value, std_err = stats.linregress(
        x_positions_fit, y_positions_fit)  # least squeares polinomial fit

    print "result linear. fit for clique size dependency:"

    print "slope:", slope, "intercept:", intercept, "Corr_coef:", Corr_coef, "p_value:", p_value, "std_err:", std_err

    name11 = dir + name + "_fits_clique_size.dat"

    file11 = open(name11, 'wt')
    for i in range(len(x_positions_fit)):
        print >> file11, x_positions_fit[
            i], intercept + x_positions_fit[i] * slope

    print >> file11, "\n\n", "y=", intercept, "+", slope, "*x",
    print "Bootstrap for clique size:\n"

    mean_slope, standard_dev = bootstrap(x_positions_fit[0],
                                         x_positions_fit[-1],
                                         list_of_lists_for_bootstrap)
    zscore = (slope - mean_slope) / standard_dev

    print >> file11, "bootstrap:\n", "actual slope:", slope, "mean_slope:", mean_slope, "standard_dev:", standard_dev, "\n zscore:", zscore

    print x_positions_fit[0], x_positions_fit[
        -1], "actual slope:", slope, "mean_slope:", mean_slope, "standard_dev:", standard_dev, "\n zscore:", zscore

    file11.close()

    contador += 1
    file = open(name12, 'at')
    print >> file, contador, mean_slope, standard_dev, "largest_clique_size"
    file.close()

    #######################################

    #####dose effect of the R6s independently########

    name11 = dir + name + "_dose_eff_indepently_only_one_R6_" + str(
        time_in_system) + "days_exclude_R6s.dat"
    file11 = open(name11, 'at')
    print >> file11, 0, "average_no_neighbors", "average_no_neighbors", "average_no_neighbors", len(
        list_weight_changes_no_neighbors
    ), numpy.mean(list_weight_changes_no_neighbors), numpy.std(
        list_weight_changes_no_neighbors
    )  # the first line of the file is actually for no_neighbors, the rest, for one_and_only_one
    file11.close()

    file11 = open(name11, 'wt')
    file11.close()

    cont = 1
    list_all = []
    list_all_nodes = []
    for R6 in list_R6s:
        list_weight_changes = []
        for n in G.neighbors(R6):
            if (G.node[n]['role'] != "R6") and (G.node[n]["R6_overlap"] == 1):
                list_weight_changes.append(
                    float(G.node[n]['percentage_weight_change']))

                if n not in list_all_nodes:
                    list_all_nodes.append(n)
                    list_all.append(
                        float(G.node[n]['percentage_weight_change']))

        if len(list_weight_changes) > 0:

            file11 = open(name11, 'at')
            print >> file11, cont, G.node[R6]['role'], G.node[R6][
                'label'], len(
                    G.neighbors(R6)), len(list_weight_changes), numpy.mean(
                        list_weight_changes), numpy.std(list_weight_changes)
            file11.close()
            # print cont,G.node[R6]['role'],G.node[R6]['label'], len(G.neighbors(R6)),len(list_weight_changes),numpy.mean(list_weight_changes),numpy.std(list_weight_changes)
            cont = cont + 1

        else:
            # file11=open(name11, 'at')
            #print >> file11,cont,G.node[R6]['role'],G.node[R6]['label'],len(G.neighbors(R6)),len(list_weight_changes)
            #file11.close()
            # print cont,G.node[R6]['role'],G.node[R6]['label'],len(G.neighbors(R6)),len(list_weight_changes)
            cont = cont + 1

    who = "one_and_only_one_R6s"
    Nbins = 18
    histograma(list_all, Nbins, dir, name, who)

    ####################################

    print "\n\n"

    list_of_lists_for_bootstrap = []

    x_positions_fit = []
    y_positions_fit = []

    averages_larger5_x = []
    averages_larger5_y = []
    norm = 0.0

    cum_size_set = float(len(G.nodes())) - float(len(list_R6s))
    for r in range(len(list_R6s) + 1):

        # list_BMI_changes=[]
        list_weight_changes = []
        list_percentage_weight_changes = []
        list_activities = []

        num_users_set = cum_size_set
        for node in G.nodes():

            if int(G.node[node]["R6_overlap"]) == r:

                if G.node[node]["role"] == "R6":  # i exclude the R6s

                    pass
                else:

                    if int(G.node[node]['time_in_system']) > time_in_system:

                        #   list_BMI_changes.append(float(G.node[node]['final_BMI'])-float(G.node[node]['initial_BMI']))
                        list_weight_changes.append(
                            float(G.node[node]['weight_change']))
                        list_percentage_weight_changes.append(
                            float(G.node[node]['percentage_weight_change']))
                        list_activities.append(
                            float(G.node[node]['activity']) /
                            float(G.node[node]['time_in_system']))
                        cum_size_set -= 1.0

        if len(list_percentage_weight_changes) > 0:
            # average_BMI_change=numpy.mean(list_BMI_changes)
            average_weight_change = numpy.mean(list_weight_changes)
            average_percentage_weight_change = numpy.mean(
                list_percentage_weight_changes)
            average_activity = numpy.mean(list_activities)

            #deviation_BMI=numpy.std(list_BMI_changes)
            deviation_weight = numpy.std(list_weight_changes)
            deviation_percentage_weight = numpy.std(
                list_percentage_weight_changes)
            deviation_activity = numpy.std(list_activities)

            #print out

            file0 = open(name0, 'at')
            print >> file0, r, len(
                list_percentage_weight_changes
            ), num_users_set / float(
                len(G.nodes())
            ), average_percentage_weight_change, deviation_percentage_weight, average_weight_change, deviation_weight, average_activity, deviation_activity
            file0.close()

            if r <= 5:
                x_positions_fit.append(r)
                y_positions_fit.append(average_percentage_weight_change)

                list_of_lists_for_bootstrap.append(
                    list_percentage_weight_changes)

        # else:
        #    aux_x=r*len(list_percentage_weight_changes)
        #   averages_larger5_x.append(aux_x)

        #  aux_y=average_percentage_weight_change*len(list_percentage_weight_changes)
        # averages_larger5_y.append(aux_y)
        #norm+=float(len(list_percentage_weight_changes))

#    x_positions_fit.append(numpy.mean(averages_larger5_x)/norm)
#   y_positions_fit.append(numpy.mean(averages_larger5_y)/norm)

    slope, intercept, Corr_coef, p_value, std_err = stats.linregress(
        x_positions_fit, y_positions_fit)  # least squeares polinomial fit

    print "result linear. fit for dose eff.:"
    print "slope:", slope, "intercept:", intercept, "Corr_coef:", Corr_coef, "p_value:", p_value, "std_err:", std_err

    name11 = dir + name + "_fits_dose_eff_R6.dat"

    file11 = open(name11, 'wt')
    for i in range(len(x_positions_fit)):
        print >> file11, x_positions_fit[
            i], intercept + x_positions_fit[i] * slope

    print >> file11, "\n\n", "y=", intercept, "+", slope, "*x",
    print "Bootstrap for dose eff. R6s:\n"

    mean_slope, standard_dev = bootstrap(x_positions_fit[0],
                                         x_positions_fit[-1],
                                         list_of_lists_for_bootstrap)
    zscore = (slope - mean_slope) / standard_dev

    print >> file11, "bootstrap:\n", "actual slope:", slope, "mean_slope:", mean_slope, "standard_dev:", standard_dev, "\n zscore:", zscore

    print x_positions_fit[0], x_positions_fit[
        -1], "actual slope:", slope, "mean_slope:", mean_slope, "standard_dev:", standard_dev, "\n zscore:", zscore

    file11.close()

    contador += 1
    file = open(name12, 'at')
    print >> file, contador, mean_slope, standard_dev, "dose_eff"
    file.close()

    #### averages for every R6's egonetwork:#########
    cont = 1
    list_all_ = []
    list_all_nodes_ = []
    for node in list_R6s:
        neighbors = G.neighbors(node)  #a list of nodes

        average_BMI_change = 0.0
        list_BMI_changes = []

        average_weight_change = 0.0
        list_weight_changes = []

        average_percentage_weight_change = 0.0
        list_percentage_weight_changes = []

        average_activity = 0.0  # ojo! sera dividida por el numero de dias!!!!!
        list_activities = []

        for n in G.neighbors(node):

            if int(G.node[n]['time_in_system']) > time_in_system:

                # list_BMI_changes.append(float(G.node[n]['final_BMI'])-float(G.node[n]['initial_BMI']))

                list_weight_changes.append(float(G.node[n]['weight_change']))

                list_percentage_weight_changes.append(
                    float(G.node[n]['percentage_weight_change']))

                list_activities.append(
                    float(G.node[n]['activity']) /
                    float(G.node[n]['time_in_system']))

                if n not in list_all_nodes_:
                    list_all_nodes_.append(n)
                    list_all_.append(
                        float(G.node[n]['percentage_weight_change']))

#averages
        average_weight_change = numpy.mean(list_weight_changes)
        #  average_BMI_change=numpy.mean(list_BMI_changes)
        average_activity = numpy.mean(list_activities)
        average_percentage_weight_change = numpy.mean(
            list_percentage_weight_changes)

        #standard deviation
        #deviation_BMI=numpy.std(list_BMI_changes)
        deviation_weight = numpy.std(list_weight_changes)
        deviation_percentage_weight = numpy.std(list_percentage_weight_changes)
        deviation_activity = numpy.std(list_activities)

        #print out
        name2 = dir + name + "_ego_R6s_average_weight_change_" + str(
            time_in_system) + "days.dat"
        file2 = open(name2, 'at')
        print >> file2, cont, G.node[node]['role'], G.node[node]['label'], len(
            G.neighbors(node)), average_weight_change, deviation_weight
        file2.close()

        name22 = dir + name + "_ego_R6s_average_percentage_weight_change_" + str(
            time_in_system) + "days.dat"
        file22 = open(name22, 'at')
        print >> file22, cont, G.node[node]['role'], G.node[node][
            'label'], len(
                G.neighbors(node)
            ), average_percentage_weight_change, deviation_percentage_weight
        file22.close()

        name3 = dir + name + "_ego_R6s_average_activity_" + str(
            time_in_system) + "days.dat"
        file3 = open(name3, 'at')
        print >> file3, cont, G.node[node]['role'], G.node[node]['label'], len(
            G.neighbors(node)), average_activity, deviation_activity
        file3.close()

        cont = cont + 1

    who = "R6s_egonetworks_all"
    Nbins = 18
    histograma(list_all_, Nbins, dir, name, who)

    #  print "intersection:",len(set(list_all_)&set(list_all)),len(list_all_),len(list_all)
    #############just checking what happens if we remove the 40155 guy

    ##### percent weight change vs. role:

    list_roles = ["R1", "R2", "R3", "R4", "R5", "R6", "R7"]

    file = open(dir + name + "_percentage_weight_change_vs_role", 'wt')
    cont = 1
    for role in list_roles:

        list_weight_changes_role = []
        for n in G.nodes():
            if G.node[n]['role'] == role:
                list_weight_changes_role.append(
                    G.node[n]['percentage_weight_change'])

        print >> file, cont, role, len(list_weight_changes_role), numpy.mean(
            list_weight_changes_role), numpy.std(list_weight_changes_role)

        cont += 1

    file.close()

    #############################

    ############## percentage weight change vs k
    x_positions_fit = []
    y_positions_fit = []

    cum_size_set = float(len(G.nodes()))

    list_of_lists_for_bootstrap = []

    list_k = []
    for n in G.nodes():
        list_k.append(len(G.neighbors(n)))

    max_k = max(list_k)

    file = open(dir + name + "_percentage_weight_change_vs_k.dat", 'wt')
    max_k = max_k + 1
    for k in range(1, max_k):

        num_users_set = cum_size_set

        list_percent_weight_change_k = []
        for n in G.nodes():
            if len(G.neighbors(n)) == k:
                list_percent_weight_change_k.append(
                    G.node[n]['percentage_weight_change'])
                cum_size_set -= 1.0

        if len(list_percent_weight_change_k) > 0:
            print >> file, k, len(
                list_percent_weight_change_k), num_users_set / float(
                    len(G.nodes())), numpy.mean(
                        list_percent_weight_change_k), numpy.std(
                            list_percent_weight_change_k)

            if len(x_positions_fit) <= 7:
                x_positions_fit.append(k)
                y_positions_fit.append(
                    numpy.mean(list_percent_weight_change_k))

                list_of_lists_for_bootstrap.append(
                    list_percent_weight_change_k)

    slope, intercept, Corr_coef, p_value, std_err = stats.linregress(
        x_positions_fit, y_positions_fit)  # least squeares polinomial fit

    print "result linear. fit for degree dependency:"
    print "slope:", slope, "intercept:", intercept, "Corr_coef:", Corr_coef, "p_value:", p_value, "std_err:", std_err

    file.close()

    name11 = dir + name + "_fits_degree.dat"

    file11 = open(name11, 'wt')
    for i in range(len(x_positions_fit)):
        print >> file11, x_positions_fit[
            i], intercept + x_positions_fit[i] * slope

    print >> file11, "\n\n", "y=", intercept, "+", slope, "*x",

    print "Bootstrap for degree:\n"

    mean_slope, standard_dev = bootstrap(x_positions_fit[0],
                                         x_positions_fit[-1],
                                         list_of_lists_for_bootstrap)
    zscore = (slope - mean_slope) / standard_dev

    print >> file11, "bootstrap:\n", "actual slope:", slope, "mean_slope:", mean_slope, "standard_dev:", standard_dev, "\n zscore:", zscore

    print x_positions_fit[0], x_positions_fit[
        -1], "actual slope:", slope, "mean_slope:", mean_slope, "standard_dev:", standard_dev, "\n zscore:", zscore

    file11.close()

    contador += 1
    file = open(name12, 'at')
    print >> file, contador, mean_slope, standard_dev, "degree"
    file.close()

    ########################################

    new_name = graph_name.split(".gml")[0]

    new_name = new_name + "_adherent_num_R6s_largest_clique.gml"

    nx.write_gml(G, new_name)
Exemple #21
0
def main(graph_name):
  
    H = nx.read_gml(graph_name)
   

    for node in H.nodes():  # i remove self loops
        if node in H.neighbors(node):          
            if len(H.neighbors(node))>1:
                H.remove_edge(node,node)             
            else:
                H.remove_node(node)              





    for node in H.nodes():        
        if H.node[node]['weigh_ins'] <5: #Adherent filter
            H.remove_node(node)
           # print node, "is going down"


    G= nx.connected_component_subgraphs(H)[0] # Giant component 
  
    print "final size of the GC:",len(G.nodes())
    
   

    #dir=graph_name.split("fr")[0]
    #dir=graph_name.split("master")[0]
    #dir=graph_name.split("method3_")[0]
    dir=graph_name.split("engaged_")[0]

    dir=dir+"roles/"
   

    print dir 

   
    time_in_system=100 #minimum amount of time in the sytem for a user to be included in the statistics

    #name=graph_name.split('data/')[1]
    name=graph_name.split('method3/')[1]

    print name
    name=name.split('.gml')[0]
   
    print name


    print dir+name


   
    name0=dir+name+"_overlap_R6s_averages_"+str(time_in_system)+"days_exclude_R6s_clinically_signif.dat"
    file0=open(name0, 'wt')    
    file0.close()
    





####for the Isolated Clusters:
    list_GC_nodes=[]
    for n in G.nodes():
        list_GC_nodes.append(n)
       # print G.node[n]['percentage_weight_change']

   # print "# users GC:",len(list_GC_nodes),"total:",len(H.nodes())

    



    list_weight_changes_not_GC=[]
    for n in H.nodes():       
        if n not in list_GC_nodes:
            #print n,"not in GC"
            list_weight_changes_not_GC.append(float(H.node[n]['percentage_weight_change'])) 

    #print  "# users not in GC:",len(list_weight_changes_not_GC)


  


    who="not_GC"
    Nbins=18
    histograma(list_weight_changes_not_GC,Nbins,dir,name,who)


 ###########################  



    list_R6s=[]     # collect the R6 of the system
    list_R6s_label=[]
    list_R6s_percent_weight_change=[] 
    for node in G.nodes() :    
        if str(G.node[node]['role']) == "R6" :
          list_R6s.append(node)
          list_R6s_label.append(G.node[node]['label'])
          list_R6s_percent_weight_change.append(float(G.node[node]['percentage_weight_change'])) 



     
    name00=dir+name+"R6s_and_top_tens_averages_"+str(time_in_system)+"days_exclude_R6s_clinically_signif.dat"
           
    file0=open(name00, 'at')
    print >> file0,"R6s",numpy.mean(list_R6s_percent_weight_change),numpy.std(list_R6s_percent_weight_change)
    file0.close()
    


  #  print "\n\n R6s:\n"
   # for i in  list_R6s_label:
    #    print i

 # studying the possible cumulative effect of more than one R6 on the population:
    for node in G.nodes():
        cont=0
        for n in  G.neighbors(node):
            if str(G.node[n]['role']) == "R6" :
                cont+=1

        G.node[node]["R6_overlap"]=int(cont)











    ##### weight change for people not connected to any R6s:####

    list_weight_changes_no_neighbors=[]
    for node in G.nodes():
        interseccion=list(set(G.neighbors(node)) & set(list_R6s))

       # print node, "intersection:",intersection,len(intersection)
     #   print "because", list_R6s, "and ",G.neighbors(node)
      #  raw_input()
        if len(interseccion)==0:
            list_weight_changes_no_neighbors.append(G.node[node]['percentage_weight_change'])
       

  #  print len(list_weight_changes_no_neighbors),"no_neighbors"



    who="no_neigbors_R6s"
    Nbins=18
    histograma(list_weight_changes_no_neighbors,Nbins,dir,name,who)


# mood test
    mood=stats.mood(list_weight_changes_no_neighbors,list_weight_changes_not_GC) 
    print "mood test for",who, "against not_GC:",mood
    
########
# K-S test:
    ks=stats.ks_2samp(list_weight_changes_no_neighbors,list_weight_changes_not_GC) 
    print "KS test for",who, "against not_GC:",ks
    
    name00="ks_results_clinically_signif.dat"
           
    file0=open(dir+name00, 'at')
    print >> file0, "KS test for",who,"of",graph_name, "against not_GC:",ks
    file0.close()
#############################################

    
    













#average percentage weight change as a function of the size of the largest CLIQUE the node belongs to:



   

    absolute_max=1
    for i in G.nodes():        
       
        maximo=1     
        list2=nx.cliques_containing_node(G, i)
       # print i, list2
       
        for elem in list2:
           # print elem,len(elem,)
            if len(elem) > maximo:
                maximo=len(elem)
       # print "\n",maximo
        G.node[i]['max_clique_size']=maximo
       
        if absolute_max < maximo:
            absolute_max = maximo


    print absolute_max

    lista=list(nx.find_cliques(G)) # crea una lista de cliques (lista de listas)
    max_clique=nx.graph_clique_number(G)  #finds out max size clique
    num_tot_clique=nx.graph_number_of_cliques(G) #finds out total number of cliques

# count number of 2, 3, 4, 5, 6  and 7cliques:

    num_2cliques=0
    num_3cliques=0
    num_4cliques=0
    num_5cliques=0
    num_6cliques=0
    num_7cliques=0
    num_8cliques=0
    num_9cliques=0


   
    for element in lista: 
        if len(element)==2:
            num_2cliques=num_2cliques +1
           
        elif len(element)==3:
            num_3cliques=num_3cliques+1
           
        elif len(element)==4:
            num_4cliques=num_4cliques+1
          
        elif len(element)==5:
            num_5cliques=num_5cliques+1
           
        elif len(element)==6:
            num_6cliques=num_6cliques+1
           
        elif len(element)==7:
            num_7cliques=num_7cliques+1            

        elif len(element)==8:
            num_8cliques=num_8cliques+1    
           
        elif len(element)==9:
            num_9cliques=num_9cliques+1
           
           


    print " 2: ",num_2cliques, "     3: ",num_3cliques, "   4: ",num_4cliques, "     5: ",num_5cliques, "   6: ",num_6cliques, "   7: ",num_7cliques, "   8: ",num_8cliques, "   9: ",num_9cliques, "   max_clique_size:",max_clique, "   num_tot_cliques:", num_tot_clique





    name33=dir+name+"_percent_weight_change_vs_largest_clique_size_clinically_signif.dat" 
    file11=open(name33, 'wt')  
    file11.close()


    cum_size_set=float(len(G.nodes()))

    tot_nodes=[]

  

    for clique_size in range(max_clique):
        clique_size=clique_size+1

        num_users_clinically_signif=0.0       


        num_users_set=cum_size_set

        percent_weight_change_that_clique_size=[]
        for n in G.nodes():

            if G.node[n]['max_clique_size']==clique_size:
                percent_weight_change_that_clique_size.append(float(G.node[n]['percentage_weight_change']))
        
                tot_nodes.append(float(G.node[n]['percentage_weight_change']))

                cum_size_set-=1.0


                if G.node [n]['percentage_weight_change']<=-5.0:
                    num_users_clinically_signif+=1.0



        try:
            file11=open(name33, 'at')  
            print >> file11,clique_size,len(percent_weight_change_that_clique_size),num_users_set/float(len(G.nodes())),num_users_clinically_signif/len(percent_weight_change_that_clique_size),numpy.mean(percent_weight_change_that_clique_size),numpy.std(percent_weight_change_that_clique_size)
            file11.close()

        except ZeroDivisionError:
            file11=open(name33, 'at')  
            print >> file11,clique_size,len(percent_weight_change_that_clique_size),num_users_set/float(len(G.nodes())),0.0 ,numpy.mean(percent_weight_change_that_clique_size),numpy.std(percent_weight_change_that_clique_size)
            file11.close()


#######################################









    #####dose effect of the R6s independently########

    name11=dir+name+"_dose_eff_indepently_only_one_R6_"+str(time_in_system)+"days_exclude_R6s.dat" 
    file11=open(name11, 'at')  
    print >> file11,0,"average_no_neighbors","average_no_neighbors","average_no_neighbors",len(list_weight_changes_no_neighbors),numpy.mean(list_weight_changes_no_neighbors),numpy.std(list_weight_changes_no_neighbors)  # the first line of the file is actually for no_neighbors, the rest, for one_and_only_one
    file11.close()
    







    file11=open(name11, 'wt')   
    file11.close()


    cont=1    
    list_all=[]
    list_all_nodes=[]
    for R6 in list_R6s:
        list_weight_changes=[]
        for n in G.neighbors(R6):
            if (G.node[n]['role'] != "R6")  and ( G.node[n]["R6_overlap"]==1) :
                list_weight_changes.append(float(G.node[n]['percentage_weight_change']))


                if n not in list_all_nodes:
                    list_all_nodes.append(n)
                    list_all.append(float(G.node[n]['percentage_weight_change']))


        if len(list_weight_changes)>0:

            file11=open(name11, 'at')  
            print >> file11,cont,G.node[R6]['role'],G.node[R6]['label'],len(G.neighbors(R6)),len(list_weight_changes),numpy.mean(list_weight_changes),numpy.std(list_weight_changes)
            file11.close()
           # print cont,G.node[R6]['role'],G.node[R6]['label'], len(G.neighbors(R6)),len(list_weight_changes),numpy.mean(list_weight_changes),numpy.std(list_weight_changes)
            cont=cont+1


          

        else:
           # file11=open(name11, 'at')  
            #print >> file11,cont,G.node[R6]['role'],G.node[R6]['label'],len(G.neighbors(R6)),len(list_weight_changes)
            #file11.close()
           # print cont,G.node[R6]['role'],G.node[R6]['label'],len(G.neighbors(R6)),len(list_weight_changes)
            cont=cont+1


    who="one_and_only_one_R6s"
    Nbins=18
    histograma(list_all,Nbins,dir,name,who)

  


  ####################################












    cum_size_set=float(len(G.nodes()))-float(len(list_R6s))
    for  r in range(len(list_R6s)+1):    
       
       # list_BMI_changes=[]               
        list_weight_changes=[]                
        list_percentage_weight_changes=[]    
        list_activities=[]
        num_users_clinically_signif=0.0        


        num_users_set=cum_size_set
        for node in G.nodes():

            if int(G.node[node]["R6_overlap"])==r:

              
                
                if G.node[node]["role"]== "R6":  # i exclude the R6s                    
                    
                    pass
                else:
                    
                    if int(G.node[node]['time_in_system']) > time_in_system:                                               
                        
                       
                     #   list_BMI_changes.append(float(G.node[node]['final_BMI'])-float(G.node[node]['initial_BMI']))
                        list_weight_changes.append(float(G.node[node]['weight_change']))
                        list_percentage_weight_changes.append(float(G.node[node]['percentage_weight_change']))
                        list_activities.append(float(G.node[node]['activity'])/float(G.node[node]['time_in_system']))
                        cum_size_set-=1.0

                        if G.node [node]['percentage_weight_change']<=-5.0:
                            num_users_clinically_signif+=1.0



        if len(list_percentage_weight_changes)>0:
           # average_BMI_change=numpy.mean(list_BMI_changes)
            average_weight_change=numpy.mean(list_weight_changes)
            average_percentage_weight_change=numpy.mean(list_percentage_weight_changes)
            average_activity=numpy.mean(list_activities)
            
            #deviation_BMI=numpy.std(list_BMI_changes)       
            deviation_weight=numpy.std(list_weight_changes)
            deviation_percentage_weight=numpy.std(list_percentage_weight_changes)
            deviation_activity=numpy.std(list_activities) 


#print out
            try:
                file0=open(name0, 'at')
                print >> file0,r,len(list_percentage_weight_changes),num_users_set/float(len(G.nodes())),num_users_clinically_signif/len(list_percentage_weight_changes),average_percentage_weight_change,deviation_percentage_weight,average_weight_change,deviation_weight,average_activity,deviation_activity
                file0.close()
       
            except ZeroDivisionError:
                file11=open(name33, 'at')  
                print >> file11,clique_size,len(percent_weight_change_that_clique_size),num_users_set/float(len(G.nodes())),0.0 ,numpy.mean(percent_weight_change_that_clique_size),numpy.std(percent_weight_change_that_clique_size)
                file11.close()









            
   #### averages for every R6's egonetwork:#########
    cont=1  
    list_all_=[]
    list_all_nodes_=[]
    for node in list_R6s:  
        neighbors=G.neighbors(node)#a list of nodes               
        
        average_BMI_change=0.0               
        list_BMI_changes=[]
        
        average_weight_change=0.0       
        list_weight_changes=[]

        average_percentage_weight_change=0.0       
        list_percentage_weight_changes=[]
         
        average_activity=0.0     # ojo! sera dividida por el numero de dias!!!!!
        list_activities=[]
          
      
        
       

        for n in G.neighbors(node):
           
            if int(G.node[n]['time_in_system']) > time_in_system:
               
                         
                
               # list_BMI_changes.append(float(G.node[n]['final_BMI'])-float(G.node[n]['initial_BMI']))
                               
                list_weight_changes.append(float(G.node[n]['weight_change']))


                list_percentage_weight_changes.append(float(G.node[n]['percentage_weight_change']))

                
                                             
                list_activities.append(float(G.node[n]['activity'])/float(G.node[n]['time_in_system']))

                if n  not in list_all_nodes_:
                    list_all_nodes_.append(n)               
                    list_all_.append(float(G.node[n]['percentage_weight_change']))

#averages 
        average_weight_change=numpy.mean(list_weight_changes)
      #  average_BMI_change=numpy.mean(list_BMI_changes)
        average_activity=numpy.mean(list_activities)
        average_percentage_weight_change=numpy.mean(list_percentage_weight_changes)

      



#standard deviation
        #deviation_BMI=numpy.std(list_BMI_changes)       
        deviation_weight=numpy.std(list_weight_changes)
        deviation_percentage_weight=numpy.std(list_percentage_weight_changes)
        deviation_activity=numpy.std(list_activities) 
        


#print out
        name2=dir+name+"_ego_R6s_average_weight_change_"+str(time_in_system)+"days.dat"
        file2=open(name2, 'at')
        print >> file2,cont,G.node[node]['role'],G.node[node]['label'],len(G.neighbors(node)),average_weight_change,deviation_weight
        file2.close()


        name22=dir+name+"_ego_R6s_average_percentage_weight_change_"+str(time_in_system)+"days.dat"
        file22=open(name22, 'at')
        print >> file22,cont,G.node[node]['role'],G.node[node]['label'],len(G.neighbors(node)),average_percentage_weight_change,deviation_percentage_weight
        file22.close()


        name3=dir+name+"_ego_R6s_average_activity_"+str(time_in_system)+"days.dat"
        file3=open(name3, 'at')
        print >> file3,cont,G.node[node]['role'],G.node[node]['label'],len(G.neighbors(node)),average_activity,deviation_activity
        file3.close()


   


        cont=cont+1




    who="R6s_egonetworks_all"
    Nbins=18
    histograma(list_all_,Nbins,dir,name,who)

  #  print "intersection:",len(set(list_all_)&set(list_all)),len(list_all_),len(list_all)
       #############just checking what happens if we remove the 40155 guy








##### percent weight change vs. role:

    list_roles=["R1","R2","R3","R4","R5","R6","R7"]

    file = open(dir+name+"_percentage_weight_change_vs_role",'wt')
    cont=1
    for role in list_roles:

        list_weight_changes_role=[]
        for n in G.nodes():
            if G.node[n]['role']==role:
                list_weight_changes_role.append(G.node[n]['percentage_weight_change'])
                
        print >> file, cont, role, len(list_weight_changes_role),numpy.mean(list_weight_changes_role),numpy.std(list_weight_changes_role)

        cont+=1

    file.close()



#############################







############## percentage weight change vs k

   
    cum_size_set=float(len(G.nodes()))

    list_k=[]
    for n in G.nodes():
        list_k.append(len(G.neighbors(n)))        

    max_k=max(list_k)

   

    file = open(dir+name+"_percentage_weight_change_vs_k_clinically_signif.dat",'wt')
    max_k=max_k+1
    for k in range(1,max_k):   


        num_users_clinically_signif=0.0        
        num_users_set=cum_size_set
    
        list_percent_weight_change_k=[]
        for n in G.nodes():
            if len(G.neighbors(n))==k:
                list_percent_weight_change_k.append(G.node[n]['percentage_weight_change'])
                cum_size_set-=1.0

                if G.node [n]['percentage_weight_change']<=-5.0:
                    num_users_clinically_signif+=1.0



        if len(list_percent_weight_change_k)>0:

            try:

                print >> file,k, len(list_percent_weight_change_k),num_users_set/float(len(G.nodes())),num_users_clinically_signif/len(list_percent_weight_change_k),numpy.mean(list_percent_weight_change_k),numpy.std(list_percent_weight_change_k)

            except ZeroDivisionError:
                file11=open(name33, 'at')  
                print >> file11,clique_size,len(percent_weight_change_that_clique_size),num_users_set/float(len(G.nodes())),0.0 ,numpy.mean(percent_weight_change_that_clique_size),numpy.std(percent_weight_change_that_clique_size)
                file11.close()
                



    file.close()
Exemple #22
0
# Interpret the p-value of the test that was run just before this point.
print('Same distributions (fail to reject H0)'
      if p_value > alpha else 'Different distributions (reject H0)')

# --- Levene test on the 'Open' and 'Adj Close' columns ---
stat, p_value = levene(dataset['Open'], dataset['Adj Close'])

print('Levene Test')
print('-' * 40)
print('Statistics=%.3f, p=%.3f' % (stat, p_value))
# Interpret against the significance level.
alpha = 0.05
print('Same distributions (fail to reject H0)'
      if p_value > alpha else 'Different distributions (reject H0)')

# --- Mood test on the same two columns ---
stat, p_value = mood(dataset['Open'], dataset['Adj Close'])

print('Mood Test')
print('-' * 40)
print('Statistics=%.3f, p=%.3f' % (stat, p_value))
# Interpret against the significance level.
alpha = 0.05
print('Same distributions (fail to reject H0)'
      if p_value > alpha else 'Different distributions (reject H0)')

# --- Median test on 'Open', 'Adj Close' and 'Volume' ---
stat, p_value, med, tbl = median_test(
    dataset['Open'],
    dataset['Adj Close'],
    dataset['Volume'],
)

print('Mood’s median test')
print('-' * 40)
Exemple #23
0
def custom(a, b):
    """Return only the p-value that ``stats.mood`` reports for *a* and *b*."""
    statistic, p_value = stats.mood(a, b)
    del statistic  # the test statistic itself is not needed by callers
    return p_value
Exemple #24
0
#==============================================================================
# Disabled feature (kept for reference):
# print("mannwhitneyu")
# data['mannwhitneyu'] = [mannwhitneyu(x, y)[0] for (x, y) in zip(np.nan_to_num(question1_vectors),
#                                                           np.nan_to_num(question2_vectors))]
#==============================================================================

# Each feature below stores element [0] of the test result for every pair of
# question vectors, after np.nan_to_num has been applied to both inputs.

print("fligner")
data['fligner'] = [
    fligner(vec1, vec2)[0]
    for vec1, vec2 in zip(np.nan_to_num(question1_vectors),
                          np.nan_to_num(question2_vectors))
]

print("mood")
data['mood'] = [
    mood(vec1, vec2)[0]
    for vec1, vec2 in zip(np.nan_to_num(question1_vectors),
                          np.nan_to_num(question2_vectors))
]

print("ks_2samp")
data['ks_2samp'] = [
    ks_2samp(vec1, vec2)[0]
    for vec1, vec2 in zip(np.nan_to_num(question1_vectors),
                          np.nan_to_num(question2_vectors))
]

print("wilcoxon")
data['wilcoxon'] = [
    wilcoxon(vec1, vec2)[0]
    for vec1, vec2 in zip(np.nan_to_num(question1_vectors),
                          np.nan_to_num(question2_vectors))
]
Exemple #25
0
def main():
    """Apply one ``scipy.stats`` routine to every row of a tab-separated file.

    Command-line options select the input and output paths, the routine
    (``--test_id``) and the 1-based column numbers that make up each sample
    (``--sample_one_cols`` / ``--sample_two_cols``, comma separated, or
    ``--sample_cols`` with ``;`` between groups).  For every input line the
    selected cells are converted to floats, handed to the chosen routine and
    the results are appended to the row, which is then written tab-separated
    to the output file.  A row is written back unchanged when ``--test_id``
    matches none of the known routine names.

    Grouped samples (``--sample_cols``) are resolved per line by the
    ``columns_to_values`` helper defined elsewhere in this file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True, help="Tabular file.")
    parser.add_argument("-o",
                        "--outfile",
                        required=True,
                        help="Path to the output file.")
    # NOTE(review): the help text of the three *_cols options (and of --N)
    # looks copy-pasted from another tool; left untouched here.
    parser.add_argument("--sample_one_cols",
                        help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_two_cols",
                        help="Input format, like smi, sdf, inchi")
    parser.add_argument(
        "--sample_cols",
        help="Input format, like smi, sdf, inchi,separate arrays using ;",
    )
    parser.add_argument("--test_id", help="statistical test method")
    parser.add_argument(
        "--mwu_use_continuity",
        action="store_true",
        default=False,
        help="Whether a continuity correction (1/2.) should be taken into account.",
    )
    parser.add_argument(
        "--equal_var",
        action="store_true",
        default=False,
        help="If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.",
    )
    parser.add_argument(
        "--reta",
        action="store_true",
        default=False,
        help="Whether or not to return the internally computed a values.",
    )
    parser.add_argument(
        "--fisher",
        action="store_true",
        default=False,
        help="if true then Fisher definition is used",
    )
    parser.add_argument(
        "--bias",
        action="store_true",
        default=False,
        help="if false,then the calculations are corrected for statistical bias",
    )
    parser.add_argument(
        "--inclusive1",
        action="store_true",
        default=False,
        help="if false,lower_limit will be ignored",
    )
    parser.add_argument(
        "--inclusive2",
        action="store_true",
        default=False,
        help="if false,higher_limit will be ignored",
    )
    parser.add_argument(
        "--inclusive",
        action="store_true",
        default=False,
        help="if false,limit will be ignored",
    )
    parser.add_argument(
        "--printextras",
        action="store_true",
        default=False,
        help="If True, if there are extra points a warning is raised saying how many of those points there are",
    )
    # The default used to be the *string* "False", which is truthy, so the
    # option was effectively always switched on.  Use a real boolean.
    parser.add_argument(
        "--initial_lexsort",
        action="store_true",
        default=False,
        help="Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.",
    )
    parser.add_argument(
        "--correction",
        action="store_true",
        default=False,
        help="continuity correction ",
    )
    parser.add_argument(
        "--axis",
        type=int,
        default=0,
        help="Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)",
    )
    parser.add_argument(
        "--n",
        type=int,
        default=0,
        help="the number of trials. This is ignored if x gives both the number of successes and failures",
    )
    parser.add_argument("--b",
                        type=int,
                        default=0,
                        help="The number of bins to use for the histogram")
    parser.add_argument("--N",
                        type=int,
                        default=0,
                        help="Score that is compared to the elements in a.")
    parser.add_argument("--ddof",
                        type=int,
                        default=0,
                        help="Degrees of freedom correction")
    parser.add_argument(
        "--score",
        type=int,
        default=0,
        help="Score that is compared to the elements in a.",
    )
    parser.add_argument("--m", type=float, default=0.0, help="limits")
    parser.add_argument("--mf", type=float, default=2.0, help="lower limit")
    parser.add_argument("--nf", type=float, default=99.9, help="higher_limit")
    parser.add_argument(
        "--p",
        type=float,
        default=0.5,
        help="The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5",
    )
    parser.add_argument("--alpha", type=float, default=0.9, help="probability")
    parser.add_argument(
        "--new",
        type=float,
        default=0.0,
        help="Value to put in place of values in a outside of bounds",
    )
    parser.add_argument(
        "--proportiontocut",
        type=float,
        default=0.0,
        help="Proportion (in range 0-1) of total data set to trim of each end.",
    )
    parser.add_argument(
        "--lambda_",
        type=float,
        default=1.0,
        help="lambda_ gives the power in the Cressie-Read power divergence statistic",
    )
    parser.add_argument(
        "--imbda",
        type=float,
        default=0,
        help="If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.",
    )
    parser.add_argument(
        "--base",
        type=float,
        default=1.6,
        help="The logarithmic base to use, defaults to e",
    )
    parser.add_argument("--dtype", help="dtype")
    parser.add_argument("--med", help="med")
    parser.add_argument("--cdf", help="cdf")
    parser.add_argument("--zero_method", help="zero_method options")
    parser.add_argument("--dist", help="dist options")
    parser.add_argument("--ties", help="ties options")
    parser.add_argument("--alternative", help="alternative options")
    parser.add_argument("--mode", help="mode options")
    parser.add_argument("--method", help="method options")
    parser.add_argument("--md", help="md options")
    parser.add_argument("--center", help="center options")
    parser.add_argument("--kind", help="kind options")
    parser.add_argument("--tail", help="tail options")
    parser.add_argument("--interpolation", help="interpolation options")
    parser.add_argument("--statistic", help="statistic options")

    args = parser.parse_args()
    infile = args.infile
    test_id = args.test_id
    nf = args.nf
    mf = args.mf
    imbda = args.imbda
    inclusive1 = args.inclusive1
    inclusive2 = args.inclusive2

    def as_floats(values):
        """Return *values* as a concrete list of floats.

        A real list (rather than ``map(float, ...)``) keeps the scipy calls
        working on Python 3, where ``map`` yields a lazy iterator.
        """
        return [float(value) for value in values]

    # Column selections; an option that was not given selects nothing.
    grouped_cols = None
    if args.sample_cols is not None:
        grouped_cols = [[int(col) for col in group.split(",")]
                        for group in args.sample_cols.split(";")]
    sample_one_cols = []
    if args.sample_one_cols is not None:
        sample_one_cols = args.sample_one_cols.split(",")
    has_sample_two = args.sample_two_cols is not None
    sample_two_cols = []
    if has_sample_two:
        sample_two_cols = args.sample_two_cols.split(",")

    # NOTE(review): several routines used below (e.g. nanmean, nanmedian,
    # nanstd, itemfreq, signaltonoise, threshold, histogram, histogram2) are
    # believed to be absent from recent SciPy releases -- confirm against the
    # SciPy version this tool is pinned to.

    # Context managers close both files even when a statistic raises.
    with open(args.outfile, "w+") as outfile, open(infile) as rows:
        for line in rows:
            cols = line.strip().split("\t")
            if grouped_cols is not None:
                b_samples = columns_to_values(grouped_cols, line)
            # Column numbers on the command line are 1-based.
            sample_one = [cols[int(index) - 1] for index in sample_one_cols]
            sample_two = [cols[int(index) - 1] for index in sample_two_cols]

            test = test_id.strip()
            if test == "describe":
                size, min_max, mean, uv, bs, bk = stats.describe(
                    as_floats(sample_one))
                cols.append(size)
                cols.append(min_max)
                cols.append(mean)
                cols.append(uv)
                cols.append(bs)
                cols.append(bk)
            elif test == "mode":
                vals, counts = stats.mode(as_floats(sample_one))
                cols.append(vals)
                cols.append(counts)
            elif test == "nanmean":
                m = stats.nanmean(as_floats(sample_one))
                cols.append(m)
            elif test == "nanmedian":
                m = stats.nanmedian(as_floats(sample_one))
                cols.append(m)
            elif test == "kurtosistest":
                z_value, p_value = stats.kurtosistest(as_floats(sample_one))
                cols.append(z_value)
                cols.append(p_value)
            elif test == "variation":
                ra = stats.variation(as_floats(sample_one))
                cols.append(ra)
            elif test == "itemfreq":
                freq = stats.itemfreq(as_floats(sample_one))
                for row in freq:
                    cols.append(",".join(map(str, row)))
            elif test == "boxcox_llf":
                llf = stats.boxcox_llf(imbda, as_floats(sample_one))
                cols.append(llf)
            elif test == "tiecorrect":
                fa = stats.tiecorrect(as_floats(sample_one))
                cols.append(fa)
            elif test == "rankdata":
                r = stats.rankdata(as_floats(sample_one), method=args.md)
                cols.append(r)
            elif test == "nanstd":
                s = stats.nanstd(as_floats(sample_one), bias=args.bias)
                cols.append(s)
            elif test == "anderson":
                A2, critical, sig = stats.anderson(as_floats(sample_one),
                                                   dist=args.dist)
                cols.append(A2)
                for value in critical:
                    cols.append(value)
                # A literal "," cell separates critical values from
                # significance levels in the output row.
                cols.append(",")
                for value in sig:
                    cols.append(value)
            elif test == "binom_test":
                p_value = stats.binom_test(as_floats(sample_one),
                                           n=args.n,
                                           p=args.p)
                cols.append(p_value)
            elif test == "gmean":
                gm = stats.gmean(as_floats(sample_one), dtype=args.dtype)
                cols.append(gm)
            elif test == "hmean":
                hm = stats.hmean(as_floats(sample_one), dtype=args.dtype)
                cols.append(hm)
            elif test == "kurtosis":
                k = stats.kurtosis(
                    as_floats(sample_one),
                    axis=args.axis,
                    fisher=args.fisher,
                    bias=args.bias,
                )
                cols.append(k)
            elif test == "moment":
                n_moment = stats.moment(as_floats(sample_one), n=args.n)
                cols.append(n_moment)
            elif test == "normaltest":
                k2, p_value = stats.normaltest(as_floats(sample_one))
                cols.append(k2)
                cols.append(p_value)
            elif test == "skew":
                skewness = stats.skew(as_floats(sample_one), bias=args.bias)
                cols.append(skewness)
            elif test == "skewtest":
                z_value, p_value = stats.skewtest(as_floats(sample_one))
                cols.append(z_value)
                cols.append(p_value)
            elif test == "sem":
                s = stats.sem(as_floats(sample_one), ddof=args.ddof)
                cols.append(s)
            elif test == "zscore":
                z = stats.zscore(as_floats(sample_one), ddof=args.ddof)
                for value in z:
                    cols.append(value)
            elif test == "signaltonoise":
                s2n = stats.signaltonoise(as_floats(sample_one),
                                          ddof=args.ddof)
                cols.append(s2n)
            elif test == "percentileofscore":
                p = stats.percentileofscore(as_floats(sample_one),
                                            score=args.score,
                                            kind=args.kind)
                cols.append(p)
            elif test == "bayes_mvs":
                c_mean, c_var, c_std = stats.bayes_mvs(as_floats(sample_one),
                                                       alpha=args.alpha)
                cols.append(c_mean)
                cols.append(c_var)
                cols.append(c_std)
            elif test == "sigmaclip":
                c, c_low, c_up = stats.sigmaclip(as_floats(sample_one),
                                                 low=args.m,
                                                 high=args.n)
                cols.append(c)
                cols.append(c_low)
                cols.append(c_up)
            elif test == "kstest":
                d, p_value = stats.kstest(
                    as_floats(sample_one),
                    cdf=args.cdf,
                    N=args.N,
                    alternative=args.alternative,
                    mode=args.mode,
                )
                cols.append(d)
                cols.append(p_value)
            elif test == "chi2_contingency":
                chi2, p, dof, ex = stats.chi2_contingency(
                    as_floats(sample_one),
                    correction=args.correction,
                    lambda_=args.lambda_)
                cols.append(chi2)
                cols.append(p)
                cols.append(dof)
                cols.append(ex)
            elif test == "tmean":
                if nf == 0 and mf == 0:
                    mean = stats.tmean(as_floats(sample_one))
                else:
                    mean = stats.tmean(as_floats(sample_one), (mf, nf),
                                       (inclusive1, inclusive2))
                cols.append(mean)
            elif test == "tmin":
                if mf == 0:
                    minimum = stats.tmin(as_floats(sample_one))
                else:
                    minimum = stats.tmin(as_floats(sample_one),
                                         lowerlimit=mf,
                                         inclusive=args.inclusive)
                cols.append(minimum)
            elif test == "tmax":
                if nf == 0:
                    maximum = stats.tmax(as_floats(sample_one))
                else:
                    maximum = stats.tmax(as_floats(sample_one),
                                         upperlimit=nf,
                                         inclusive=args.inclusive)
                cols.append(maximum)
            elif test == "tvar":
                if nf == 0 and mf == 0:
                    var = stats.tvar(as_floats(sample_one))
                else:
                    var = stats.tvar(as_floats(sample_one), (mf, nf),
                                     (inclusive1, inclusive2))
                cols.append(var)
            elif test == "tstd":
                if nf == 0 and mf == 0:
                    std = stats.tstd(as_floats(sample_one))
                else:
                    std = stats.tstd(as_floats(sample_one), (mf, nf),
                                     (inclusive1, inclusive2))
                cols.append(std)
            elif test == "tsem":
                if nf == 0 and mf == 0:
                    s = stats.tsem(as_floats(sample_one))
                else:
                    s = stats.tsem(as_floats(sample_one), (mf, nf),
                                   (inclusive1, inclusive2))
                cols.append(s)
            elif test == "scoreatpercentile":
                if nf == 0 and mf == 0:
                    s = stats.scoreatpercentile(
                        as_floats(sample_one),
                        as_floats(sample_two),
                        interpolation_method=args.interpolation,
                    )
                else:
                    s = stats.scoreatpercentile(
                        as_floats(sample_one),
                        as_floats(sample_two),
                        (mf, nf),
                        interpolation_method=args.interpolation,
                    )
                for value in s:
                    cols.append(value)
            elif test == "relfreq":
                if nf == 0 and mf == 0:
                    rel, low_range, binsize, ex = stats.relfreq(
                        as_floats(sample_one), args.b)
                else:
                    rel, low_range, binsize, ex = stats.relfreq(
                        as_floats(sample_one), args.b, (mf, nf))
                for value in rel:
                    cols.append(value)
                cols.append(low_range)
                cols.append(binsize)
                cols.append(ex)
            elif test == "binned_statistic":
                if nf == 0 and mf == 0:
                    st, b_edge, b_n = stats.binned_statistic(
                        as_floats(sample_one),
                        as_floats(sample_two),
                        statistic=args.statistic,
                        bins=args.b,
                    )
                else:
                    st, b_edge, b_n = stats.binned_statistic(
                        as_floats(sample_one),
                        as_floats(sample_two),
                        statistic=args.statistic,
                        bins=args.b,
                        range=(mf, nf),
                    )
                cols.append(st)
                cols.append(b_edge)
                cols.append(b_n)
            elif test == "threshold":
                if nf == 0 and mf == 0:
                    o = stats.threshold(as_floats(sample_one),
                                        newval=args.new)
                else:
                    o = stats.threshold(as_floats(sample_one),
                                        mf,
                                        nf,
                                        newval=args.new)
                for value in o:
                    cols.append(value)
            elif test == "trimboth":
                o = stats.trimboth(as_floats(sample_one),
                                   proportiontocut=args.proportiontocut)
                for value in o:
                    cols.append(value)
            elif test == "trim1":
                t1 = stats.trim1(
                    as_floats(sample_one),
                    proportiontocut=args.proportiontocut,
                    tail=args.tail,
                )
                for value in t1:
                    cols.append(value)
            elif test == "histogram":
                if nf == 0 and mf == 0:
                    hi, low_range, binsize, ex = stats.histogram(
                        as_floats(sample_one), args.b)
                else:
                    hi, low_range, binsize, ex = stats.histogram(
                        as_floats(sample_one), args.b, (mf, nf))
                cols.append(hi)
                cols.append(low_range)
                cols.append(binsize)
                cols.append(ex)
            elif test == "cumfreq":
                if nf == 0 and mf == 0:
                    cum, low_range, binsize, ex = stats.cumfreq(
                        as_floats(sample_one), args.b)
                else:
                    cum, low_range, binsize, ex = stats.cumfreq(
                        as_floats(sample_one), args.b, (mf, nf))
                cols.append(cum)
                cols.append(low_range)
                cols.append(binsize)
                cols.append(ex)
            elif test == "boxcox_normmax":
                if nf == 0 and mf == 0:
                    ma = stats.boxcox_normmax(as_floats(sample_one))
                else:
                    ma = stats.boxcox_normmax(as_floats(sample_one),
                                              (mf, nf),
                                              method=args.method)
                cols.append(ma)
            elif test == "boxcox":
                if imbda == 0:
                    box, ma, ci = stats.boxcox(as_floats(sample_one),
                                               alpha=args.alpha)
                    cols.append(box)
                    cols.append(ma)
                    cols.append(ci)
                else:
                    box = stats.boxcox(as_floats(sample_one),
                                       imbda,
                                       alpha=args.alpha)
                    cols.append(box)
            elif test == "histogram2":
                h2 = stats.histogram2(as_floats(sample_one),
                                      as_floats(sample_two))
                for value in h2:
                    cols.append(value)
            elif test == "ranksums":
                z_statistic, p_value = stats.ranksums(as_floats(sample_one),
                                                      as_floats(sample_two))
                cols.append(z_statistic)
                cols.append(p_value)
            elif test == "ttest_1samp":
                t, prob = stats.ttest_1samp(as_floats(sample_one),
                                            as_floats(sample_two))
                for value in t:
                    cols.append(value)
                for value in prob:
                    cols.append(value)
            elif test == "ansari":
                AB, p_value = stats.ansari(as_floats(sample_one),
                                           as_floats(sample_two))
                cols.append(AB)
                cols.append(p_value)
            elif test == "linregress":
                slope, intercept, r_value, p_value, stderr = stats.linregress(
                    as_floats(sample_one), as_floats(sample_two))
                cols.append(slope)
                cols.append(intercept)
                cols.append(r_value)
                cols.append(p_value)
                cols.append(stderr)
            elif test == "pearsonr":
                cor, p_value = stats.pearsonr(as_floats(sample_one),
                                              as_floats(sample_two))
                cols.append(cor)
                cols.append(p_value)
            elif test == "pointbiserialr":
                r, p_value = stats.pointbiserialr(as_floats(sample_one),
                                                  as_floats(sample_two))
                cols.append(r)
                cols.append(p_value)
            elif test == "ks_2samp":
                d, p_value = stats.ks_2samp(as_floats(sample_one),
                                            as_floats(sample_two))
                cols.append(d)
                cols.append(p_value)
            elif test == "mannwhitneyu":
                mw_stats_u, p_value = stats.mannwhitneyu(
                    as_floats(sample_one),
                    as_floats(sample_two),
                    use_continuity=args.mwu_use_continuity,
                )
                cols.append(mw_stats_u)
                cols.append(p_value)
            elif test == "zmap":
                z = stats.zmap(as_floats(sample_one),
                               as_floats(sample_two),
                               ddof=args.ddof)
                for value in z:
                    cols.append(value)
            elif test == "ttest_ind":
                mw_stats_u, p_value = stats.ttest_ind(
                    as_floats(sample_one),
                    as_floats(sample_two),
                    equal_var=args.equal_var)
                cols.append(mw_stats_u)
                cols.append(p_value)
            elif test == "ttest_rel":
                t, prob = stats.ttest_rel(as_floats(sample_one),
                                          as_floats(sample_two),
                                          axis=args.axis)
                cols.append(t)
                cols.append(prob)
            elif test == "mood":
                z, p_value = stats.mood(as_floats(sample_one),
                                        as_floats(sample_two),
                                        axis=args.axis)
                cols.append(z)
                cols.append(p_value)
            elif test == "shapiro":
                W, p_value, a = stats.shapiro(as_floats(sample_one),
                                              as_floats(sample_two),
                                              args.reta)
                cols.append(W)
                cols.append(p_value)
                for value in a:
                    cols.append(value)
            elif test == "kendalltau":
                k, p_value = stats.kendalltau(
                    as_floats(sample_one),
                    as_floats(sample_two),
                    initial_lexsort=args.initial_lexsort,
                )
                cols.append(k)
                cols.append(p_value)
            elif test == "entropy":
                s = stats.entropy(as_floats(sample_one),
                                  as_floats(sample_two),
                                  base=args.base)
                cols.append(s)
            elif test == "spearmanr":
                if has_sample_two:
                    rho, p_value = stats.spearmanr(as_floats(sample_one),
                                                   as_floats(sample_two))
                else:
                    rho, p_value = stats.spearmanr(as_floats(sample_one))
                cols.append(rho)
                cols.append(p_value)
            elif test == "wilcoxon":
                if has_sample_two:
                    T, p_value = stats.wilcoxon(
                        as_floats(sample_one),
                        as_floats(sample_two),
                        zero_method=args.zero_method,
                        correction=args.correction,
                    )
                else:
                    T, p_value = stats.wilcoxon(
                        as_floats(sample_one),
                        zero_method=args.zero_method,
                        correction=args.correction,
                    )
                cols.append(T)
                cols.append(p_value)
            elif test == "chisquare":
                if has_sample_two:
                    rho, p_value = stats.chisquare(as_floats(sample_one),
                                                   as_floats(sample_two),
                                                   ddof=args.ddof)
                else:
                    rho, p_value = stats.chisquare(as_floats(sample_one),
                                                   ddof=args.ddof)
                cols.append(rho)
                cols.append(p_value)
            elif test == "power_divergence":
                if has_sample_two:
                    stat, p_value = stats.power_divergence(
                        as_floats(sample_one),
                        as_floats(sample_two),
                        ddof=args.ddof,
                        lambda_=args.lambda_,
                    )
                else:
                    stat, p_value = stats.power_divergence(
                        as_floats(sample_one),
                        ddof=args.ddof,
                        lambda_=args.lambda_)
                cols.append(stat)
                cols.append(p_value)
            elif test == "theilslopes":
                if has_sample_two:
                    mpe, met, lo, up = stats.theilslopes(
                        as_floats(sample_one),
                        as_floats(sample_two),
                        alpha=args.alpha)
                else:
                    mpe, met, lo, up = stats.theilslopes(
                        as_floats(sample_one),
                        alpha=args.alpha)
                cols.append(mpe)
                cols.append(met)
                cols.append(lo)
                cols.append(up)
            elif test == "combine_pvalues":
                if has_sample_two:
                    stat, p_value = stats.combine_pvalues(
                        as_floats(sample_one),
                        method=args.med,
                        weights=as_floats(sample_two),
                    )
                else:
                    stat, p_value = stats.combine_pvalues(
                        as_floats(sample_one),
                        method=args.med)
                cols.append(stat)
                cols.append(p_value)
            # The remaining routines work on the grouped samples and so
            # require --sample_cols to have been given.
            elif test == "obrientransform":
                ob = stats.obrientransform(*b_samples)
                for row in ob:
                    cols.append(",".join(map(str, row)))
            elif test == "f_oneway":
                f_value, p_value = stats.f_oneway(*b_samples)
                cols.append(f_value)
                cols.append(p_value)
            elif test == "kruskal":
                h, p_value = stats.kruskal(*b_samples)
                cols.append(h)
                cols.append(p_value)
            elif test == "friedmanchisquare":
                fr, p_value = stats.friedmanchisquare(*b_samples)
                cols.append(fr)
                cols.append(p_value)
            elif test == "fligner":
                xsq, p_value = stats.fligner(
                    *b_samples,
                    center=args.center,
                    proportiontocut=args.proportiontocut)
                cols.append(xsq)
                cols.append(p_value)
            elif test == "bartlett":
                T, p_value = stats.bartlett(*b_samples)
                cols.append(T)
                cols.append(p_value)
            elif test == "levene":
                w, p_value = stats.levene(
                    *b_samples,
                    center=args.center,
                    proportiontocut=args.proportiontocut)
                cols.append(w)
                cols.append(p_value)
            elif test == "median_test":
                stat, p_value, m, table = stats.median_test(
                    *b_samples,
                    ties=args.ties,
                    correction=args.correction,
                    lambda_=args.lambda_)
                cols.append(stat)
                cols.append(p_value)
                cols.append(m)
                # The table is emitted both whole and row by row, exactly
                # as the output format has always been.
                cols.append(table)
                for row in table:
                    cols.append(",".join(map(str, row)))
            outfile.write("%s\n" % "\t".join(map(str, cols)))
def _as_floats(values):
    """Return ``values`` converted to a new list of floats.

    A concrete list is built instead of a lazy ``map`` object so that the
    result can be handed to array-consuming functions on Python 3 as well.
    """
    return [float(value) for value in values]


def main():
    """Apply one ``scipy.stats`` routine to every row of a tab-separated file.

    Command-line arguments are read with ``argparse``.  For each line of
    ``--infile`` the columns named by ``--sample_one_cols`` /
    ``--sample_two_cols`` (1-based, comma separated) and/or the column groups
    in ``--sample_cols`` (groups separated by ``;``) are extracted, the routine
    selected by ``--test_id`` is called on them, and its results are appended
    as extra columns to the row, which is then written to ``--outfile``.

    Rows are copied unchanged when ``--test_id`` is missing or does not name
    one of the supported routines.

    For the trimmed/ranged routines, ``--mf 0 --nf 0`` means "no limits";
    for ``boxcox``, ``--imbda 0`` means "estimate lambda".
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True, help="Tabular file.")
    parser.add_argument("-o", "--outfile", required=True, help="Path to the output file.")
    parser.add_argument("--sample_one_cols", help="Comma-separated 1-based column numbers of the first sample")
    parser.add_argument("--sample_two_cols", help="Comma-separated 1-based column numbers of the second sample")
    parser.add_argument(
        "--sample_cols", help="Comma-separated column numbers of several samples, separate arrays using ;"
    )
    parser.add_argument("--test_id", help="statistical test method")
    parser.add_argument(
        "--mwu_use_continuity",
        action="store_true",
        default=False,
        help="Whether a continuity correction (1/2.) should be taken into account.",
    )
    parser.add_argument(
        "--equal_var",
        action="store_true",
        default=False,
        help="If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.",
    )
    parser.add_argument(
        "--reta", action="store_true", default=False, help="Whether or not to return the internally computed a values."
    )
    parser.add_argument("--fisher", action="store_true", default=False, help="if true then Fisher definition is used")
    parser.add_argument(
        "--bias",
        action="store_true",
        default=False,
        help="if false,then the calculations are corrected for statistical bias",
    )
    parser.add_argument("--inclusive1", action="store_true", default=False, help="if false,lower_limit will be ignored")
    parser.add_argument(
        "--inclusive2", action="store_true", default=False, help="if false,higher_limit will be ignored"
    )
    parser.add_argument("--inclusive", action="store_true", default=False, help="if false,limit will be ignored")
    parser.add_argument(
        "--printextras",
        action="store_true",
        default=False,
        help="If True, if there are extra points a warning is raised saying how many of those points there are",
    )
    # The default must be the boolean False: the string "False" is truthy.
    parser.add_argument(
        "--initial_lexsort",
        action="store_true",
        default=False,
        help="Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.",
    )
    parser.add_argument("--correction", action="store_true", default=False, help="continuity correction ")
    parser.add_argument(
        "--axis",
        type=int,
        default=0,
        help="Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)",
    )
    parser.add_argument(
        "--n",
        type=int,
        default=0,
        help="the number of trials. This is ignored if x gives both the number of successes and failures",
    )
    parser.add_argument("--b", type=int, default=0, help="The number of bins to use for the histogram")
    parser.add_argument("--N", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--ddof", type=int, default=0, help="Degrees of freedom correction")
    parser.add_argument("--score", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--m", type=float, default=0.0, help="limits")
    parser.add_argument("--mf", type=float, default=2.0, help="lower limit")
    parser.add_argument("--nf", type=float, default=99.9, help="higher_limit")
    parser.add_argument(
        "--p",
        type=float,
        default=0.5,
        help="The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5",
    )
    parser.add_argument("--alpha", type=float, default=0.9, help="probability")
    parser.add_argument("--new", type=float, default=0.0, help="Value to put in place of values in a outside of bounds")
    parser.add_argument(
        "--proportiontocut",
        type=float,
        default=0.0,
        help="Proportion (in range 0-1) of total data set to trim of each end.",
    )
    parser.add_argument(
        "--lambda_",
        type=float,
        default=1.0,
        help="lambda_ gives the power in the Cressie-Read power divergence statistic",
    )
    parser.add_argument(
        "--imbda",
        type=float,
        default=0,
        help="If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.",
    )
    parser.add_argument("--base", type=float, default=1.6, help="The logarithmic base to use, defaults to e")
    parser.add_argument("--dtype", help="dtype")
    parser.add_argument("--med", help="med")
    parser.add_argument("--cdf", help="cdf")
    parser.add_argument("--zero_method", help="zero_method options")
    parser.add_argument("--dist", help="dist options")
    parser.add_argument("--ties", help="ties options")
    parser.add_argument("--alternative", help="alternative options")
    parser.add_argument("--mode", help="mode options")
    parser.add_argument("--method", help="method options")
    parser.add_argument("--md", help="md options")
    parser.add_argument("--center", help="center options")
    parser.add_argument("--kind", help="kind options")
    parser.add_argument("--tail", help="tail options")
    parser.add_argument("--interpolation", help="interpolation options")
    parser.add_argument("--statistic", help="statistic options")

    args = parser.parse_args()

    # A missing --test_id behaves like an unknown one: rows are copied as is.
    test_id = (args.test_id or "").strip()
    nf = args.nf
    mf = args.mf
    imbda = args.imbda
    inclusive1 = args.inclusive1
    inclusive2 = args.inclusive2

    # Value comparison, not identity: the limits are floats, so ``is 0`` could
    # never be true for them.
    no_limits = nf == 0 and mf == 0

    # Column groups for the multi-sample tests.  Real lists are required:
    # they are reused for every input row, and a ``map`` iterator would be
    # exhausted after the first one on Python 3.
    sample_groups = None
    if args.sample_cols is not None:
        sample_groups = [[int(index) for index in group.split(",")] for group in args.sample_cols.split(";")]

    sample_one_idx = []
    if args.sample_one_cols is not None:
        sample_one_idx = [int(index) - 1 for index in args.sample_one_cols.split(",")]

    has_sample_two = args.sample_two_cols is not None
    sample_two_idx = []
    if has_sample_two:
        sample_two_idx = [int(index) - 1 for index in args.sample_two_cols.split(",")]

    # Context managers close both files even when a statistic raises.
    with open(args.outfile, "w+") as outfile, open(args.infile) as infile:
        for line in infile:
            cols = line.strip().split("\t")
            # None makes the multi-sample tests fail loudly (instead of
            # silently doing nothing) when --sample_cols was not given.
            b_samples = None
            if sample_groups is not None:
                b_samples = columns_to_values(sample_groups, line)
            # Raw (string) cell values; converted to floats at the call sites.
            sample_one = [cols[index] for index in sample_one_idx]
            sample_two = [cols[index] for index in sample_two_idx]

            if test_id == "describe":
                size, min_max, mean, uv, bs, bk = stats.describe(_as_floats(sample_one))
                cols.extend([size, min_max, mean, uv, bs, bk])
            elif test_id == "mode":
                vals, counts = stats.mode(_as_floats(sample_one))
                cols.extend([vals, counts])
            elif test_id == "nanmean":
                cols.append(stats.nanmean(_as_floats(sample_one)))
            elif test_id == "nanmedian":
                cols.append(stats.nanmedian(_as_floats(sample_one)))
            elif test_id == "kurtosistest":
                z_value, p_value = stats.kurtosistest(_as_floats(sample_one))
                cols.extend([z_value, p_value])
            elif test_id == "variation":
                cols.append(stats.variation(_as_floats(sample_one)))
            elif test_id == "itemfreq":
                freq = stats.itemfreq(_as_floats(sample_one))
                for row in freq:
                    cols.append(",".join(map(str, row)))
            elif test_id == "boxcox_llf":
                cols.append(stats.boxcox_llf(imbda, _as_floats(sample_one)))
            elif test_id == "tiecorrect":
                cols.append(stats.tiecorrect(_as_floats(sample_one)))
            elif test_id == "rankdata":
                cols.append(stats.rankdata(_as_floats(sample_one), method=args.md))
            elif test_id == "nanstd":
                cols.append(stats.nanstd(_as_floats(sample_one), bias=args.bias))
            elif test_id == "anderson":
                A2, critical, sig = stats.anderson(_as_floats(sample_one), dist=args.dist)
                cols.append(A2)
                cols.extend(critical)
                # A literal "," column separates critical values from levels.
                cols.append(",")
                cols.extend(sig)
            elif test_id == "binom_test":
                cols.append(stats.binom_test(_as_floats(sample_one), n=args.n, p=args.p))
            elif test_id == "gmean":
                cols.append(stats.gmean(_as_floats(sample_one), dtype=args.dtype))
            elif test_id == "hmean":
                cols.append(stats.hmean(_as_floats(sample_one), dtype=args.dtype))
            elif test_id == "kurtosis":
                k = stats.kurtosis(_as_floats(sample_one), axis=args.axis, fisher=args.fisher, bias=args.bias)
                cols.append(k)
            elif test_id == "moment":
                cols.append(stats.moment(_as_floats(sample_one), n=args.n))
            elif test_id == "normaltest":
                k2, p_value = stats.normaltest(_as_floats(sample_one))
                cols.extend([k2, p_value])
            elif test_id == "skew":
                cols.append(stats.skew(_as_floats(sample_one), bias=args.bias))
            elif test_id == "skewtest":
                z_value, p_value = stats.skewtest(_as_floats(sample_one))
                cols.extend([z_value, p_value])
            elif test_id == "sem":
                cols.append(stats.sem(_as_floats(sample_one), ddof=args.ddof))
            elif test_id == "zscore":
                cols.extend(stats.zscore(_as_floats(sample_one), ddof=args.ddof))
            elif test_id == "signaltonoise":
                cols.append(stats.signaltonoise(_as_floats(sample_one), ddof=args.ddof))
            elif test_id == "percentileofscore":
                p = stats.percentileofscore(_as_floats(sample_one), score=args.score, kind=args.kind)
                cols.append(p)
            elif test_id == "bayes_mvs":
                c_mean, c_var, c_std = stats.bayes_mvs(_as_floats(sample_one), alpha=args.alpha)
                cols.extend([c_mean, c_var, c_std])
            elif test_id == "sigmaclip":
                c, c_low, c_up = stats.sigmaclip(_as_floats(sample_one), low=args.m, high=args.n)
                cols.extend([c, c_low, c_up])
            elif test_id == "kstest":
                d, p_value = stats.kstest(
                    _as_floats(sample_one), cdf=args.cdf, N=args.N, alternative=args.alternative, mode=args.mode
                )
                cols.extend([d, p_value])
            elif test_id == "chi2_contingency":
                chi2, p, dof, ex = stats.chi2_contingency(
                    _as_floats(sample_one), correction=args.correction, lambda_=args.lambda_
                )
                cols.extend([chi2, p, dof, ex])
            elif test_id == "tmean":
                if no_limits:
                    mean = stats.tmean(_as_floats(sample_one))
                else:
                    mean = stats.tmean(_as_floats(sample_one), (mf, nf), (inclusive1, inclusive2))
                cols.append(mean)
            elif test_id == "tmin":
                if mf == 0:
                    trimmed_min = stats.tmin(_as_floats(sample_one))
                else:
                    trimmed_min = stats.tmin(_as_floats(sample_one), lowerlimit=mf, inclusive=args.inclusive)
                cols.append(trimmed_min)
            elif test_id == "tmax":
                if nf == 0:
                    trimmed_max = stats.tmax(_as_floats(sample_one))
                else:
                    trimmed_max = stats.tmax(_as_floats(sample_one), upperlimit=nf, inclusive=args.inclusive)
                cols.append(trimmed_max)
            elif test_id == "tvar":
                if no_limits:
                    var = stats.tvar(_as_floats(sample_one))
                else:
                    var = stats.tvar(_as_floats(sample_one), (mf, nf), (inclusive1, inclusive2))
                cols.append(var)
            elif test_id == "tstd":
                if no_limits:
                    std = stats.tstd(_as_floats(sample_one))
                else:
                    std = stats.tstd(_as_floats(sample_one), (mf, nf), (inclusive1, inclusive2))
                cols.append(std)
            elif test_id == "tsem":
                if no_limits:
                    s = stats.tsem(_as_floats(sample_one))
                else:
                    s = stats.tsem(_as_floats(sample_one), (mf, nf), (inclusive1, inclusive2))
                cols.append(s)
            elif test_id == "scoreatpercentile":
                if no_limits:
                    s = stats.scoreatpercentile(
                        _as_floats(sample_one), _as_floats(sample_two), interpolation_method=args.interpolation
                    )
                else:
                    s = stats.scoreatpercentile(
                        _as_floats(sample_one),
                        _as_floats(sample_two),
                        (mf, nf),
                        interpolation_method=args.interpolation,
                    )
                cols.extend(s)
            elif test_id == "relfreq":
                if no_limits:
                    rel, low_range, binsize, ex = stats.relfreq(_as_floats(sample_one), args.b)
                else:
                    rel, low_range, binsize, ex = stats.relfreq(_as_floats(sample_one), args.b, (mf, nf))
                cols.extend(rel)
                cols.extend([low_range, binsize, ex])
            elif test_id == "binned_statistic":
                if no_limits:
                    st, b_edge, b_n = stats.binned_statistic(
                        _as_floats(sample_one), _as_floats(sample_two), statistic=args.statistic, bins=args.b
                    )
                else:
                    st, b_edge, b_n = stats.binned_statistic(
                        _as_floats(sample_one),
                        _as_floats(sample_two),
                        statistic=args.statistic,
                        bins=args.b,
                        range=(mf, nf),
                    )
                cols.extend([st, b_edge, b_n])
            elif test_id == "threshold":
                if no_limits:
                    o = stats.threshold(_as_floats(sample_one), newval=args.new)
                else:
                    o = stats.threshold(_as_floats(sample_one), mf, nf, newval=args.new)
                cols.extend(o)
            elif test_id == "trimboth":
                cols.extend(stats.trimboth(_as_floats(sample_one), proportiontocut=args.proportiontocut))
            elif test_id == "trim1":
                t1 = stats.trim1(_as_floats(sample_one), proportiontocut=args.proportiontocut, tail=args.tail)
                cols.extend(t1)
            elif test_id == "histogram":
                if no_limits:
                    hi, low_range, binsize, ex = stats.histogram(_as_floats(sample_one), args.b)
                else:
                    hi, low_range, binsize, ex = stats.histogram(_as_floats(sample_one), args.b, (mf, nf))
                cols.extend([hi, low_range, binsize, ex])
            elif test_id == "cumfreq":
                if no_limits:
                    cum, low_range, binsize, ex = stats.cumfreq(_as_floats(sample_one), args.b)
                else:
                    cum, low_range, binsize, ex = stats.cumfreq(_as_floats(sample_one), args.b, (mf, nf))
                cols.extend([cum, low_range, binsize, ex])
            elif test_id == "boxcox_normmax":
                if no_limits:
                    ma = stats.boxcox_normmax(_as_floats(sample_one))
                else:
                    ma = stats.boxcox_normmax(_as_floats(sample_one), (mf, nf), method=args.method)
                cols.append(ma)
            elif test_id == "boxcox":
                # 0 is the "not specified" marker: estimate lambda in that case.
                if imbda == 0:
                    box, ma, ci = stats.boxcox(_as_floats(sample_one), alpha=args.alpha)
                    cols.extend([box, ma, ci])
                else:
                    box = stats.boxcox(_as_floats(sample_one), imbda, alpha=args.alpha)
                    cols.append(box)
            elif test_id == "histogram2":
                cols.extend(stats.histogram2(_as_floats(sample_one), _as_floats(sample_two)))
            elif test_id == "ranksums":
                z_statistic, p_value = stats.ranksums(_as_floats(sample_one), _as_floats(sample_two))
                cols.extend([z_statistic, p_value])
            elif test_id == "ttest_1samp":
                t, prob = stats.ttest_1samp(_as_floats(sample_one), _as_floats(sample_two))
                cols.extend(t)
                cols.extend(prob)
            elif test_id == "ansari":
                AB, p_value = stats.ansari(_as_floats(sample_one), _as_floats(sample_two))
                cols.extend([AB, p_value])
            elif test_id == "linregress":
                slope, intercept, r_value, p_value, stderr = stats.linregress(
                    _as_floats(sample_one), _as_floats(sample_two)
                )
                cols.extend([slope, intercept, r_value, p_value, stderr])
            elif test_id == "pearsonr":
                cor, p_value = stats.pearsonr(_as_floats(sample_one), _as_floats(sample_two))
                cols.extend([cor, p_value])
            elif test_id == "pointbiserialr":
                r, p_value = stats.pointbiserialr(_as_floats(sample_one), _as_floats(sample_two))
                cols.extend([r, p_value])
            elif test_id == "ks_2samp":
                d, p_value = stats.ks_2samp(_as_floats(sample_one), _as_floats(sample_two))
                cols.extend([d, p_value])
            elif test_id == "mannwhitneyu":
                mw_stats_u, p_value = stats.mannwhitneyu(
                    _as_floats(sample_one), _as_floats(sample_two), use_continuity=args.mwu_use_continuity
                )
                cols.extend([mw_stats_u, p_value])
            elif test_id == "zmap":
                cols.extend(stats.zmap(_as_floats(sample_one), _as_floats(sample_two), ddof=args.ddof))
            elif test_id == "ttest_ind":
                t_statistic, p_value = stats.ttest_ind(
                    _as_floats(sample_one), _as_floats(sample_two), equal_var=args.equal_var
                )
                cols.extend([t_statistic, p_value])
            elif test_id == "ttest_rel":
                t, prob = stats.ttest_rel(_as_floats(sample_one), _as_floats(sample_two), axis=args.axis)
                cols.extend([t, prob])
            elif test_id == "mood":
                z, p_value = stats.mood(_as_floats(sample_one), _as_floats(sample_two), axis=args.axis)
                cols.extend([z, p_value])
            elif test_id == "shapiro":
                W, p_value, a = stats.shapiro(_as_floats(sample_one), _as_floats(sample_two), args.reta)
                cols.extend([W, p_value])
                cols.extend(a)
            elif test_id == "kendalltau":
                k, p_value = stats.kendalltau(
                    _as_floats(sample_one), _as_floats(sample_two), initial_lexsort=args.initial_lexsort
                )
                cols.extend([k, p_value])
            elif test_id == "entropy":
                cols.append(stats.entropy(_as_floats(sample_one), _as_floats(sample_two), base=args.base))
            elif test_id == "spearmanr":
                if has_sample_two:
                    rho, p_value = stats.spearmanr(_as_floats(sample_one), _as_floats(sample_two))
                else:
                    rho, p_value = stats.spearmanr(_as_floats(sample_one))
                cols.extend([rho, p_value])
            elif test_id == "wilcoxon":
                if has_sample_two:
                    T, p_value = stats.wilcoxon(
                        _as_floats(sample_one),
                        _as_floats(sample_two),
                        zero_method=args.zero_method,
                        correction=args.correction,
                    )
                else:
                    T, p_value = stats.wilcoxon(
                        _as_floats(sample_one), zero_method=args.zero_method, correction=args.correction
                    )
                cols.extend([T, p_value])
            elif test_id == "chisquare":
                if has_sample_two:
                    rho, p_value = stats.chisquare(_as_floats(sample_one), _as_floats(sample_two), ddof=args.ddof)
                else:
                    rho, p_value = stats.chisquare(_as_floats(sample_one), ddof=args.ddof)
                cols.extend([rho, p_value])
            elif test_id == "power_divergence":
                if has_sample_two:
                    stat, p_value = stats.power_divergence(
                        _as_floats(sample_one), _as_floats(sample_two), ddof=args.ddof, lambda_=args.lambda_
                    )
                else:
                    stat, p_value = stats.power_divergence(
                        _as_floats(sample_one), ddof=args.ddof, lambda_=args.lambda_
                    )
                cols.extend([stat, p_value])
            elif test_id == "theilslopes":
                if has_sample_two:
                    mpe, met, lo, up = stats.theilslopes(
                        _as_floats(sample_one), _as_floats(sample_two), alpha=args.alpha
                    )
                else:
                    mpe, met, lo, up = stats.theilslopes(_as_floats(sample_one), alpha=args.alpha)
                cols.extend([mpe, met, lo, up])
            elif test_id == "combine_pvalues":
                if has_sample_two:
                    stat, p_value = stats.combine_pvalues(
                        _as_floats(sample_one), method=args.med, weights=_as_floats(sample_two)
                    )
                else:
                    stat, p_value = stats.combine_pvalues(_as_floats(sample_one), method=args.med)
                cols.extend([stat, p_value])
            elif test_id == "obrientransform":
                ob = stats.obrientransform(*b_samples)
                for row in ob:
                    cols.append(",".join(map(str, row)))
            elif test_id == "f_oneway":
                f_value, p_value = stats.f_oneway(*b_samples)
                cols.extend([f_value, p_value])
            elif test_id == "kruskal":
                h, p_value = stats.kruskal(*b_samples)
                cols.extend([h, p_value])
            elif test_id == "friedmanchisquare":
                fr, p_value = stats.friedmanchisquare(*b_samples)
                cols.extend([fr, p_value])
            elif test_id == "fligner":
                xsq, p_value = stats.fligner(*b_samples, center=args.center, proportiontocut=args.proportiontocut)
                cols.extend([xsq, p_value])
            elif test_id == "bartlett":
                T, p_value = stats.bartlett(*b_samples)
                cols.extend([T, p_value])
            elif test_id == "levene":
                w, p_value = stats.levene(*b_samples, center=args.center, proportiontocut=args.proportiontocut)
                cols.extend([w, p_value])
            elif test_id == "median_test":
                stat, p_value, m, table = stats.median_test(
                    *b_samples, ties=args.ties, correction=args.correction, lambda_=args.lambda_
                )
                cols.extend([stat, p_value, m, table])
                # The table is emitted whole (above) and again row by row.
                for row in table:
                    cols.append(",".join(map(str, row)))

            outfile.write("%s\n" % "\t".join(map(str, cols)))
Exemple #27
0
def test_mood():
    # numbers from R: mood.test in package stats
    x1 = np.arange(5)
    assert_array_almost_equal(stats.mood(x1, x1**2),
                              (-1.3830857299399906, 0.16663858066771478), 11)
 def test_moodTest_zResult(self):
     """Check that ``mood_test`` and ``mood`` yield the same z statistic.

     Both callables are imported outside this excerpt; presumably ``mood`` is
     ``scipy.stats.mood`` and ``mood_test`` is the implementation under test
     -- TODO confirm against the module imports.
     """
     # Random inputs: 1000 integers drawn from [0, 100) and 1000 normal draws
     # with mean 0 and standard deviation 100.
     # NOTE(review): no seed is set in this method, so the data differ per run.
     data_1 = np.random.randint(0, 100, 1000)
     data_2 = np.random.normal(0, 100, 1000)
     z1, p1 = mood_test(data_1, data_2)
     z2, p2 = mood(data_1, data_2)
     # Only the z statistics are compared here; p1 and p2 are not checked
     # in the lines shown.
     assert pytest.approx(z2) == z1