def test_mood_3d(self):
    shape = (10, 5, 6)
    np.random.seed(1234)
    x1 = np.random.randn(*shape)
    x2 = np.random.randn(*shape)

    for axis in range(3):
        z_vectest, pval_vectest = stats.mood(x1, x2, axis=axis)
        # Tests that result for 3-D arrays is equal to that for the
        # same calculation on a set of 1-D arrays taken from the
        # 3-D array
        axes_idx = ([1, 2], [0, 2], [0, 1])  # the two axes != axis
        for i in range(shape[axes_idx[axis][0]]):
            for j in range(shape[axes_idx[axis][1]]):
                if axis == 0:
                    slice1 = x1[:, i, j]
                    slice2 = x2[:, i, j]
                elif axis == 1:
                    slice1 = x1[i, :, j]
                    slice2 = x2[i, :, j]
                else:
                    slice1 = x1[i, j, :]
                    slice2 = x2[i, j, :]

                assert_array_almost_equal(
                    [z_vectest[i, j], pval_vectest[i, j]],
                    stats.mood(slice1, slice2))

def test_mood_order_of_args(self):
    # z should change sign when the order of arguments changes, pvalue
    # should not change
    np.random.seed(1234)
    x1 = np.random.randn(10, 1)
    x2 = np.random.randn(15, 1)
    z1, p1 = stats.mood(x1, x2)
    z2, p2 = stats.mood(x2, x1)
    assert_array_almost_equal([z1, p1], [-z2, p2])

def test_mood_with_axis_none(self):
    # Test with axis=None, compare with results from R
    x1 = [-0.626453810742332, 0.183643324222082, -0.835628612410047,
          1.59528080213779, 0.329507771815361, -0.820468384118015,
          0.487429052428485, 0.738324705129217, 0.575781351653492,
          -0.305388387156356, 1.51178116845085, 0.389843236411431,
          -0.621240580541804, -2.2146998871775, 1.12493091814311,
          -0.0449336090152309, -0.0161902630989461, 0.943836210685299,
          0.821221195098089, 0.593901321217509]

    x2 = [-0.896914546624981, 0.184849184646742, 1.58784533120882,
          -1.13037567424629, -0.0802517565509893, 0.132420284381094,
          0.707954729271733, -0.23969802417184, 1.98447393665293,
          -0.138787012119665, 0.417650750792556, 0.981752777463662,
          -0.392695355503813, -1.03966897694891, 1.78222896030858,
          -2.31106908460517, 0.878604580921265, 0.035806718015226,
          1.01282869212708, 0.432265154539617, 2.09081920524915,
          -1.19992581964387, 1.58963820029007, 1.95465164222325,
          0.00493777682814261, -2.45170638784613, 0.477237302613617,
          -0.596558168631403, 0.792203270299649, 0.289636710177348]

    x1 = np.array(x1)
    x2 = np.array(x2)
    x1.shape = (10, 2)
    x2.shape = (15, 2)
    assert_array_almost_equal(stats.mood(x1, x2, axis=None),
                              [-1.31716607555, 0.18778296257])

def nonparametric_check_for_d_similarity(df1, df2, alpha=0.01):
    common_features = set(df1.columns) & set(df2.columns)
    features_stats = []
    for col in common_features:
        # H0 = same central parameter
        delta_test, delta_pvalue = stats.mannwhitneyu(df1[col], df2[col])
        if delta_pvalue > alpha:
            delta = 'Same central parameter'
        else:
            delta = 'Different central parameter'

        # H0 = equality of the scale parameters
        scale1_test, scale1_pval = stats.ansari(df1[col], df2[col])
        if scale1_pval > alpha:
            scale1 = 'Same scale AnsariTest'
        else:
            scale1 = 'Different scale AnsariTest'

        # H0 = equality of the scale parameters
        scale2_test, scale2_pval = stats.mood(df1[col], df2[col])
        if scale2_pval > alpha:
            scale2 = 'Same scale MoodTest'
        else:
            scale2 = 'Different scale MoodTest'

        features_stats.append([col, delta_pvalue, delta,
                               scale1_pval, scale1,
                               scale2_pval, scale2])

    features_stats = pd.DataFrame(features_stats)
    features_stats.columns = ['col_name', 'delta_pval', 'delta_status',
                              'scale1_pval', 'scale1_status',
                              'scale2_pval', 'scale2_status']
    return features_stats

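# A minimal usage sketch for nonparametric_check_for_d_similarity above.
# The DataFrames, column names, and seed below are hypothetical illustrations,
# not data from the original source.
import numpy as np
import pandas as pd
from scipy import stats

rng = np.random.default_rng(0)
df_a = pd.DataFrame({'x': rng.normal(0, 1, 200), 'y': rng.normal(5, 2, 200)})
df_b = pd.DataFrame({'x': rng.normal(0, 1, 200), 'y': rng.normal(5, 4, 200)})
report = nonparametric_check_for_d_similarity(df_a, df_b, alpha=0.01)
print(report)  # one row per shared column with the three test verdicts
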
def test_mood_2d(self):
    # Test if the results of mood test in 2-D case are consistent with the
    # R result for the same inputs.  Numbers from R mood.test().
    ny = 5
    np.random.seed(1234)
    x1 = np.random.randn(10, ny)
    x2 = np.random.randn(15, ny)
    z_vectest, pval_vectest = stats.mood(x1, x2)

    for j in range(ny):
        assert_array_almost_equal([z_vectest[j], pval_vectest[j]],
                                  stats.mood(x1[:, j], x2[:, j]))

    # inverse order of dimensions
    x1 = x1.transpose()
    x2 = x2.transpose()
    z_vectest, pval_vectest = stats.mood(x1, x2, axis=1)

    for i in range(ny):
        # check axis handling is self consistent
        assert_array_almost_equal([z_vectest[i], pval_vectest[i]],
                                  stats.mood(x1[i, :], x2[i, :]))

def test_significance_tests(normal_obs, normal_obs_control):
    treatment = ab.sample(normal_obs)
    control = ab.sample(normal_obs_control)

    res = treatment.t_test(control, equal_var=True)
    res_expected = ttest_ind(normal_obs, normal_obs_control, equal_var=True)
    assert res.p_value == res_expected.pvalue
    assert res.statistic == res_expected.statistic

    res = treatment.t_test(control, equal_var=False)
    res_expected = ttest_ind(normal_obs, normal_obs_control, equal_var=False)
    assert res.p_value == res_expected.pvalue
    assert res.statistic == res_expected.statistic

    res = treatment.t_test_1samp(101)
    res_expected = ttest_1samp(normal_obs, 101)
    assert res.p_value == res_expected.pvalue
    assert res.statistic == res_expected.statistic

    res = treatment.mann_whitney_u_test(control)
    res_expected = mannwhitneyu(normal_obs_control, normal_obs,
                                alternative='two-sided')
    assert res.p_value == pytest.approx(res_expected.pvalue, 1e-6)
    assert res.u_statistic == res_expected.statistic

    res = treatment.shapiro_test()
    res_expected = shapiro(normal_obs)
    assert res.statistic == res_expected[0]
    assert res.p_value == res_expected[1]

    res = treatment.median_test(control)
    res_expected = median_test(normal_obs, normal_obs_control)
    assert res.p_value == res_expected[1]
    assert res.statistic == res_expected[0]
    assert res.grand_median == res_expected[2]

    res = treatment.levene_test(control)
    res_expected = levene(normal_obs, normal_obs_control)
    assert res.p_value == res_expected.pvalue
    assert res.statistic == res_expected.statistic

    res = treatment.mood_test(control)
    res_expected = mood(normal_obs, normal_obs_control)
    assert res.p_value == res_expected[1]
    assert res.statistic == res_expected[0]

def vector_hypotheses(a, b):
    dict_stat = {}
    dict_pval = {}

    pea = pearsonr(a, b)
    dict_stat["pearsonr"], dict_pval["pearsonr"] = pea[0], pea[1]
    ran = ranksums(a, b)
    dict_stat["ranksums"], dict_pval["ranksums"] = ran[0], ran[1]
    moo = mood(a, b)
    dict_stat["mood"], dict_pval["mood"] = moo[0], moo[1]
    fli = fligner(a, b)
    dict_stat["fligner"], dict_pval["fligner"] = fli[0], fli[1]
    ans = ansari(a, b)
    dict_stat["ansari"], dict_pval["ansari"] = ans[0], ans[1]
    bar = bartlett(a, b)
    dict_stat["bartlett"], dict_pval["bartlett"] = bar[0], bar[1]
    lev = levene(a, b)
    dict_stat["levene"], dict_pval["levene"] = lev[0], lev[1]
    man = mannwhitneyu(a, b)
    dict_stat["mannwhitneyu"], dict_pval["mannwhitneyu"] = man[0], man[1]

    return dict_stat, dict_pval

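# A short, self-contained usage sketch for vector_hypotheses.  The random
# inputs below are illustrative assumptions, not data from the original source;
# the imports match the bare test names used inside the function.
import numpy as np
from scipy.stats import (pearsonr, ranksums, mood, fligner, ansari,
                         bartlett, levene, mannwhitneyu)

rng = np.random.default_rng(42)
a = rng.normal(loc=0.0, scale=1.0, size=500)
b = rng.normal(loc=0.1, scale=1.5, size=500)
stats_by_test, pvals_by_test = vector_hypotheses(a, b)
for test_name, p in pvals_by_test.items():
    print(f"{test_name}: p={p:.4f}")
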
def stream(data, Test, thresholds):
    # start monitoring at time = 20
    i = 20
    while i < len(data) + 1:
        # increment window to be tested
        window = data[:i]
        arg, D = Test(window)
        # test at ith threshold value or at the last loaded threshold value
        if i > len(thresholds):
            cp = changedetected(D, thresholds[-1])
        else:
            cp = changedetected(D, thresholds[i - 1])
        if cp == True:
            change = ''
            if Test == Lepage:
                # If test is Lepage, test to see what change was detected
                # (either location or scale shift).
                # If the p value from e.g. the Mann-Whitney test is 100 times
                # smaller than that from the Mood test, we declare a location
                # shift.  If this is not met, then further investigation is
                # needed (tested on synthetic data; the value of 100 seemed
                # appropriate).
                p_val_Mood = mood(window[:arg + 1], window[arg + 1:])[1]
                p_val_MW = mannwhitneyu(window[:arg + 1], window[arg + 1:])[1]
                if 0.01 * p_val_Mood > p_val_MW:
                    change = "Location"
                elif 0.01 * p_val_MW > p_val_Mood:
                    change = "Scale"
                else:
                    change = 'Needs further investigation'
            else:
                pass
            # return the location of the cp, the index at which it was
            # detected, True (cp was declared), and what type of change
            return arg + 1, i - 1, True, change
        else:
            # if no change, increase the window size by 1
            i += 1
    # return this if no cp is found and no more datapoints to process
    return None, None, False, None

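# stream() above relies on a changedetected() helper and a Lepage test
# function that are not shown in the original snippet.  A minimal,
# hypothetical sketch of the threshold check (an assumption about its
# semantics, not the authors' implementation):
def changedetected(D, threshold):
    # Declare a change point when the monitored statistic exceeds the
    # threshold for the current window size.
    return D > threshold
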
def __compute_mood_statistic_ts(self, ts):
    statistics = [
        stat.mood(ts[t + 1:], ts[:t + 1])[0]
        for t in np.arange(len(ts) - 1)
    ]
    return pd.Series(statistics).dropna().reset_index(drop=True)

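# Illustration of the split-point scan performed by the method above (hedged:
# the data are hypothetical; `stat` is assumed to alias scipy.stats and
# `pd`/`np` to pandas/numpy, as in the method's own body).
import numpy as np
import pandas as pd
from scipy import stats as stat

ts = np.concatenate([np.random.normal(0, 1, 100),
                     np.random.normal(0, 3, 100)])  # scale shift at t = 100
mood_stats = pd.Series(
    [stat.mood(ts[t + 1:], ts[:t + 1])[0] for t in np.arange(len(ts) - 1)]
)
print(mood_stats.abs().idxmax())  # candidate change point: largest |z|
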
def mood(pair):
    # Tuple-parameter unpacking (def mood((x, y))) was removed in Python 3,
    # so unpack the (x, y) pair explicitly.
    x, y = pair
    return stats.mood(x, y)

def mood_test(obs, obs_control) -> MoodTestResult:
    """
    :return: MoodTestResult(statistic, p_value)
    """
    res = stats.mood(obs, obs_control)
    return MoodTestResult(statistic=res[0], p_value=res[1])

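# MoodTestResult is not defined in the snippet above.  A minimal sketch of a
# compatible result container and a call, as an assumption rather than the
# library's actual definition (define it before mood_test() when running
# standalone, since the return annotation references it):
from typing import NamedTuple
from scipy import stats


class MoodTestResult(NamedTuple):
    statistic: float
    p_value: float


result = mood_test([1.2, 0.8, 1.1, 0.9, 1.4], [2.5, 0.1, 3.0, -0.7, 1.9])
print(result.statistic, result.p_value)
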
def test_mood(self):
    # numbers from R: mood.test in package stats
    x1 = np.arange(5)
    assert_array_almost_equal(stats.mood(x1, x1**2),
                              (-1.3830857299399906, 0.16663858066771478), 11)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import ttest_ind
import statsmodels.stats.api as sms

GE = pd.read_csv('C:/Users/anivia/Desktop/geDJ.txt', sep="\s+", header=None,
                 names=['date', 'open', 'high', 'low', 'close', 'vol'])
SP = pd.read_csv(
    'https://www.math.ust.hk/~macwyu/MAFS5110_2018-2019/MAFS5110_2018-2019/Chapter_1/sp500.txt',
    sep="\s+")

logreturn_GE = np.diff(np.log(np.array(GE["close"])))
logreturn_sp500 = np.diff(np.log(np.array(SP["close"])))

da2 = pd.concat([pd.DataFrame(logreturn_GE), pd.DataFrame(logreturn_sp500)],
                axis=1)
#da2.columns=['date','open','high','low','close','vol','logreturn_sp500']
#da2.index=da.index[1:]
da2.columns = ["logreturn_GE", "logreturn_sp500"]
da2.boxplot(column=['logreturn_GE', 'logreturn_sp500'])
plt.show()

print(stats.mood(logreturn_sp500, logreturn_GE))
print('H0 can be rejected, the variances are significantly different')
print(ttest_ind(logreturn_sp500, logreturn_GE, equal_var=True))
print('')

cm = sms.CompareMeans(sms.DescrStatsW(logreturn_sp500),
                      sms.DescrStatsW(logreturn_GE))
print(cm.tconfint_diff())

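# The conclusion above is printed unconditionally.  A hedged sketch of making
# the call from the Mood test p-value itself (the 0.05 threshold is an
# assumption, not stated in the original script):
z_stat, p_val = stats.mood(logreturn_sp500, logreturn_GE)
if p_val < 0.05:
    print('Reject H0: the scales (variances) differ significantly')
else:
    print('Fail to reject H0: no significant difference in scale')
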
def main(graph_name):
    G = nx.read_gml(graph_name)
    G = nx.connected_component_subgraphs(G)[0]  # Giant component

    list_one_friend_percent_weight_change = []
    list_all = []
    for node in G.nodes():
        if G.node[node]['label'] == "40155":
            special = node
            for n in G.neighbors(node):
                if G.node[n]['time_in_system'] > 100:
                    list_all.append(float(G.node[n]['percentage_weight_change']))
                    print len(G.neighbors(n))
                    if len(G.neighbors(n)) == 1:
                        list_one_friend_percent_weight_change.append(
                            float(G.node[n]['percentage_weight_change']))
            break

    print "final number of friendless nodes:", len(list_one_friend_percent_weight_change), "av_percent_wc:", numpy.mean(list_one_friend_percent_weight_change), numpy.std(list_one_friend_percent_weight_change)
    print "av_%_wc for all", len(list_all), "neighbors of 40155:", numpy.mean(list_all), numpy.std(list_all)

    # dir=graph_name.split("fr")[0]
    dir = graph_name.split("mas")[0]
    original_name = graph_name.split("data/")[1]
    original_name = original_name.split(".gml")[0]
    dir = dir + "roles/" + original_name

    time_in_system = 100  # minimum time in the system for a user to be included in the statistics
    iterations = 1000  # for bootstrap

    name00 = dir + "bootstrap_statistics_percent_weight_change_one_hops_R6s" + str(time_in_system) + "days_exclude_R6s.dat"

    list_R6s = []  # collect the R6s of the system
    list_R6s_labels = []
    for node in G.nodes():
        if str(G.node[node]['role']) == "R6":
            list_R6s.append(node)
            list_R6s_labels.append(str(G.node[node]['label']))

    list_percentage_wc_one_hop = []
    list_one_hoppers = []
    for node in G.nodes():
        if str(G.node[node]['role']) == "R6":
            for n in G.neighbors(node):
                if str(G.node[n]['role']) != "R6":
                    if int(G.node[n]['time_in_system']) > time_in_system:
                        if n not in list_one_hoppers:
                            list_percentage_wc_one_hop.append(float(G.node[n]['percentage_weight_change']))
                            list_one_hoppers.append(n)

    print "all one-hops:", numpy.mean(list_percentage_wc_one_hop), len(list_percentage_wc_one_hop)

    file0 = open(name00, 'wt')
    print >> file0, "Percentage Weight Change for one-hop-from-R6s\n\n\n", "original:", numpy.mean(list_percentage_wc_one_hop), " set size:", len(list_percentage_wc_one_hop)
    file0.close()

    # R6s in friend_graph_all.gml: 40155, 28688, 45784, 41794, 43020, 47063, 39625, 31954, 40324, 40666
    # R6s in master_adherent_homo.gml: 40155, 41794, 39625, 46487, 31954, 40324, 28688, 45784, 40666
    for excluding in list_R6s_labels:
        list_percentage_wc_one_hop_excluding_oneR6 = []
        list_one_hoppers_excluding_oneR6 = []
        for node in G.nodes():
            if (str(G.node[node]['role']) == "R6") and (str(G.node[node]['label']) != excluding):
                for n in G.neighbors(node):
                    if str(G.node[n]['role']) != "R6":
                        if int(G.node[n]['time_in_system']) > time_in_system:
                            if n not in list_one_hoppers_excluding_oneR6:
                                list_percentage_wc_one_hop_excluding_oneR6.append(float(G.node[n]['percentage_weight_change']))
                                list_one_hoppers_excluding_oneR6.append(n)

        actual_diff = numpy.mean(list_percentage_wc_one_hop) - numpy.mean(list_percentage_wc_one_hop_excluding_oneR6)
        print "\n\ndiff. all one-hops & all but one hub", excluding, ":", actual_diff

        file0 = open(name00, 'at')
        print >> file0, " excluding", excluding, ":", numpy.mean(list_percentage_wc_one_hop_excluding_oneR6), " (diff:", actual_diff, ") set size:", len(list_percentage_wc_one_hop_excluding_oneR6)
        file0.close()

        ###############################################
        # bootstrap routine, sampling with replacement:
        ###############################################
        list_all = []
        for i in list_percentage_wc_one_hop:
            list_all.append(i)
        for i in list_percentage_wc_one_hop_excluding_oneR6:
            list_all.append(i)

        list_synth_diff = []
        for iter in range(iterations):
            list_synth_one_hop = sample_with_replacement(list_all, len(list_percentage_wc_one_hop))
            list_synth_one_hop_excluding_oneR6s = sample_with_replacement(list_all, len(list_percentage_wc_one_hop_excluding_oneR6))
            mean1 = numpy.mean(list_synth_one_hop)
            mean2 = numpy.mean(list_synth_one_hop_excluding_oneR6s)
            list_synth_diff.append(mean1 - mean2)

        zscore = (actual_diff - numpy.mean(list_synth_diff)) / numpy.std(list_synth_diff)
        print "mean_over_synth_realizations (with repl.):", numpy.mean(list_synth_diff), "zscore:", zscore

        file0 = open(name00, 'at')
        print >> file0, " z-score (sampling with replacement)", zscore
        file0.close()

        # bootstrap routine, sampling without replacement:
        list_synth_diff = []
        for iter in range(iterations):
            list_synth_one_hop = random.sample(list_all, len(list_percentage_wc_one_hop))
            list_synth_one_hop_excluding_oneR6s = []  # start from a fresh complement sample each iteration
            for i in list_all:
                if i not in list_synth_one_hop:
                    list_synth_one_hop_excluding_oneR6s.append(i)
            mean1 = numpy.mean(list_synth_one_hop)
            mean2 = numpy.mean(list_synth_one_hop_excluding_oneR6s)
            list_synth_diff.append(mean1 - mean2)

        zscore = (actual_diff - numpy.mean(list_synth_diff)) / numpy.std(list_synth_diff)
        print "mean_over_synth_realizations (without repl.):", numpy.mean(list_synth_diff), "zscore:", zscore

        # mood test
        mood = stats.mood(list_percentage_wc_one_hop, list_percentage_wc_one_hop_excluding_oneR6)
        print "mood test:", mood

        # t test:
        ttest = stats.ttest_ind(list_percentage_wc_one_hop, list_percentage_wc_one_hop_excluding_oneR6, axis=0)
        print "t-test:", ttest

        # wilcoxon test: SET SIZES MUST BE THE SAME
        # wilcoxon = stats.wilcoxon(list_percentage_wc_one_hop, list_percentage_wc_one_hop_excluding_oneR6)
        # print "wilcoxon test:", wilcoxon

        file0 = open(name00, 'at')
        print >> file0, " z-score (sampling without replacement)", zscore, "\n mood-test:", mood  # , "\n wilcoxon-test:", wilcoxon, "\n"
        file0.close()

    file0 = open(name00, 'at')
    print >> file0, "\n\n(number iterations for the bootstrap:", iterations, ")"
    file0.close()

def main(graph_name): H = nx.read_gml(graph_name) for node in H.nodes(): # i remove self loops if node in H.neighbors(node): if len(H.neighbors(node)) > 1: H.remove_edge(node, node) else: H.remove_node(node) # for node in H.nodes(): # if H.node[node]['weigh_ins'] <5: #Adherent filter # H.remove_node(node) # print node, "is going down" G = nx.connected_component_subgraphs(H)[0] # Giant component print "size of the GC:", len( G.nodes()) #, "after filtering for adherence!!" #dir=graph_name.split("full_")[0] #dir=graph_name.split("master")[0] #dir=graph_name.split("method3_")[0] #dir=graph_name.split("method3_adh")[0] dir = graph_name.split("friends")[0] dir = dir + "roles/" time_in_system = 50 #minimum amount of time in the sytem for a user to be included in the statistics #name=graph_name.split('data/')[1] #name=graph_name.split('method3_50/interim/')[1] #name=graph_name.split('network_all_users/')[1] name = graph_name.split('5_points_network_2010/data/')[1] name = name.split('.gml')[0] name0 = dir + name + "_overlap_R6s_averages_" + str( time_in_system) + "days_exclude_R6s.dat" file0 = open(name0, 'wt') file0.close() contador = 0 name12 = dir + name + "_slopes_for_the_fits_average_weight_change.dat" file = open(name12, 'wt') file.close() ####for the Isolated Clusters: list_GC_nodes = [] for n in G.nodes(): list_GC_nodes.append(n) # print G.node[n]['percentage_weight_change'] # print "# users GC:",len(list_GC_nodes),"total:",len(H.nodes()) list_weight_changes_not_GC = [] for n in H.nodes(): if n not in list_GC_nodes: #print n,"not in GC" list_weight_changes_not_GC.append( float(H.node[n]['percentage_weight_change'])) #print "# users not in GC:",len(list_weight_changes_not_GC) # who="not_GC" #Nbins=18 #histograma(list_weight_changes_not_GC,Nbins,dir,name,who) ########################### list_R6s = [] # collect the R6 of the system list_R6s_label = [] list_R6s_percent_weight_change = [] for node in G.nodes(): if str(G.node[node]['role']) == "R6": list_R6s.append(node) list_R6s_label.append(G.node[node]['label']) list_R6s_percent_weight_change.append( float(G.node[node]['percentage_weight_change'])) name00 = dir + name + "R6s_and_top_tens_averages_" + str( time_in_system) + "days_exclude_R6s.dat" file0 = open(name00, 'at') print >> file0, "R6s", numpy.mean( list_R6s_percent_weight_change), numpy.std( list_R6s_percent_weight_change) file0.close() # print "\n\n R6s:\n" # for i in list_R6s_label: # print i # studying the possible cumulative effect of more than one R6 on the population: for node in G.nodes(): cont = 0 for n in G.neighbors(node): if str(G.node[n]['role']) == "R6": cont += 1 G.node[node]["R6_overlap"] = int(cont) ##### weight change for people not connected to any R6s:#### list_weight_changes_no_neighbors = [] for node in G.nodes(): interseccion = list(set(G.neighbors(node)) & set(list_R6s)) # print node, "intersection:",intersection,len(intersection) # print "because", list_R6s, "and ",G.neighbors(node) # raw_input() if len(interseccion) == 0: list_weight_changes_no_neighbors.append( G.node[node]['percentage_weight_change']) # print len(list_weight_changes_no_neighbors),"no_neighbors" who = "no_neigbors_R6s" Nbins = 18 histograma(list_weight_changes_no_neighbors, Nbins, dir, name, who) # mood test mood = stats.mood(list_weight_changes_no_neighbors, list_weight_changes_not_GC) print "mood test for", who, "against not_GC:", mood ######## # K-S test: ks = stats.ks_2samp(list_weight_changes_no_neighbors, list_weight_changes_not_GC) print "KS test for", who, "against not_GC:", ks name00 
= "ks_results.dat" file0 = open(dir + name00, 'at') print >> file0, "KS test for", who, "of", graph_name, "against not_GC:", ks file0.close() ############################################# #average percentage weight change as a function of the size of the largest CLIQUE the node belongs to: absolute_max = 1 for i in G.nodes(): maximo = 1 list2 = nx.cliques_containing_node(G, i) # print i, list2 for elem in list2: # print elem,len(elem,) if len(elem) > maximo: maximo = len(elem) # print "\n",maximo G.node[i]['max_clique_size'] = maximo if absolute_max < maximo: absolute_max = maximo #print absolute_max lista = list( nx.find_cliques(G)) # crea una lista de cliques (lista de listas) max_clique = nx.graph_clique_number(G) #finds out max size clique num_tot_clique = nx.graph_number_of_cliques( G) #finds out total number of cliques # count number of 2, 3, 4, 5, 6 and 7cliques: num_2cliques = 0 num_3cliques = 0 num_4cliques = 0 num_5cliques = 0 num_6cliques = 0 num_7cliques = 0 num_8cliques = 0 num_9cliques = 0 for element in lista: if len(element) == 2: num_2cliques = num_2cliques + 1 elif len(element) == 3: num_3cliques = num_3cliques + 1 elif len(element) == 4: num_4cliques = num_4cliques + 1 elif len(element) == 5: num_5cliques = num_5cliques + 1 elif len(element) == 6: num_6cliques = num_6cliques + 1 elif len(element) == 7: num_7cliques = num_7cliques + 1 elif len(element) == 8: num_8cliques = num_8cliques + 1 elif len(element) == 9: num_9cliques = num_9cliques + 1 # print " 2: ",num_2cliques, " 3: ",num_3cliques, " 4: ",num_4cliques, " 5: ",num_5cliques, " 6: ",num_6cliques, " 7: ",num_7cliques, " 8: ",num_8cliques, " 9: ",num_9cliques, " max_clique_size:",max_clique, " num_tot_cliques:", num_tot_clique name33 = dir + name + "_percent_weight_change_vs_largest_clique_size.dat" file11 = open(name33, 'wt') file11.close() list_of_lists_for_bootstrap = [] x_positions_fit = [] y_positions_fit = [] cum_size_set = float(len(G.nodes())) tot_nodes = [] for clique_size in range(1, max_clique): clique_size = clique_size + 1 print clique_size num_users_set = cum_size_set percent_weight_change_that_clique_size = [] for n in G.nodes(): if G.node[n]['max_clique_size'] == clique_size: percent_weight_change_that_clique_size.append( float(G.node[n]['percentage_weight_change'])) tot_nodes.append(float(G.node[n]['percentage_weight_change'])) cum_size_set -= 1.0 file11 = open(name33, 'at') print >> file11, clique_size, len( percent_weight_change_that_clique_size), num_users_set / float( len(G.nodes())), numpy.mean( percent_weight_change_that_clique_size), numpy.std( percent_weight_change_that_clique_size) file11.close() if len(x_positions_fit) <= 7: x_positions_fit.append(clique_size) y_positions_fit.append( numpy.mean(percent_weight_change_that_clique_size)) list_of_lists_for_bootstrap.append( percent_weight_change_that_clique_size) slope, intercept, Corr_coef, p_value, std_err = stats.linregress( x_positions_fit, y_positions_fit) # least squeares polinomial fit print "result linear. 
fit for clique size dependency:" print "slope:", slope, "intercept:", intercept, "Corr_coef:", Corr_coef, "p_value:", p_value, "std_err:", std_err name11 = dir + name + "_fits_clique_size.dat" file11 = open(name11, 'wt') for i in range(len(x_positions_fit)): print >> file11, x_positions_fit[ i], intercept + x_positions_fit[i] * slope print >> file11, "\n\n", "y=", intercept, "+", slope, "*x", print "Bootstrap for clique size:\n" mean_slope, standard_dev = bootstrap(x_positions_fit[0], x_positions_fit[-1], list_of_lists_for_bootstrap) zscore = (slope - mean_slope) / standard_dev print >> file11, "bootstrap:\n", "actual slope:", slope, "mean_slope:", mean_slope, "standard_dev:", standard_dev, "\n zscore:", zscore print x_positions_fit[0], x_positions_fit[ -1], "actual slope:", slope, "mean_slope:", mean_slope, "standard_dev:", standard_dev, "\n zscore:", zscore file11.close() contador += 1 file = open(name12, 'at') print >> file, contador, mean_slope, standard_dev, "largest_clique_size" file.close() ####################################### #####dose effect of the R6s independently######## name11 = dir + name + "_dose_eff_indepently_only_one_R6_" + str( time_in_system) + "days_exclude_R6s.dat" file11 = open(name11, 'at') print >> file11, 0, "average_no_neighbors", "average_no_neighbors", "average_no_neighbors", len( list_weight_changes_no_neighbors ), numpy.mean(list_weight_changes_no_neighbors), numpy.std( list_weight_changes_no_neighbors ) # the first line of the file is actually for no_neighbors, the rest, for one_and_only_one file11.close() file11 = open(name11, 'wt') file11.close() cont = 1 list_all = [] list_all_nodes = [] for R6 in list_R6s: list_weight_changes = [] for n in G.neighbors(R6): if (G.node[n]['role'] != "R6") and (G.node[n]["R6_overlap"] == 1): list_weight_changes.append( float(G.node[n]['percentage_weight_change'])) if n not in list_all_nodes: list_all_nodes.append(n) list_all.append( float(G.node[n]['percentage_weight_change'])) if len(list_weight_changes) > 0: file11 = open(name11, 'at') print >> file11, cont, G.node[R6]['role'], G.node[R6][ 'label'], len( G.neighbors(R6)), len(list_weight_changes), numpy.mean( list_weight_changes), numpy.std(list_weight_changes) file11.close() # print cont,G.node[R6]['role'],G.node[R6]['label'], len(G.neighbors(R6)),len(list_weight_changes),numpy.mean(list_weight_changes),numpy.std(list_weight_changes) cont = cont + 1 else: # file11=open(name11, 'at') #print >> file11,cont,G.node[R6]['role'],G.node[R6]['label'],len(G.neighbors(R6)),len(list_weight_changes) #file11.close() # print cont,G.node[R6]['role'],G.node[R6]['label'],len(G.neighbors(R6)),len(list_weight_changes) cont = cont + 1 who = "one_and_only_one_R6s" Nbins = 18 histograma(list_all, Nbins, dir, name, who) #################################### print "\n\n" list_of_lists_for_bootstrap = [] x_positions_fit = [] y_positions_fit = [] averages_larger5_x = [] averages_larger5_y = [] norm = 0.0 cum_size_set = float(len(G.nodes())) - float(len(list_R6s)) for r in range(len(list_R6s) + 1): # list_BMI_changes=[] list_weight_changes = [] list_percentage_weight_changes = [] list_activities = [] num_users_set = cum_size_set for node in G.nodes(): if int(G.node[node]["R6_overlap"]) == r: if G.node[node]["role"] == "R6": # i exclude the R6s pass else: if int(G.node[node]['time_in_system']) > time_in_system: # list_BMI_changes.append(float(G.node[node]['final_BMI'])-float(G.node[node]['initial_BMI'])) list_weight_changes.append( float(G.node[node]['weight_change'])) 
list_percentage_weight_changes.append( float(G.node[node]['percentage_weight_change'])) list_activities.append( float(G.node[node]['activity']) / float(G.node[node]['time_in_system'])) cum_size_set -= 1.0 if len(list_percentage_weight_changes) > 0: # average_BMI_change=numpy.mean(list_BMI_changes) average_weight_change = numpy.mean(list_weight_changes) average_percentage_weight_change = numpy.mean( list_percentage_weight_changes) average_activity = numpy.mean(list_activities) #deviation_BMI=numpy.std(list_BMI_changes) deviation_weight = numpy.std(list_weight_changes) deviation_percentage_weight = numpy.std( list_percentage_weight_changes) deviation_activity = numpy.std(list_activities) #print out file0 = open(name0, 'at') print >> file0, r, len( list_percentage_weight_changes ), num_users_set / float( len(G.nodes()) ), average_percentage_weight_change, deviation_percentage_weight, average_weight_change, deviation_weight, average_activity, deviation_activity file0.close() if r <= 5: x_positions_fit.append(r) y_positions_fit.append(average_percentage_weight_change) list_of_lists_for_bootstrap.append( list_percentage_weight_changes) # else: # aux_x=r*len(list_percentage_weight_changes) # averages_larger5_x.append(aux_x) # aux_y=average_percentage_weight_change*len(list_percentage_weight_changes) # averages_larger5_y.append(aux_y) #norm+=float(len(list_percentage_weight_changes)) # x_positions_fit.append(numpy.mean(averages_larger5_x)/norm) # y_positions_fit.append(numpy.mean(averages_larger5_y)/norm) slope, intercept, Corr_coef, p_value, std_err = stats.linregress( x_positions_fit, y_positions_fit) # least squeares polinomial fit print "result linear. fit for dose eff.:" print "slope:", slope, "intercept:", intercept, "Corr_coef:", Corr_coef, "p_value:", p_value, "std_err:", std_err name11 = dir + name + "_fits_dose_eff_R6.dat" file11 = open(name11, 'wt') for i in range(len(x_positions_fit)): print >> file11, x_positions_fit[ i], intercept + x_positions_fit[i] * slope print >> file11, "\n\n", "y=", intercept, "+", slope, "*x", print "Bootstrap for dose eff. R6s:\n" mean_slope, standard_dev = bootstrap(x_positions_fit[0], x_positions_fit[-1], list_of_lists_for_bootstrap) zscore = (slope - mean_slope) / standard_dev print >> file11, "bootstrap:\n", "actual slope:", slope, "mean_slope:", mean_slope, "standard_dev:", standard_dev, "\n zscore:", zscore print x_positions_fit[0], x_positions_fit[ -1], "actual slope:", slope, "mean_slope:", mean_slope, "standard_dev:", standard_dev, "\n zscore:", zscore file11.close() contador += 1 file = open(name12, 'at') print >> file, contador, mean_slope, standard_dev, "dose_eff" file.close() #### averages for every R6's egonetwork:######### cont = 1 list_all_ = [] list_all_nodes_ = [] for node in list_R6s: neighbors = G.neighbors(node) #a list of nodes average_BMI_change = 0.0 list_BMI_changes = [] average_weight_change = 0.0 list_weight_changes = [] average_percentage_weight_change = 0.0 list_percentage_weight_changes = [] average_activity = 0.0 # ojo! sera dividida por el numero de dias!!!!! 
list_activities = [] for n in G.neighbors(node): if int(G.node[n]['time_in_system']) > time_in_system: # list_BMI_changes.append(float(G.node[n]['final_BMI'])-float(G.node[n]['initial_BMI'])) list_weight_changes.append(float(G.node[n]['weight_change'])) list_percentage_weight_changes.append( float(G.node[n]['percentage_weight_change'])) list_activities.append( float(G.node[n]['activity']) / float(G.node[n]['time_in_system'])) if n not in list_all_nodes_: list_all_nodes_.append(n) list_all_.append( float(G.node[n]['percentage_weight_change'])) #averages average_weight_change = numpy.mean(list_weight_changes) # average_BMI_change=numpy.mean(list_BMI_changes) average_activity = numpy.mean(list_activities) average_percentage_weight_change = numpy.mean( list_percentage_weight_changes) #standard deviation #deviation_BMI=numpy.std(list_BMI_changes) deviation_weight = numpy.std(list_weight_changes) deviation_percentage_weight = numpy.std(list_percentage_weight_changes) deviation_activity = numpy.std(list_activities) #print out name2 = dir + name + "_ego_R6s_average_weight_change_" + str( time_in_system) + "days.dat" file2 = open(name2, 'at') print >> file2, cont, G.node[node]['role'], G.node[node]['label'], len( G.neighbors(node)), average_weight_change, deviation_weight file2.close() name22 = dir + name + "_ego_R6s_average_percentage_weight_change_" + str( time_in_system) + "days.dat" file22 = open(name22, 'at') print >> file22, cont, G.node[node]['role'], G.node[node][ 'label'], len( G.neighbors(node) ), average_percentage_weight_change, deviation_percentage_weight file22.close() name3 = dir + name + "_ego_R6s_average_activity_" + str( time_in_system) + "days.dat" file3 = open(name3, 'at') print >> file3, cont, G.node[node]['role'], G.node[node]['label'], len( G.neighbors(node)), average_activity, deviation_activity file3.close() cont = cont + 1 who = "R6s_egonetworks_all" Nbins = 18 histograma(list_all_, Nbins, dir, name, who) # print "intersection:",len(set(list_all_)&set(list_all)),len(list_all_),len(list_all) #############just checking what happens if we remove the 40155 guy ##### percent weight change vs. 
role: list_roles = ["R1", "R2", "R3", "R4", "R5", "R6", "R7"] file = open(dir + name + "_percentage_weight_change_vs_role", 'wt') cont = 1 for role in list_roles: list_weight_changes_role = [] for n in G.nodes(): if G.node[n]['role'] == role: list_weight_changes_role.append( G.node[n]['percentage_weight_change']) print >> file, cont, role, len(list_weight_changes_role), numpy.mean( list_weight_changes_role), numpy.std(list_weight_changes_role) cont += 1 file.close() ############################# ############## percentage weight change vs k x_positions_fit = [] y_positions_fit = [] cum_size_set = float(len(G.nodes())) list_of_lists_for_bootstrap = [] list_k = [] for n in G.nodes(): list_k.append(len(G.neighbors(n))) max_k = max(list_k) file = open(dir + name + "_percentage_weight_change_vs_k.dat", 'wt') max_k = max_k + 1 for k in range(1, max_k): num_users_set = cum_size_set list_percent_weight_change_k = [] for n in G.nodes(): if len(G.neighbors(n)) == k: list_percent_weight_change_k.append( G.node[n]['percentage_weight_change']) cum_size_set -= 1.0 if len(list_percent_weight_change_k) > 0: print >> file, k, len( list_percent_weight_change_k), num_users_set / float( len(G.nodes())), numpy.mean( list_percent_weight_change_k), numpy.std( list_percent_weight_change_k) if len(x_positions_fit) <= 7: x_positions_fit.append(k) y_positions_fit.append( numpy.mean(list_percent_weight_change_k)) list_of_lists_for_bootstrap.append( list_percent_weight_change_k) slope, intercept, Corr_coef, p_value, std_err = stats.linregress( x_positions_fit, y_positions_fit) # least squeares polinomial fit print "result linear. fit for degree dependency:" print "slope:", slope, "intercept:", intercept, "Corr_coef:", Corr_coef, "p_value:", p_value, "std_err:", std_err file.close() name11 = dir + name + "_fits_degree.dat" file11 = open(name11, 'wt') for i in range(len(x_positions_fit)): print >> file11, x_positions_fit[ i], intercept + x_positions_fit[i] * slope print >> file11, "\n\n", "y=", intercept, "+", slope, "*x", print "Bootstrap for degree:\n" mean_slope, standard_dev = bootstrap(x_positions_fit[0], x_positions_fit[-1], list_of_lists_for_bootstrap) zscore = (slope - mean_slope) / standard_dev print >> file11, "bootstrap:\n", "actual slope:", slope, "mean_slope:", mean_slope, "standard_dev:", standard_dev, "\n zscore:", zscore print x_positions_fit[0], x_positions_fit[ -1], "actual slope:", slope, "mean_slope:", mean_slope, "standard_dev:", standard_dev, "\n zscore:", zscore file11.close() contador += 1 file = open(name12, 'at') print >> file, contador, mean_slope, standard_dev, "degree" file.close() ######################################## new_name = graph_name.split(".gml")[0] new_name = new_name + "_adherent_num_R6s_largest_clique.gml" nx.write_gml(G, new_name)
def main(graph_name): H = nx.read_gml(graph_name) for node in H.nodes(): # i remove self loops if node in H.neighbors(node): if len(H.neighbors(node))>1: H.remove_edge(node,node) else: H.remove_node(node) for node in H.nodes(): if H.node[node]['weigh_ins'] <5: #Adherent filter H.remove_node(node) # print node, "is going down" G= nx.connected_component_subgraphs(H)[0] # Giant component print "final size of the GC:",len(G.nodes()) #dir=graph_name.split("fr")[0] #dir=graph_name.split("master")[0] #dir=graph_name.split("method3_")[0] dir=graph_name.split("engaged_")[0] dir=dir+"roles/" print dir time_in_system=100 #minimum amount of time in the sytem for a user to be included in the statistics #name=graph_name.split('data/')[1] name=graph_name.split('method3/')[1] print name name=name.split('.gml')[0] print name print dir+name name0=dir+name+"_overlap_R6s_averages_"+str(time_in_system)+"days_exclude_R6s_clinically_signif.dat" file0=open(name0, 'wt') file0.close() ####for the Isolated Clusters: list_GC_nodes=[] for n in G.nodes(): list_GC_nodes.append(n) # print G.node[n]['percentage_weight_change'] # print "# users GC:",len(list_GC_nodes),"total:",len(H.nodes()) list_weight_changes_not_GC=[] for n in H.nodes(): if n not in list_GC_nodes: #print n,"not in GC" list_weight_changes_not_GC.append(float(H.node[n]['percentage_weight_change'])) #print "# users not in GC:",len(list_weight_changes_not_GC) who="not_GC" Nbins=18 histograma(list_weight_changes_not_GC,Nbins,dir,name,who) ########################### list_R6s=[] # collect the R6 of the system list_R6s_label=[] list_R6s_percent_weight_change=[] for node in G.nodes() : if str(G.node[node]['role']) == "R6" : list_R6s.append(node) list_R6s_label.append(G.node[node]['label']) list_R6s_percent_weight_change.append(float(G.node[node]['percentage_weight_change'])) name00=dir+name+"R6s_and_top_tens_averages_"+str(time_in_system)+"days_exclude_R6s_clinically_signif.dat" file0=open(name00, 'at') print >> file0,"R6s",numpy.mean(list_R6s_percent_weight_change),numpy.std(list_R6s_percent_weight_change) file0.close() # print "\n\n R6s:\n" # for i in list_R6s_label: # print i # studying the possible cumulative effect of more than one R6 on the population: for node in G.nodes(): cont=0 for n in G.neighbors(node): if str(G.node[n]['role']) == "R6" : cont+=1 G.node[node]["R6_overlap"]=int(cont) ##### weight change for people not connected to any R6s:#### list_weight_changes_no_neighbors=[] for node in G.nodes(): interseccion=list(set(G.neighbors(node)) & set(list_R6s)) # print node, "intersection:",intersection,len(intersection) # print "because", list_R6s, "and ",G.neighbors(node) # raw_input() if len(interseccion)==0: list_weight_changes_no_neighbors.append(G.node[node]['percentage_weight_change']) # print len(list_weight_changes_no_neighbors),"no_neighbors" who="no_neigbors_R6s" Nbins=18 histograma(list_weight_changes_no_neighbors,Nbins,dir,name,who) # mood test mood=stats.mood(list_weight_changes_no_neighbors,list_weight_changes_not_GC) print "mood test for",who, "against not_GC:",mood ######## # K-S test: ks=stats.ks_2samp(list_weight_changes_no_neighbors,list_weight_changes_not_GC) print "KS test for",who, "against not_GC:",ks name00="ks_results_clinically_signif.dat" file0=open(dir+name00, 'at') print >> file0, "KS test for",who,"of",graph_name, "against not_GC:",ks file0.close() ############################################# #average percentage weight change as a function of the size of the largest CLIQUE the node belongs to: absolute_max=1 for i in 
G.nodes(): maximo=1 list2=nx.cliques_containing_node(G, i) # print i, list2 for elem in list2: # print elem,len(elem,) if len(elem) > maximo: maximo=len(elem) # print "\n",maximo G.node[i]['max_clique_size']=maximo if absolute_max < maximo: absolute_max = maximo print absolute_max lista=list(nx.find_cliques(G)) # crea una lista de cliques (lista de listas) max_clique=nx.graph_clique_number(G) #finds out max size clique num_tot_clique=nx.graph_number_of_cliques(G) #finds out total number of cliques # count number of 2, 3, 4, 5, 6 and 7cliques: num_2cliques=0 num_3cliques=0 num_4cliques=0 num_5cliques=0 num_6cliques=0 num_7cliques=0 num_8cliques=0 num_9cliques=0 for element in lista: if len(element)==2: num_2cliques=num_2cliques +1 elif len(element)==3: num_3cliques=num_3cliques+1 elif len(element)==4: num_4cliques=num_4cliques+1 elif len(element)==5: num_5cliques=num_5cliques+1 elif len(element)==6: num_6cliques=num_6cliques+1 elif len(element)==7: num_7cliques=num_7cliques+1 elif len(element)==8: num_8cliques=num_8cliques+1 elif len(element)==9: num_9cliques=num_9cliques+1 print " 2: ",num_2cliques, " 3: ",num_3cliques, " 4: ",num_4cliques, " 5: ",num_5cliques, " 6: ",num_6cliques, " 7: ",num_7cliques, " 8: ",num_8cliques, " 9: ",num_9cliques, " max_clique_size:",max_clique, " num_tot_cliques:", num_tot_clique name33=dir+name+"_percent_weight_change_vs_largest_clique_size_clinically_signif.dat" file11=open(name33, 'wt') file11.close() cum_size_set=float(len(G.nodes())) tot_nodes=[] for clique_size in range(max_clique): clique_size=clique_size+1 num_users_clinically_signif=0.0 num_users_set=cum_size_set percent_weight_change_that_clique_size=[] for n in G.nodes(): if G.node[n]['max_clique_size']==clique_size: percent_weight_change_that_clique_size.append(float(G.node[n]['percentage_weight_change'])) tot_nodes.append(float(G.node[n]['percentage_weight_change'])) cum_size_set-=1.0 if G.node [n]['percentage_weight_change']<=-5.0: num_users_clinically_signif+=1.0 try: file11=open(name33, 'at') print >> file11,clique_size,len(percent_weight_change_that_clique_size),num_users_set/float(len(G.nodes())),num_users_clinically_signif/len(percent_weight_change_that_clique_size),numpy.mean(percent_weight_change_that_clique_size),numpy.std(percent_weight_change_that_clique_size) file11.close() except ZeroDivisionError: file11=open(name33, 'at') print >> file11,clique_size,len(percent_weight_change_that_clique_size),num_users_set/float(len(G.nodes())),0.0 ,numpy.mean(percent_weight_change_that_clique_size),numpy.std(percent_weight_change_that_clique_size) file11.close() ####################################### #####dose effect of the R6s independently######## name11=dir+name+"_dose_eff_indepently_only_one_R6_"+str(time_in_system)+"days_exclude_R6s.dat" file11=open(name11, 'at') print >> file11,0,"average_no_neighbors","average_no_neighbors","average_no_neighbors",len(list_weight_changes_no_neighbors),numpy.mean(list_weight_changes_no_neighbors),numpy.std(list_weight_changes_no_neighbors) # the first line of the file is actually for no_neighbors, the rest, for one_and_only_one file11.close() file11=open(name11, 'wt') file11.close() cont=1 list_all=[] list_all_nodes=[] for R6 in list_R6s: list_weight_changes=[] for n in G.neighbors(R6): if (G.node[n]['role'] != "R6") and ( G.node[n]["R6_overlap"]==1) : list_weight_changes.append(float(G.node[n]['percentage_weight_change'])) if n not in list_all_nodes: list_all_nodes.append(n) list_all.append(float(G.node[n]['percentage_weight_change'])) if 
len(list_weight_changes)>0: file11=open(name11, 'at') print >> file11,cont,G.node[R6]['role'],G.node[R6]['label'],len(G.neighbors(R6)),len(list_weight_changes),numpy.mean(list_weight_changes),numpy.std(list_weight_changes) file11.close() # print cont,G.node[R6]['role'],G.node[R6]['label'], len(G.neighbors(R6)),len(list_weight_changes),numpy.mean(list_weight_changes),numpy.std(list_weight_changes) cont=cont+1 else: # file11=open(name11, 'at') #print >> file11,cont,G.node[R6]['role'],G.node[R6]['label'],len(G.neighbors(R6)),len(list_weight_changes) #file11.close() # print cont,G.node[R6]['role'],G.node[R6]['label'],len(G.neighbors(R6)),len(list_weight_changes) cont=cont+1 who="one_and_only_one_R6s" Nbins=18 histograma(list_all,Nbins,dir,name,who) #################################### cum_size_set=float(len(G.nodes()))-float(len(list_R6s)) for r in range(len(list_R6s)+1): # list_BMI_changes=[] list_weight_changes=[] list_percentage_weight_changes=[] list_activities=[] num_users_clinically_signif=0.0 num_users_set=cum_size_set for node in G.nodes(): if int(G.node[node]["R6_overlap"])==r: if G.node[node]["role"]== "R6": # i exclude the R6s pass else: if int(G.node[node]['time_in_system']) > time_in_system: # list_BMI_changes.append(float(G.node[node]['final_BMI'])-float(G.node[node]['initial_BMI'])) list_weight_changes.append(float(G.node[node]['weight_change'])) list_percentage_weight_changes.append(float(G.node[node]['percentage_weight_change'])) list_activities.append(float(G.node[node]['activity'])/float(G.node[node]['time_in_system'])) cum_size_set-=1.0 if G.node [node]['percentage_weight_change']<=-5.0: num_users_clinically_signif+=1.0 if len(list_percentage_weight_changes)>0: # average_BMI_change=numpy.mean(list_BMI_changes) average_weight_change=numpy.mean(list_weight_changes) average_percentage_weight_change=numpy.mean(list_percentage_weight_changes) average_activity=numpy.mean(list_activities) #deviation_BMI=numpy.std(list_BMI_changes) deviation_weight=numpy.std(list_weight_changes) deviation_percentage_weight=numpy.std(list_percentage_weight_changes) deviation_activity=numpy.std(list_activities) #print out try: file0=open(name0, 'at') print >> file0,r,len(list_percentage_weight_changes),num_users_set/float(len(G.nodes())),num_users_clinically_signif/len(list_percentage_weight_changes),average_percentage_weight_change,deviation_percentage_weight,average_weight_change,deviation_weight,average_activity,deviation_activity file0.close() except ZeroDivisionError: file11=open(name33, 'at') print >> file11,clique_size,len(percent_weight_change_that_clique_size),num_users_set/float(len(G.nodes())),0.0 ,numpy.mean(percent_weight_change_that_clique_size),numpy.std(percent_weight_change_that_clique_size) file11.close() #### averages for every R6's egonetwork:######### cont=1 list_all_=[] list_all_nodes_=[] for node in list_R6s: neighbors=G.neighbors(node)#a list of nodes average_BMI_change=0.0 list_BMI_changes=[] average_weight_change=0.0 list_weight_changes=[] average_percentage_weight_change=0.0 list_percentage_weight_changes=[] average_activity=0.0 # ojo! sera dividida por el numero de dias!!!!! 
list_activities=[] for n in G.neighbors(node): if int(G.node[n]['time_in_system']) > time_in_system: # list_BMI_changes.append(float(G.node[n]['final_BMI'])-float(G.node[n]['initial_BMI'])) list_weight_changes.append(float(G.node[n]['weight_change'])) list_percentage_weight_changes.append(float(G.node[n]['percentage_weight_change'])) list_activities.append(float(G.node[n]['activity'])/float(G.node[n]['time_in_system'])) if n not in list_all_nodes_: list_all_nodes_.append(n) list_all_.append(float(G.node[n]['percentage_weight_change'])) #averages average_weight_change=numpy.mean(list_weight_changes) # average_BMI_change=numpy.mean(list_BMI_changes) average_activity=numpy.mean(list_activities) average_percentage_weight_change=numpy.mean(list_percentage_weight_changes) #standard deviation #deviation_BMI=numpy.std(list_BMI_changes) deviation_weight=numpy.std(list_weight_changes) deviation_percentage_weight=numpy.std(list_percentage_weight_changes) deviation_activity=numpy.std(list_activities) #print out name2=dir+name+"_ego_R6s_average_weight_change_"+str(time_in_system)+"days.dat" file2=open(name2, 'at') print >> file2,cont,G.node[node]['role'],G.node[node]['label'],len(G.neighbors(node)),average_weight_change,deviation_weight file2.close() name22=dir+name+"_ego_R6s_average_percentage_weight_change_"+str(time_in_system)+"days.dat" file22=open(name22, 'at') print >> file22,cont,G.node[node]['role'],G.node[node]['label'],len(G.neighbors(node)),average_percentage_weight_change,deviation_percentage_weight file22.close() name3=dir+name+"_ego_R6s_average_activity_"+str(time_in_system)+"days.dat" file3=open(name3, 'at') print >> file3,cont,G.node[node]['role'],G.node[node]['label'],len(G.neighbors(node)),average_activity,deviation_activity file3.close() cont=cont+1 who="R6s_egonetworks_all" Nbins=18 histograma(list_all_,Nbins,dir,name,who) # print "intersection:",len(set(list_all_)&set(list_all)),len(list_all_),len(list_all) #############just checking what happens if we remove the 40155 guy ##### percent weight change vs. 
role: list_roles=["R1","R2","R3","R4","R5","R6","R7"] file = open(dir+name+"_percentage_weight_change_vs_role",'wt') cont=1 for role in list_roles: list_weight_changes_role=[] for n in G.nodes(): if G.node[n]['role']==role: list_weight_changes_role.append(G.node[n]['percentage_weight_change']) print >> file, cont, role, len(list_weight_changes_role),numpy.mean(list_weight_changes_role),numpy.std(list_weight_changes_role) cont+=1 file.close() ############################# ############## percentage weight change vs k cum_size_set=float(len(G.nodes())) list_k=[] for n in G.nodes(): list_k.append(len(G.neighbors(n))) max_k=max(list_k) file = open(dir+name+"_percentage_weight_change_vs_k_clinically_signif.dat",'wt') max_k=max_k+1 for k in range(1,max_k): num_users_clinically_signif=0.0 num_users_set=cum_size_set list_percent_weight_change_k=[] for n in G.nodes(): if len(G.neighbors(n))==k: list_percent_weight_change_k.append(G.node[n]['percentage_weight_change']) cum_size_set-=1.0 if G.node [n]['percentage_weight_change']<=-5.0: num_users_clinically_signif+=1.0 if len(list_percent_weight_change_k)>0: try: print >> file,k, len(list_percent_weight_change_k),num_users_set/float(len(G.nodes())),num_users_clinically_signif/len(list_percent_weight_change_k),numpy.mean(list_percent_weight_change_k),numpy.std(list_percent_weight_change_k) except ZeroDivisionError: file11=open(name33, 'at') print >> file11,clique_size,len(percent_weight_change_that_clique_size),num_users_set/float(len(G.nodes())),0.0 ,numpy.mean(percent_weight_change_that_clique_size),numpy.std(percent_weight_change_that_clique_size) file11.close() file.close()
if p_value > alpha:
    print('Same distributions (fail to reject H0)')
else:
    print('Different distributions (reject H0)')

stat, p_value = levene(dataset['Open'], dataset['Adj Close'])
print('Levene Test')
print('-' * 40)
print('Statistics=%.3f, p=%.3f' % (stat, p_value))
# interpret
alpha = 0.05
if p_value > alpha:
    print('Same distributions (fail to reject H0)')
else:
    print('Different distributions (reject H0)')

stat, p_value = mood(dataset['Open'], dataset['Adj Close'])
print('Mood Test')
print('-' * 40)
print('Statistics=%.3f, p=%.3f' % (stat, p_value))
# interpret
alpha = 0.05
if p_value > alpha:
    print('Same distributions (fail to reject H0)')
else:
    print('Different distributions (reject H0)')

stat, p_value, med, tbl = median_test(dataset['Open'], dataset['Adj Close'],
                                      dataset['Volume'])
print('Mood’s median test')
print('-' * 40)

def custom(a, b):
    _, p = stats.mood(a, b)
    return p

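# Quick illustration of the p-value helper above (the arrays and seed are
# illustrative assumptions, not data from the original source).
import numpy as np
from scipy import stats

rng = np.random.default_rng(1)
same_scale = custom(rng.normal(size=100), rng.normal(size=100))
diff_scale = custom(rng.normal(size=100), rng.normal(scale=3.0, size=100))
print(same_scale, diff_scale)  # the second p-value should be much smaller
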
#==============================================================================
# print("mannwhitneyu")
# data['mannwhitneyu'] = [mannwhitneyu(x, y)[0]
#                         for (x, y) in zip(np.nan_to_num(question1_vectors),
#                                           np.nan_to_num(question2_vectors))]
#==============================================================================

print("fligner")
data['fligner'] = [
    fligner(x, y)[0]
    for (x, y) in zip(np.nan_to_num(question1_vectors),
                      np.nan_to_num(question2_vectors))
]

print("mood")
data['mood'] = [
    mood(x, y)[0]
    for (x, y) in zip(np.nan_to_num(question1_vectors),
                      np.nan_to_num(question2_vectors))
]

print("ks_2samp")
data['ks_2samp'] = [
    ks_2samp(x, y)[0]
    for (x, y) in zip(np.nan_to_num(question1_vectors),
                      np.nan_to_num(question2_vectors))
]

print("wilcoxon")
data['wilcoxon'] = [
    wilcoxon(x, y)[0]
    for (x, y) in zip(np.nan_to_num(question1_vectors),
                      np.nan_to_num(question2_vectors))
]

def main(): parser = argparse.ArgumentParser() parser.add_argument("-i", "--infile", required=True, help="Tabular file.") parser.add_argument("-o", "--outfile", required=True, help="Path to the output file.") parser.add_argument("--sample_one_cols", help="Input format, like smi, sdf, inchi") parser.add_argument("--sample_two_cols", help="Input format, like smi, sdf, inchi") parser.add_argument( "--sample_cols", help="Input format, like smi, sdf, inchi,separate arrays using ;", ) parser.add_argument("--test_id", help="statistical test method") parser.add_argument( "--mwu_use_continuity", action="store_true", default=False, help= "Whether a continuity correction (1/2.) should be taken into account.", ) parser.add_argument( "--equal_var", action="store_true", default=False, help= "If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.", ) parser.add_argument( "--reta", action="store_true", default=False, help="Whether or not to return the internally computed a values.", ) parser.add_argument( "--fisher", action="store_true", default=False, help="if true then Fisher definition is used", ) parser.add_argument( "--bias", action="store_true", default=False, help= "if false,then the calculations are corrected for statistical bias", ) parser.add_argument( "--inclusive1", action="store_true", default=False, help="if false,lower_limit will be ignored", ) parser.add_argument( "--inclusive2", action="store_true", default=False, help="if false,higher_limit will be ignored", ) parser.add_argument( "--inclusive", action="store_true", default=False, help="if false,limit will be ignored", ) parser.add_argument( "--printextras", action="store_true", default=False, help= "If True, if there are extra points a warning is raised saying how many of those points there are", ) parser.add_argument( "--initial_lexsort", action="store_true", default="False", help= "Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.", ) parser.add_argument( "--correction", action="store_true", default=False, help="continuity correction ", ) parser.add_argument( "--axis", type=int, default=0, help= "Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)", ) parser.add_argument( "--n", type=int, default=0, help= "the number of trials. This is ignored if x gives both the number of successes and failures", ) parser.add_argument("--b", type=int, default=0, help="The number of bins to use for the histogram") parser.add_argument("--N", type=int, default=0, help="Score that is compared to the elements in a.") parser.add_argument("--ddof", type=int, default=0, help="Degrees of freedom correction") parser.add_argument( "--score", type=int, default=0, help="Score that is compared to the elements in a.", ) parser.add_argument("--m", type=float, default=0.0, help="limits") parser.add_argument("--mf", type=float, default=2.0, help="lower limit") parser.add_argument("--nf", type=float, default=99.9, help="higher_limit") parser.add_argument( "--p", type=float, default=0.5, help= "The hypothesized probability of success. 0 <= p <= 1. 
The default value is p = 0.5", ) parser.add_argument("--alpha", type=float, default=0.9, help="probability") parser.add_argument( "--new", type=float, default=0.0, help="Value to put in place of values in a outside of bounds", ) parser.add_argument( "--proportiontocut", type=float, default=0.0, help="Proportion (in range 0-1) of total data set to trim of each end.", ) parser.add_argument( "--lambda_", type=float, default=1.0, help= "lambda_ gives the power in the Cressie-Read power divergence statistic", ) parser.add_argument( "--imbda", type=float, default=0, help= "If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.", ) parser.add_argument( "--base", type=float, default=1.6, help="The logarithmic base to use, defaults to e", ) parser.add_argument("--dtype", help="dtype") parser.add_argument("--med", help="med") parser.add_argument("--cdf", help="cdf") parser.add_argument("--zero_method", help="zero_method options") parser.add_argument("--dist", help="dist options") parser.add_argument("--ties", help="ties options") parser.add_argument("--alternative", help="alternative options") parser.add_argument("--mode", help="mode options") parser.add_argument("--method", help="method options") parser.add_argument("--md", help="md options") parser.add_argument("--center", help="center options") parser.add_argument("--kind", help="kind options") parser.add_argument("--tail", help="tail options") parser.add_argument("--interpolation", help="interpolation options") parser.add_argument("--statistic", help="statistic options") args = parser.parse_args() infile = args.infile outfile = open(args.outfile, "w+") test_id = args.test_id nf = args.nf mf = args.mf imbda = args.imbda inclusive1 = args.inclusive1 inclusive2 = args.inclusive2 sample0 = 0 sample1 = 0 sample2 = 0 if args.sample_cols is not None: sample0 = 1 barlett_samples = [] for sample in args.sample_cols.split(";"): barlett_samples.append(map(int, sample.split(","))) if args.sample_one_cols is not None: sample1 = 1 sample_one_cols = args.sample_one_cols.split(",") if args.sample_two_cols is not None: sample_two_cols = args.sample_two_cols.split(",") sample2 = 1 for line in open(infile): sample_one = [] sample_two = [] cols = line.strip().split("\t") if sample0 == 1: b_samples = columns_to_values(barlett_samples, line) if sample1 == 1: for index in sample_one_cols: sample_one.append(cols[int(index) - 1]) if sample2 == 1: for index in sample_two_cols: sample_two.append(cols[int(index) - 1]) if test_id.strip() == "describe": size, min_max, mean, uv, bs, bk = stats.describe( map(float, sample_one)) cols.append(size) cols.append(min_max) cols.append(mean) cols.append(uv) cols.append(bs) cols.append(bk) elif test_id.strip() == "mode": vals, counts = stats.mode(map(float, sample_one)) cols.append(vals) cols.append(counts) elif test_id.strip() == "nanmean": m = stats.nanmean(map(float, sample_one)) cols.append(m) elif test_id.strip() == "nanmedian": m = stats.nanmedian(map(float, sample_one)) cols.append(m) elif test_id.strip() == "kurtosistest": z_value, p_value = stats.kurtosistest(map(float, sample_one)) cols.append(z_value) cols.append(p_value) elif test_id.strip() == "variation": ra = stats.variation(map(float, sample_one)) cols.append(ra) elif test_id.strip() == "itemfreq": freq = stats.itemfreq(map(float, sample_one)) for list in freq: elements = ",".join(map(str, list)) cols.append(elements) elif test_id.strip() == 
"nanmedian": m = stats.nanmedian(map(float, sample_one)) cols.append(m) elif test_id.strip() == "variation": ra = stats.variation(map(float, sample_one)) cols.append(ra) elif test_id.strip() == "boxcox_llf": IIf = stats.boxcox_llf(imbda, map(float, sample_one)) cols.append(IIf) elif test_id.strip() == "tiecorrect": fa = stats.tiecorrect(map(float, sample_one)) cols.append(fa) elif test_id.strip() == "rankdata": r = stats.rankdata(map(float, sample_one), method=args.md) cols.append(r) elif test_id.strip() == "nanstd": s = stats.nanstd(map(float, sample_one), bias=args.bias) cols.append(s) elif test_id.strip() == "anderson": A2, critical, sig = stats.anderson(map(float, sample_one), dist=args.dist) cols.append(A2) for list in critical: cols.append(list) cols.append(",") for list in sig: cols.append(list) elif test_id.strip() == "binom_test": p_value = stats.binom_test(map(float, sample_one), n=args.n, p=args.p) cols.append(p_value) elif test_id.strip() == "gmean": gm = stats.gmean(map(float, sample_one), dtype=args.dtype) cols.append(gm) elif test_id.strip() == "hmean": hm = stats.hmean(map(float, sample_one), dtype=args.dtype) cols.append(hm) elif test_id.strip() == "kurtosis": k = stats.kurtosis( map(float, sample_one), axis=args.axis, fisher=args.fisher, bias=args.bias, ) cols.append(k) elif test_id.strip() == "moment": n_moment = stats.moment(map(float, sample_one), n=args.n) cols.append(n_moment) elif test_id.strip() == "normaltest": k2, p_value = stats.normaltest(map(float, sample_one)) cols.append(k2) cols.append(p_value) elif test_id.strip() == "skew": skewness = stats.skew(map(float, sample_one), bias=args.bias) cols.append(skewness) elif test_id.strip() == "skewtest": z_value, p_value = stats.skewtest(map(float, sample_one)) cols.append(z_value) cols.append(p_value) elif test_id.strip() == "sem": s = stats.sem(map(float, sample_one), ddof=args.ddof) cols.append(s) elif test_id.strip() == "zscore": z = stats.zscore(map(float, sample_one), ddof=args.ddof) for list in z: cols.append(list) elif test_id.strip() == "signaltonoise": s2n = stats.signaltonoise(map(float, sample_one), ddof=args.ddof) cols.append(s2n) elif test_id.strip() == "percentileofscore": p = stats.percentileofscore(map(float, sample_one), score=args.score, kind=args.kind) cols.append(p) elif test_id.strip() == "bayes_mvs": c_mean, c_var, c_std = stats.bayes_mvs(map(float, sample_one), alpha=args.alpha) cols.append(c_mean) cols.append(c_var) cols.append(c_std) elif test_id.strip() == "sigmaclip": c, c_low, c_up = stats.sigmaclip(map(float, sample_one), low=args.m, high=args.n) cols.append(c) cols.append(c_low) cols.append(c_up) elif test_id.strip() == "kstest": d, p_value = stats.kstest( map(float, sample_one), cdf=args.cdf, N=args.N, alternative=args.alternative, mode=args.mode, ) cols.append(d) cols.append(p_value) elif test_id.strip() == "chi2_contingency": chi2, p, dof, ex = stats.chi2_contingency( map(float, sample_one), correction=args.correction, lambda_=args.lambda_) cols.append(chi2) cols.append(p) cols.append(dof) cols.append(ex) elif test_id.strip() == "tmean": if nf == 0 and mf == 0: mean = stats.tmean(map(float, sample_one)) else: mean = stats.tmean(map(float, sample_one), (mf, nf), (inclusive1, inclusive2)) cols.append(mean) elif test_id.strip() == "tmin": if mf == 0: min = stats.tmin(map(float, sample_one)) else: min = stats.tmin(map(float, sample_one), lowerlimit=mf, inclusive=args.inclusive) cols.append(min) elif test_id.strip() == "tmax": if nf == 0: max = stats.tmax(map(float, sample_one)) else: max 
= stats.tmax(map(float, sample_one), upperlimit=nf, inclusive=args.inclusive) cols.append(max) elif test_id.strip() == "tvar": if nf == 0 and mf == 0: var = stats.tvar(map(float, sample_one)) else: var = stats.tvar(map(float, sample_one), (mf, nf), (inclusive1, inclusive2)) cols.append(var) elif test_id.strip() == "tstd": if nf == 0 and mf == 0: std = stats.tstd(map(float, sample_one)) else: std = stats.tstd(map(float, sample_one), (mf, nf), (inclusive1, inclusive2)) cols.append(std) elif test_id.strip() == "tsem": if nf == 0 and mf == 0: s = stats.tsem(map(float, sample_one)) else: s = stats.tsem(map(float, sample_one), (mf, nf), (inclusive1, inclusive2)) cols.append(s) elif test_id.strip() == "scoreatpercentile": if nf == 0 and mf == 0: s = stats.scoreatpercentile( map(float, sample_one), map(float, sample_two), interpolation_method=args.interpolation, ) else: s = stats.scoreatpercentile( map(float, sample_one), map(float, sample_two), (mf, nf), interpolation_method=args.interpolation, ) for list in s: cols.append(list) elif test_id.strip() == "relfreq": if nf == 0 and mf == 0: rel, low_range, binsize, ex = stats.relfreq( map(float, sample_one), args.b) else: rel, low_range, binsize, ex = stats.relfreq( map(float, sample_one), args.b, (mf, nf)) for list in rel: cols.append(list) cols.append(low_range) cols.append(binsize) cols.append(ex) elif test_id.strip() == "binned_statistic": if nf == 0 and mf == 0: st, b_edge, b_n = stats.binned_statistic( map(float, sample_one), map(float, sample_two), statistic=args.statistic, bins=args.b, ) else: st, b_edge, b_n = stats.binned_statistic( map(float, sample_one), map(float, sample_two), statistic=args.statistic, bins=args.b, range=(mf, nf), ) cols.append(st) cols.append(b_edge) cols.append(b_n) elif test_id.strip() == "threshold": if nf == 0 and mf == 0: o = stats.threshold(map(float, sample_one), newval=args.new) else: o = stats.threshold(map(float, sample_one), mf, nf, newval=args.new) for list in o: cols.append(list) elif test_id.strip() == "trimboth": o = stats.trimboth(map(float, sample_one), proportiontocut=args.proportiontocut) for list in o: cols.append(list) elif test_id.strip() == "trim1": t1 = stats.trim1( map(float, sample_one), proportiontocut=args.proportiontocut, tail=args.tail, ) for list in t1: cols.append(list) elif test_id.strip() == "histogram": if nf == 0 and mf == 0: hi, low_range, binsize, ex = stats.histogram( map(float, sample_one), args.b) else: hi, low_range, binsize, ex = stats.histogram( map(float, sample_one), args.b, (mf, nf)) cols.append(hi) cols.append(low_range) cols.append(binsize) cols.append(ex) elif test_id.strip() == "cumfreq": if nf == 0 and mf == 0: cum, low_range, binsize, ex = stats.cumfreq( map(float, sample_one), args.b) else: cum, low_range, binsize, ex = stats.cumfreq( map(float, sample_one), args.b, (mf, nf)) cols.append(cum) cols.append(low_range) cols.append(binsize) cols.append(ex) elif test_id.strip() == "boxcox_normmax": if nf == 0 and mf == 0: ma = stats.boxcox_normmax(map(float, sample_one)) else: ma = stats.boxcox_normmax(map(float, sample_one), (mf, nf), method=args.method) cols.append(ma) elif test_id.strip() == "boxcox": if imbda == 0: box, ma, ci = stats.boxcox(map(float, sample_one), alpha=args.alpha) cols.append(box) cols.append(ma) cols.append(ci) else: box = stats.boxcox(map(float, sample_one), imbda, alpha=args.alpha) cols.append(box) elif test_id.strip() == "histogram2": h2 = stats.histogram2(map(float, sample_one), map(float, sample_two)) for list in h2: cols.append(list) elif 
test_id.strip() == "ranksums": z_statistic, p_value = stats.ranksums(map(float, sample_one), map(float, sample_two)) cols.append(z_statistic) cols.append(p_value) elif test_id.strip() == "ttest_1samp": t, prob = stats.ttest_1samp(map(float, sample_one), map(float, sample_two)) for list in t: cols.append(list) for list in prob: cols.append(list) elif test_id.strip() == "ansari": AB, p_value = stats.ansari(map(float, sample_one), map(float, sample_two)) cols.append(AB) cols.append(p_value) elif test_id.strip() == "linregress": slope, intercept, r_value, p_value, stderr = stats.linregress( map(float, sample_one), map(float, sample_two)) cols.append(slope) cols.append(intercept) cols.append(r_value) cols.append(p_value) cols.append(stderr) elif test_id.strip() == "pearsonr": cor, p_value = stats.pearsonr(map(float, sample_one), map(float, sample_two)) cols.append(cor) cols.append(p_value) elif test_id.strip() == "pointbiserialr": r, p_value = stats.pointbiserialr(map(float, sample_one), map(float, sample_two)) cols.append(r) cols.append(p_value) elif test_id.strip() == "ks_2samp": d, p_value = stats.ks_2samp(map(float, sample_one), map(float, sample_two)) cols.append(d) cols.append(p_value) elif test_id.strip() == "mannwhitneyu": mw_stats_u, p_value = stats.mannwhitneyu( map(float, sample_one), map(float, sample_two), use_continuity=args.mwu_use_continuity, ) cols.append(mw_stats_u) cols.append(p_value) elif test_id.strip() == "zmap": z = stats.zmap(map(float, sample_one), map(float, sample_two), ddof=args.ddof) for list in z: cols.append(list) elif test_id.strip() == "ttest_ind": mw_stats_u, p_value = stats.ttest_ind(map(float, sample_one), map(float, sample_two), equal_var=args.equal_var) cols.append(mw_stats_u) cols.append(p_value) elif test_id.strip() == "ttest_rel": t, prob = stats.ttest_rel(map(float, sample_one), map(float, sample_two), axis=args.axis) cols.append(t) cols.append(prob) elif test_id.strip() == "mood": z, p_value = stats.mood(map(float, sample_one), map(float, sample_two), axis=args.axis) cols.append(z) cols.append(p_value) elif test_id.strip() == "shapiro": W, p_value, a = stats.shapiro(map(float, sample_one), map(float, sample_two), args.reta) cols.append(W) cols.append(p_value) for list in a: cols.append(list) elif test_id.strip() == "kendalltau": k, p_value = stats.kendalltau( map(float, sample_one), map(float, sample_two), initial_lexsort=args.initial_lexsort, ) cols.append(k) cols.append(p_value) elif test_id.strip() == "entropy": s = stats.entropy(map(float, sample_one), map(float, sample_two), base=args.base) cols.append(s) elif test_id.strip() == "spearmanr": if sample2 == 1: rho, p_value = stats.spearmanr(map(float, sample_one), map(float, sample_two)) else: rho, p_value = stats.spearmanr(map(float, sample_one)) cols.append(rho) cols.append(p_value) elif test_id.strip() == "wilcoxon": if sample2 == 1: T, p_value = stats.wilcoxon( map(float, sample_one), map(float, sample_two), zero_method=args.zero_method, correction=args.correction, ) else: T, p_value = stats.wilcoxon( map(float, sample_one), zero_method=args.zero_method, correction=args.correction, ) cols.append(T) cols.append(p_value) elif test_id.strip() == "chisquare": if sample2 == 1: rho, p_value = stats.chisquare(map(float, sample_one), map(float, sample_two), ddof=args.ddof) else: rho, p_value = stats.chisquare(map(float, sample_one), ddof=args.ddof) cols.append(rho) cols.append(p_value) elif test_id.strip() == "power_divergence": if sample2 == 1: stat, p_value = stats.power_divergence( map(float, 
sample_one), map(float, sample_two), ddof=args.ddof, lambda_=args.lambda_, ) else: stat, p_value = stats.power_divergence(map(float, sample_one), ddof=args.ddof, lambda_=args.lambda_) cols.append(stat) cols.append(p_value) elif test_id.strip() == "theilslopes": if sample2 == 1: mpe, met, lo, up = stats.theilslopes(map(float, sample_one), map(float, sample_two), alpha=args.alpha) else: mpe, met, lo, up = stats.theilslopes(map(float, sample_one), alpha=args.alpha) cols.append(mpe) cols.append(met) cols.append(lo) cols.append(up) elif test_id.strip() == "combine_pvalues": if sample2 == 1: stat, p_value = stats.combine_pvalues( map(float, sample_one), method=args.med, weights=map(float, sample_two), ) else: stat, p_value = stats.combine_pvalues(map(float, sample_one), method=args.med) cols.append(stat) cols.append(p_value) elif test_id.strip() == "obrientransform": ob = stats.obrientransform(*b_samples) for list in ob: elements = ",".join(map(str, list)) cols.append(elements) elif test_id.strip() == "f_oneway": f_value, p_value = stats.f_oneway(*b_samples) cols.append(f_value) cols.append(p_value) elif test_id.strip() == "kruskal": h, p_value = stats.kruskal(*b_samples) cols.append(h) cols.append(p_value) elif test_id.strip() == "friedmanchisquare": fr, p_value = stats.friedmanchisquare(*b_samples) cols.append(fr) cols.append(p_value) elif test_id.strip() == "fligner": xsq, p_value = stats.fligner(center=args.center, proportiontocut=args.proportiontocut, *b_samples) cols.append(xsq) cols.append(p_value) elif test_id.strip() == "bartlett": T, p_value = stats.bartlett(*b_samples) cols.append(T) cols.append(p_value) elif test_id.strip() == "levene": w, p_value = stats.levene(center=args.center, proportiontocut=args.proportiontocut, *b_samples) cols.append(w) cols.append(p_value) elif test_id.strip() == "median_test": stat, p_value, m, table = stats.median_test( ties=args.ties, correction=args.correction, lambda_=args.lambda_, *b_samples) cols.append(stat) cols.append(p_value) cols.append(m) cols.append(table) for list in table: elements = ",".join(map(str, list)) cols.append(elements) outfile.write("%s\n" % "\t".join(map(str, cols))) outfile.close()
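# Note: main() above calls a columns_to_values() helper that is not part of
# this excerpt. The sketch below is an assumption about its behaviour, not the
# tool's actual implementation: each ';'-separated group of 1-based column
# indices is turned into a list of float values taken from one tab-separated
# input line, and the resulting groups are what gets unpacked as *b_samples
# into the multi-sample tests (bartlett, levene, kruskal, fligner, ...).
def columns_to_values(column_groups, line):
    cols = line.strip().split("\t")
    samples = []
    for group in column_groups:
        # group is an iterable of 1-based column indices, e.g. [1, 2, 5]
        samples.append([float(cols[int(i) - 1]) for i in group])
    return samples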
def test_mood():
    # numbers from R: mood.test in package stats
    x1 = np.arange(5)
    assert_array_almost_equal(stats.mood(x1, x1**2),
                              (-1.3830857299399906, 0.16663858066771478), 11)
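# For reference, recent SciPy releases (1.7 and later) also expose an
# `alternative` keyword on stats.mood; the two-sided default is what the R
# reference values above correspond to. A quick standalone check (a sketch,
# assuming SciPy >= 1.7 is installed):
import numpy as np
from scipy import stats

x1 = np.arange(5)
z, p = stats.mood(x1, x1**2, alternative="two-sided")
print(z, p)  # approximately -1.38309 and 0.16664, as in mood.test(x1, x1^2) in R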
def test_moodTest_zResult(self):
    # The z statistic from mood_test (the implementation under test) should
    # match the one returned by scipy.stats.mood on the same data.
    data_1 = np.random.randint(0, 100, 1000)
    data_2 = np.random.normal(0, 100, 1000)
    z1, p1 = mood_test(data_1, data_2)
    z2, p2 = mood(data_1, data_2)
    assert pytest.approx(z2) == z1
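# The comparison above draws unseeded random data, so a failure would not be
# reproducible. A seeded, self-contained variant along these lines could be
# used instead (a sketch; mood_test is assumed to be the project's own Mood
# implementation, and the p-value is checked as well as the z statistic):
import numpy as np
import pytest
from scipy.stats import mood


def check_mood_against_scipy(mood_test, seed=1234):
    rng = np.random.RandomState(seed)
    data_1 = rng.randint(0, 100, 1000)
    data_2 = rng.normal(0, 100, 1000)
    z1, p1 = mood_test(data_1, data_2)
    z2, p2 = mood(data_1, data_2)
    assert z1 == pytest.approx(z2)
    assert p1 == pytest.approx(p2)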