def ANM_predict_causality(self, train_size=0.5, independence_criterion='HSIC', metric='linear'):
    '''
    Predict the causal direction between X and Y with a bivariate additive
    noise model.

    A Gaussian-process regressor is fitted in both directions (X -> Y and
    Y -> X) on a training split; the residuals on the held-out split are then
    tested for independence from the regressor input.  The direction with the
    larger independence p-value is reported.

    Parameters
    ----------
    train_size : float
        Passed to ``train_test_split`` for the train/test split.
    independence_criterion : str
        ``'kruskal'`` for the Kruskal-Wallis H-test, ``'HSIC'`` for the
        Hilbert-Schmidt Independence Criterion.
    metric : str
        Currently unused; kept for interface compatibility.

    Returns
    -------
    dict
        ``'causal_direction'`` (1 if X causes Y, -1 if Y causes X),
        ``'pvalscore'`` (p-value of the selected direction) and ``'difways'``
        (absolute difference between the two p-values).  The first two are
        also stored on ``self``.

    Raises
    ------
    KeyError
        If ``independence_criterion`` is not one of the supported names.
    '''
    # Resolve the test once so that only the requested criterion is computed
    # (the previous dict-of-results form evaluated both tests every time).
    independence_tests = {
        'kruskal': kruskal,
        'HSIC': self.HilbertSchmidtNormIC,
    }
    independence_test = independence_tests[independence_criterion]

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(self.X, self.Y, train_size=train_size)

    def _gp_residuals(cause_train, effect_train, cause_test, effect_test):
        # Fit effect = f(cause) + noise and return the held-out residuals.
        _gp = pyGPs.GPR()
        _gp.getPosterior(cause_train, effect_train)
        _gp.optimize(cause_train, effect_train)
        ym, ys2, fm, fs2, lp = _gp.predict(cause_test)
        return ym - effect_test

    # Forward case: X -> Y
    errors_forward = _gp_residuals(Xtrain, Ytrain, Xtest, Ytest)
    # Backward case: Y -> X
    errors_backward = _gp_residuals(Ytrain, Xtrain, Ytest, Xtest)

    # Independence score: p-value of "residuals independent of the input".
    forward_indep_pval = independence_test(errors_forward, Xtest)[1]
    backward_indep_pval = independence_test(errors_backward, Ytest)[1]

    # NOTE(review): the original author left a remark that this comparison
    # "should be <"; the comparison is kept as written -- confirm the
    # intended orientation against the independence test in use.
    if forward_indep_pval > backward_indep_pval:
        self.causal_direction = 1
        self.pvalscore = forward_indep_pval
    else:
        self.causal_direction = -1
        self.pvalscore = backward_indep_pval

    return {
        'causal_direction': self.causal_direction,
        'pvalscore': self.pvalscore,
        'difways': abs(forward_indep_pval - backward_indep_pval),
    }
def gene_kruskal(dataframe, grouping, gene, just_tumors=False):
    """Summarise ``gene`` per group and test for differences between groups.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table holding both the grouping column and the ``gene`` column.
    grouping : str
        Column (or anything ``DataFrame.groupby`` accepts) defining the groups.
    gene : str
        Column whose values are compared.
    just_tumors : bool
        Currently unused; kept for interface compatibility.

    Returns
    -------
    (statsummary, pvalues)
        ``statsummary`` maps each group key to the ``describe()`` statistics
        of ``gene``.  ``pvalues`` maps ``"Kruskal-Wallis"`` to the omnibus
        p-value and ``"<a> vs <b>"`` to the corrected pairwise p-value.
    """
    statsummary = {}
    values = {}
    groups = []

    # Summarize statistics by group
    for key, group in dataframe.groupby(grouping):
        groups.append(key)
        statsummary[key] = group[gene].describe().to_dict()
        values[key] = group[gene].tolist()

    # Omnibus Kruskal-Wallis across all groups
    pvalues = {}
    h, p = stats.kruskal(*[values[key] for key in values])
    pvalues["Kruskal-Wallis"] = p

    # Independent Kruskal-Wallis tests between every pair of groups
    k_pvalues = {}
    for first, left in enumerate(groups):
        for right in groups[first + 1:]:
            # format() instead of "+" so non-string group keys work too
            label = "{} vs {}".format(left, right)
            h, p = stats.kruskal(values[left], values[right])
            k_pvalues[label] = p

    # Benjamini-Hochberg style correction: Pcorrected = (Poriginal * n) / k
    # http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3263024/
    # The result is capped at 1.0 because a p-value above 1 is meaningless.
    n_tests = len(k_pvalues)
    ranked_labels = sorted(k_pvalues, key=k_pvalues.__getitem__)
    for rank, label in enumerate(ranked_labels, start=1):
        pvalues[label] = min(1.0, (k_pvalues[label] * n_tests) / rank)

    return statsummary, pvalues
def kruskal_scipy_stats_tidy_df_wrapper(tidy_df, indep_var="sample_id", dep_var="mean_intensity"):
    """Run a Kruskal-Wallis H-test on a tidy (long-format) DataFrame.

    The rows are grouped by ``indep_var`` and the ``dep_var`` values of each
    group form one sample of the test.

    Parameters
    ----------
    tidy_df : pandas.DataFrame
        Long-format table.
    indep_var : str
        Column defining the groups.
    dep_var : str
        Column holding the measured values.

    Returns
    -------
    The result of ``stats.kruskal``: the H statistic (corrected for ties)
    and the p-value.
    """
    row_labels_per_group = tidy_df.groupby(indep_var).groups
    samples = []
    for row_labels in row_labels_per_group.values():
        samples.append(tidy_df.loc[row_labels, dep_var].values)
    return stats.kruskal(*samples)
def test(phenotypes, phen_dict, exp_d, phenotype_datatype, use_parametric): assert phenotype_datatype in ['binary', 'categorical', 'continuous'], 'Phenotype with unsupported data type' assert not(phenotype_datatype == 'binary' and len(phen_dict) > 2), \ 'Phenotype data type is binary but phenotype has more than two unique values' group_data = {} phen_arr = np.array(phenotypes) for phen in phen_dict: group_data[phen] = exp_d[:, phen_arr == phen] n_genes = exp_d.shape[0] z = np.zeros((n_genes, 2), dtype=np.float) for gene_index in range(n_genes): args = [np.transpose(group_data[phen][gene_index]) for phen in phen_dict] if use_parametric: if phenotype_datatype == 'binary': z[gene_index] = stats.ttest_ind(*args, equal_var=False) elif phenotype_datatype == 'categorical': z[gene_index] = stats.f_oneway(*args) else: if phenotype_datatype == 'binary': z[gene_index] = stats.mannwhitneyu(*args, alternative='two-sided') elif phenotype_datatype == 'categorical': z[gene_index] = stats.kruskal(*args) z1 = z[:, 1] z1[np.isnan(z1)] = 1 z1 = [np.nan_to_num(v) for v in z1] return z1
def get_num_p_value(obs_list) -> tuple:
    """Choose and run a significance test for numeric observation groups.

    With two groups a t-test (if ``check_norm_distribute`` accepts the data)
    or the Wilcoxon rank-sum test is used; otherwise one-way ANOVA or the
    Kruskal-Wallis H-test.

    Parameters
    ----------
    obs_list : sequence of sequences
        One sequence of observations per group.

    Returns
    -------
    tuple
        ``(result, p_value)`` where ``result`` is a display string such as
        ``'0.0123* (T-test)'`` and ``p_value`` is the raw p-value.  (The
        previous ``-> str`` annotation did not match the returned tuple.)
    """
    is_normal = check_norm_distribute(*obs_list)
    if len(obs_list) == 2:
        if is_normal:
            method = 'T-test'
            _, p_value = ttest_ind(obs_list[0], obs_list[1])
        else:
            method = 'Wilcoxon rank-sum'
            _, p_value = ranksums(obs_list[0], obs_list[1])
    else:
        if is_normal:
            method = 'One-way ANOVA'
            _, p_value = f_oneway(*obs_list)
        else:
            method = 'Kruskal-Wallis H-test'
            _, p_value = kruskal(*obs_list)

    # Significance marker: '*' for [0.01, 0.05), '**' below 0.01.
    sig_rank = ''
    if 0.01 <= p_value < 0.05:
        sig_rank = '*'
    elif p_value < 0.01:
        sig_rank = '**'

    if p_value < 0.0001:
        result = '<0.0001{} ({})'.format(sig_rank, method)
    else:
        result = '{:.4f}{} ({})'.format(float(p_value), sig_rank, method)
    return result, p_value
def snapshots(data, indices, basepath=None, data_label='data'):
    """Save a histogram comparison for each consecutive pair of ``indices``.

    For every pair ``(start, stop)`` the columns ``data[:, start]`` and
    ``data[:, stop]`` are drawn as overlaid histograms, annotated with the
    Euclidean norm of their difference and a Kruskal-Wallis p-value, and
    written to ``<data_label>-compare-<start>-<stop>.png``.

    Parameters
    ----------
    data : array indexed as ``data[:, column]``
    indices : sequence of int
        Column indices; consecutive entries are compared.
    basepath : str or None
        Output directory.  ``None`` (the default) now means the current
        working directory instead of raising inside ``os.path.join``.
    data_label : str
        Used for the x-axis label and the file name.
    """
    output_dir = '' if basepath is None else basepath

    for start_idx, stop_idx in zip(indices, indices[1:]):
        initial_distribution = data[:, start_idx]
        final_distribution = data[:, stop_idx]

        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.hist(initial_distribution, color='r', alpha=0.5, bins=20, label='Initial', range=(-1, 1))
        ax.hist(final_distribution, color='k', alpha=0.5, bins=20, label='Final', range=(-1, 1))
        artist.adjust_spines(ax)
        ax.set_xlabel(artist.format(data_label))
        ax.set_ylabel(artist.format('Prevalence'))

        H, p = kruskal(initial_distribution, final_distribution)
        effect_size = np.linalg.norm(final_distribution - initial_distribution)
        # Raw string: same text as before, without invalid escape sequences.
        ax.annotate(r'\Large $d=%.02f, \; p=%.04f$' % (effect_size, p),
                    xy=(.3, .9), xycoords='axes fraction',
                    horizontalalignment='right', verticalalignment='top')
        plt.tight_layout()
        plt.legend(frameon=False)

        filename = os.path.join(output_dir, '%s-compare-%d-%d.png' % (data_label, start_idx, stop_idx))
        plt.savefig(filename, dpi=300)
        plt.close()
def _is_drifting(self): y_pred_range = len(self._new_values) n_steps = self._config.n_steps new_dist = np.concatenate([self._train.y.values[-n_steps+y_pred_range:], self._new_values.y.values]) old_dist = self._train.y.values[-self._yearly_freq+y_pred_range:-self._yearly_freq+y_pred_range+n_steps] print("Length's: " + str(len(new_dist)) + ", " + str(len(old_dist))) _new = np.concatenate([self._train.index.values[-n_steps+y_pred_range:], self._new_values.index.values]) _old = self._train.index.values[-self._yearly_freq+y_pred_range:-self._yearly_freq+y_pred_range+n_steps] print("Ranges: " + str(min(_new)) + " - " + str(max(_new)) + ", " + str(min(_old)) + " - " + str(max(_old))) stat, p = kruskal(old_dist, new_dist) if self._config.verbose > 1: print('Statistics=%.3f, p=%.3f' % (stat, p)) alpha = 0.05 # TODO: add in class as param if p > alpha: if self._config.verbose > 1: print('Same distributions (fail to reject H0)') return False else: if self._config.verbose > 1: print('Different distributions (reject H0)') return True
def solve(problem, cloning_param, mutation):
    """Run the cloning algorithm repeatedly, plot the runs and test them.

    ``problem`` and ``mutation`` are factories: they are called as
    ``problem(dim)`` and ``mutation(mut_pb, 20)``.  The algorithm is run
    ``repetitions`` times (``dim``, ``mut_pb`` and ``repetitions`` are read
    from module scope); the values recorded by the ``DataObserver`` of every
    run are shown as a box plot and compared with a Kruskal-Wallis test
    followed by Dunn's post-hoc test on the first, second and last columns
    of the transposed data.
    """
    final_data = []
    final_problem = problem(dim)
    final_mutation = mutation(mut_pb, 20)

    for x in range(repetitions):
        algorithm = CloneAlg(
            problem=final_problem,
            population_size=100,
            offspring_population_size=100,
            mutation=final_mutation,
            cloning_param=cloning_param,
            termination_criterion=StoppingByEvaluations(max_evaluations=5000))
        data = []
        dataobserver = DataObserver(1.0, data)
        algorithm.observable.register(observer=dataobserver)
        algorithm.run()
        final_data.append(data)

    # One row per recorded step, one column per repetition.
    trans_list = np.array(final_data).T.tolist()

    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_axes([0, 0, 1, 1])
    ax.boxplot(trans_list)
    plt.title(
        "Problem: {0} benchmark, dim: {1}, cloning_param: {2}, mutation: {3}".
        format(final_problem.get_name(), dim, algorithm.get_cloning_param(),
               final_mutation.get_name()))
    plt.show()

    # Kruskal-Wallis and Dunn tests
    compared = [trans_list[0], trans_list[1], trans_list[-1]]
    print(stats.kruskal(*compared))
    # The Dunn result used to be computed and silently discarded.
    print(sp.posthoc_dunn(compared, p_adjust='holm'))
def save_mni_kruskall_table_csv(csv_path: str, experiments_list: List[ExperimentLoader], alpha=0.01):
    """Write one Kruskal-Wallis verdict mark per snapshot to ``csv_path``.

    For every snapshot index the corresponding column of each experiment's
    matrix (``get_mni_matrix()``) is compared with a Kruskal-Wallis test.
    The output string holds one character per snapshot:

    * ``\\u2714`` -- ``p <= alpha``, the null hypothesis is rejected;
    * ``\\u2716`` -- the null hypothesis cannot be rejected;
    * ``\\u2592`` -- the test raised ``ValueError`` (e.g. all values equal).

    The string is written to ``csv_path`` and printed.
    """
    n_snp = experiments_list[0].dataset['snapshot_count']
    mni_exp_list = [exp.get_mni_matrix() for exp in experiments_list]

    marks = []
    for i in range(n_snp):
        data = [nmi[:, i] for nmi in mni_exp_list]
        try:
            _, p = kruskal(*data)
        except ValueError:
            # if all the values are the same then don't reject the null hypothesis
            marks.append("\u2592")
            continue
        # reject the null hypothesis (samples differ) when p <= alpha
        marks.append("\u2714" if p <= alpha else "\u2716")
    empty_str = "".join(marks)

    # Explicit UTF-8: the marks are not representable in many default codecs.
    with open(csv_path, 'w', encoding='utf-8') as f:
        f.write(empty_str)
    print(empty_str)
def _save_default_kruskall_hypothesis_text(
        text_path: str, experiments_matrix: List[List[ExperimentLoader]],
        labels: List[str], datasets: List[str], alpha: float,
        data_method: Callable[[List[ExperimentLoader]], List[np.array]]):
    """Write one Kruskal-Wallis verdict mark per dataset row to ``text_path``.

    Each row of ``experiments_matrix`` is turned into samples by
    ``data_method`` and compared with a Kruskal-Wallis test.  The output
    string holds one character per row:

    * ``\\u2714`` -- ``p <= alpha``, the null hypothesis is rejected;
    * ``\\u2716`` -- the null hypothesis cannot be rejected;
    * ``\\u2592`` -- the test raised ``ValueError`` (e.g. all values equal).

    The string is printed and written to ``text_path``.
    """
    assert len(experiments_matrix) == len(
        datasets
    ), "first dimension of experiment matrix must have the same length as datasets"
    assert len(experiments_matrix[0]) == len(
        labels
    ), "second dimension of experiment matrix must have the same length as labels"

    marks = []
    for exp_list in experiments_matrix:
        data = data_method(exp_list)
        try:
            _, p = kruskal(*data)
        except ValueError:
            # if all the values are the same then don't reject the null hypothesis
            marks.append("\u2592")
            continue
        # reject the null hypothesis (samples differ) when p <= alpha
        marks.append("\u2714" if p <= alpha else "\u2716")
    empty_str = "".join(marks)

    print(empty_str)
    # Explicit UTF-8: the marks are not representable in many default codecs.
    with open(text_path, 'w', encoding='utf-8') as f:
        f.write(empty_str)
def run_stats(input_df):
    """Run a Kruskal-Wallis H test with a Conover post-hoc test.

    Kruskal-Wallis is the non-parametric analogue of one-way ANOVA.  The
    Conover test describes the pairwise relationships between columns; note
    that post-hoc results should only be used when the omnibus test is
    significant.

    Every column except ``'huc8'`` is treated as one sample.  Infinite
    values (of either sign) are treated as missing.

    Returns
    -------
    ``(H, p, conover)`` on success, or ``None`` when there is no usable data
    or the post-hoc test fails (the error is printed).
    """
    # Treat +/-inf as missing first, so that columns holding nothing but
    # inf/NaN are dropped together with the all-NaN ones.
    input_df = input_df.replace([np.inf, -np.inf], np.nan)
    input_df = input_df.dropna(axis=1, how='all')
    if input_df.isnull().all().all():
        return None

    # Reformat the value columns into arrays to pass to the stats functions.
    value_columns = [column for column in input_df.columns if not column == 'huc8']
    data = [input_df[column].to_numpy() for column in value_columns]

    H, p = stats.kruskal(*data, nan_policy='omit')

    try:
        conover = sp.posthoc_conover(data, p_adjust='holm')
        # Label with the tested columns only; including 'huc8' here made the
        # label count disagree with the result matrix.
        conover.columns = value_columns
        conover.index = value_columns
        return H, p, conover
    except Exception as e:
        print('Error is: ', e)
        return None
def summarize_he( analytical_sets ):
    """Collect the ``calculate_he`` output of every analytical set and compare them.

    Two sets are compared with the paired Wilcoxon test, more than two with
    the Kruskal-Wallis test.  The returned dict always holds ``'data'`` (one
    column per set label), ``'mean'`` and ``'stddev'``; ``'test'`` and
    ``'stats'`` are present only when a test was run.
    """
    he_by_label = {}
    for aset in analytical_sets:
        he_by_label[aset.label] = calculate_he(aset.allele_df)

    he_df = DataFrame( he_by_label )
    labels = list(he_df.columns)
    n_sets = len(labels)

    results = {}
    if n_sets == 2:
        # paired comparison of exactly two sets
        first, second = labels
        results['test'] = 'Wilcoxon test (paired)'
        results['stats'] = wilcoxon( he_df[first], he_df[second] )
    elif n_sets > 2:
        # several sets: Kruskal-Wallis
        results['test'] = 'Kruskal-Wallis test'
        results['stats'] = kruskal( *[he_df[label] for label in labels] )
        results['warning'] = ''

    results['data'] = he_df
    results['mean'] = he_df.mean()
    results['stddev'] = he_df.std()
    return results
def test_KW(df, control, stats_table, col='logdwell'): kw = [] pv_ori = [] pv = [] pv1 = [] L = len(np.sort(df.pos.unique())) x = np.linspace(0, L, L) for pos in range(0, L): indx = df['pos'] == pos df_indx = df[indx] kmer = df_indx['kmer'].iloc[0] indx = control['kmer'] == kmer df_control_indx = control[indx] if len(df_control_indx) > 0: df_indx_dwell = df_indx[col] df_indx_dwell.reset_index(drop=True, inplace=True) df_control_indx_dwell = df_control_indx[col] df_control_indx_dwell.reset_index(drop=True, inplace=True) kw_results = stats.kruskal(df_indx_dwell, df_control_indx_dwell) kw.append(kw_results[0]) pv_ori.append(kw_results[1]) else: kw.append(0) pv_ori.append(0) stats_table['KW_' + col] = kw print(stats_table.head()) return stats_table
def rankTest(arg):
    """Return rank-test p-values for samples 1, 2 and 3 of ``data[arg]``.

    The list holds, in order: the Kruskal-Wallis p-value across all three
    samples, then the Mann-Whitney U p-values for the pairs (1, 2), (1, 3)
    and (2, 3).
    """
    entry = data[arg]
    first, second, third = entry[1], entry[2], entry[3]
    return [
        stats.kruskal(first, second, third)[1],
        stats.mannwhitneyu(first, second)[1],
        stats.mannwhitneyu(first, third)[1],
        stats.mannwhitneyu(second, third)[1],
    ]
def kruskal2df(a, b):
    """Kruskal-Wallis test of ``b`` grouped by the labels in ``a``.

    Parameters
    ----------
    a : array-like
        Group label of every observation.
    b : array-like
        Observations, same length as ``a``.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with columns ``'stat'`` and ``'pval'``.
    """
    assert len(a) == len(b)
    # Convert up front so lists and pandas Series work like NumPy arrays
    # (plain lists do not support the element-wise comparison below).
    a = np.asarray(a)
    b = np.asarray(b)
    b_lst = [b[a == label] for label in np.unique(a)]
    test = stats.kruskal(*b_lst)
    res = pd.DataFrame({'stat': test[0], 'pval': test[1]}, index=[0])
    return res
def kruskal_test(benchmark_snapshot_df):
    """Return the Kruskal-Wallis p-value of 'edges_covered' across fuzzers."""
    coverage_by_fuzzer = benchmark_snapshot_df.groupby('fuzzer')['edges_covered']
    samples = [list(coverage) for _, coverage in coverage_by_fuzzer]
    return ss.kruskal(*samples)[1]
def conover_inman_procedure(data, alpha=0.05):
    """Pairwise Conover-Iman comparison of the columns of ``data``.

    Two columns differ significantly when the absolute difference of their
    mean ranks exceeds

        t(1 - alpha/2, N - k) * sqrt(S^2 * (N - 1 - H) / (N - k) * 2 / n)

    where ``n`` is the number of rows, ``k`` the number of columns,
    ``N = n * k``, ``H`` the Kruskal-Wallis statistic and ``S^2`` the
    variance of the pooled ranks.

    Parameters
    ----------
    data : pandas.DataFrame
        One column per algorithm, one row per run (no missing values).
    alpha : float
        Two-sided significance level.

    Returns
    -------
    pandas.DataFrame
        Square table indexed by the column names; entry ``[i, j]`` is True
        when columns ``i`` and ``j`` differ significantly.  The diagonal is
        left unset (NaN).
    """
    num_runs = len(data)
    num_algos = len(data.columns)
    N = num_runs * num_algos

    samples = [np.asarray(data[col], dtype=float) for col in data.columns]
    # The formula needs the H statistic, not the p-value.
    H, _ = stats.kruskal(*samples)

    ranked = stats.rankdata(np.concatenate(samples))
    mean_ranks = [
        np.sum(ranked[num_runs * i: num_runs * (i + 1)]) / float(num_runs)
        for i in range(num_algos)
    ]

    S_sq = (np.sum(ranked ** 2) - N * ((N + 1) ** 2) / 4.0) / (N - 1)
    dof = N - num_algos
    # max() guards against a tiny negative value from rounding when H ~ N - 1.
    variance_term = max(0.0, S_sq * ((N - 1 - H) / dof) * 2.0 / num_runs)
    # ppf gives the t quantile; cdf (used previously) returns a probability.
    right_side = stats.t.ppf(1 - (alpha / 2), dof) * math.sqrt(variance_term)

    res = pd.DataFrame(columns=data.columns, index=data.columns)
    for i, j in itertools.combinations(range(num_algos), 2):
        differs = bool(abs(mean_ranks[i] - mean_ranks[j]) > right_side)
        res.iloc[j, i] = differs
        res.iloc[i, j] = differs
    return res
def getStats(tData, datasetLabels, param, labels, pNormMin, verbose=False):
    """Build a table of omnibus and multiple-comparison p-values.

    For every entry of ``tData`` (a collection of samples) each sample is
    checked with ``stats.normaltest``.  If any sample's normality p-value is
    below ``pNormMin`` the Kruskal-Wallis test and ``getKWmultiComp`` are
    used, otherwise one-way ANOVA and ``getOWANOVAmultiComp``.

    Parameters
    ----------
    tData : sequence of sequences of samples
    datasetLabels : sequence of str
        The first three entries name the control and two experimental sets.
    param : str
        Parameter name used in the row labels.
    labels : sequence of str
        One label per entry of ``tData``.
    pNormMin : float
        Normality threshold below which the non-parametric test is chosen.
    verbose : bool
        Forwarded to the multiple-comparison helpers.

    Returns
    -------
    list of lists
        Rows of the statistics table, starting with two header rows.
    """
    c = datasetLabels[0]
    e1 = datasetLabels[1]
    e2 = datasetLabels[2]
    statsData = []
    statsData.append(['Test and Parameter', 'p-Value', 'p-Value', 'p-Value'])
    statsData.append(
        ['', c + ' vs. ' + e1, c + ' vs. ' + e2, e1 + ' vs. ' + e2])

    for i in range(len(tData)):
        label = '---' + param + '_' + labels[i] + '---'
        print(label)

        normP = []
        for sample in tData[i]:
            _, pValue = stats.normaltest(sample)
            normP.append(pValue)

        if min(normP) < pNormMin:
            testUsed = 'Kruskal-Wallis test'
            _, statsP = stats.kruskal(*tData[i])
            print(testUsed + ' pValue:', statsP, '---')
            multiCompP = getKWmultiComp(tData[i], datasetLabels, verbose)
        else:
            testUsed = 'One Way ANOVA'
            _, statsP = stats.f_oneway(*tData[i])
            print(testUsed + ' pValue:', statsP)
            multiCompP = list(
                getOWANOVAmultiComp(tData[i], datasetLabels, verbose))

        statsData.append([label])
        statsData.append(['normalityTestStats'] + normP)
        statsData.append([testUsed, statsP])
        statsData.append(['MultipleComparisons p-Value'] + multiCompP)
        statsData.append([])
    return statsData
def rank_sum(x, targets, method='ranksum', cutoff=.05):
    """Test every column of ``x`` for a difference between two target classes.

    Parameters
    ----------
    x : pandas.DataFrame
        One feature per column, one sample per row.
    targets : sequence
        Class of every row.  Strings are converted to 1.0 for
        ``'Recurrer'`` and 0.0 otherwise; numeric targets are used as-is
        (1 marks the first group, 0 the second).
    method : str
        ``'ranksum'``, ``'kruskal'`` or ``'ttest'``.
    cutoff : float
        Currently unused; the Benjamini-Hochberg call uses ``alpha=.05``.

    Returns
    -------
    pandas.DataFrame
        Indexed by feature with columns ``'P_Val'``, ``'BH corrected'`` and
        ``'t-stat'``, sorted by ascending p-value.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method not in ('ranksum', 'kruskal', 'ttest'):
        raise ValueError("unknown method %r; expected 'ranksum', 'kruskal' or 'ttest'" % (method,))

    if isinstance(targets[0], str):
        targets = (np.array(targets) == 'Recurrer').astype('float')
    else:
        targets = np.array(targets)

    values = np.array(x)
    pval = []
    teststat = []
    for i in range(x.shape[1]):
        xin = values[:, i]
        X = xin[targets == 1]
        Y = xin[targets == 0]
        if method == 'ranksum':
            s, p = st.ranksums(X, Y)
        elif method == 'kruskal':
            try:
                s, p = st.kruskal(X, Y)
            except ValueError:
                # Degenerate input (e.g. all values identical): count it as
                # "no evidence" and record an undefined statistic rather
                # than reusing the previous column's value.
                s, p = np.nan, 1
        else:
            s, p = st.ttest_ind(X, Y)
        pval.append(p)
        teststat.append(s)

    pval = np.array(pval)
    pval[np.isnan(pval)] = 1
    reject, corrected, a1, a2 = multipletests(pval, alpha=.05, method='fdr_bh')
    df = pd.DataFrame(np.vstack((pval, corrected, teststat)).T,
                      columns=['P_Val', 'BH corrected', 't-stat'],
                      index=x.columns.values)
    return df.sort_values('P_Val', ascending=True)
def test_kruskalWallis_hResult(self):
    """The H statistic of ``kruskal_wallis_test`` matches the reference ``kruskal``."""
    samples = (
        [27, 2, 4, 18, 7, 9],
        [20, 8, 14, 36, 21, 22],
        [34, 31, 3, 23, 30, 6],
    )
    own_h, own_p = kruskal_wallis_test(*samples)
    reference_h, reference_p = kruskal(*samples)
    assert pytest.approx(own_h) == reference_h
def extractAssessmentResultOfCommunities(community, assessment, column): result = [] for cSize in community: extractedResult = [] groups = [] # normTest = [] for c in cSize: temp = assessment.loc[assessment.index.isin(c)] extractedResult.append((temp[column].mean(), temp[column].std())) # if len(cSize) == 8: # k2, p = stats.normaltest(temp[column]) # normTest.append((k2, p)) groups.append(temp[column]) if len(groups) == 5: f, p = f_oneway( groups[0], groups[1], groups[2], groups[3], groups[4] ) #, groups[5], groups[6] , groups[7] )#, groups[8], groups[9]) # ,groups[10], groups[11], groups[12], groups[13], groups[14] , groups[15], groups[16], groups[17], groups[18], groups[19]) L, pL = stats.levene( groups[0], groups[1], groups[2], groups[3], groups[4] ) # , groups[5], groups[6] , groups[7])#, groups[8], groups[9]) #,groups[10], groups[11], groups[12], groups[13], groups[14] , groups[15], groups[16], groups[17], groups[18], groups[19]) fk, pk = stats.kruskal( groups[0], groups[1], groups[2], groups[3], groups[4] ) #, groups[5], groups[6] , groups[7])#, groups[8], groups[9]) #,groups[10], groups[11], groups[12], groups[13], groups[14] , groups[15], groups[16], groups[17], groups[18], groups[19]) result.append([ len(cSize), extractedResult, (f, p), (L, pL), (fk, pk), groups ]) else: result.append([len(cSize), extractedResult, groups]) return result
def non_parametric_tests(self, data1, data2, test_type):
    """Run the named non-parametric test on two samples and print a verdict.

    H0: both samples come from the same distribution.  The verdict is
    "same" when ``p > 0.05`` and "different" otherwise.

    ``test_type`` is one of ``'mannwhitneyu'``, ``'wilcoxon'``, ``'kruskal'``
    or ``'friedmanchisquare'``; any other value does nothing.
    """
    # Pick the test lazily so only the requested function name is resolved.
    if test_type == 'mannwhitneyu':
        run_test = mannwhitneyu
    elif test_type == 'wilcoxon':
        run_test = wilcoxon
    elif test_type == 'kruskal':
        run_test = kruskal
    elif test_type == 'friedmanchisquare':
        run_test = friedmanchisquare
    else:
        return

    stat, p = run_test(data1, data2)
    if p > 0.05:
        print('Probably the same distribution')
    else:
        print('Probably different distributions')
def test_once(df_orig, df_impute, test='wilcoxon'): ''' Input: df_orig: The original dataset with missing value df_impute: The dataset after imputation test: The statistics test used Output: A numpy array containing the p-values of the tests on each column in the column order ''' cols = df_orig.columns pvals = np.array([]) if test == 'wilcoxon': for c in cols: try: stat, pval = wilcoxon(df_orig[c], df_impute[c]) pvals = np.append(pvals, pval) except: pvals = np.append(pvals, 0) if test == 'kruskal': for c in cols: stat, pval = kruskal(df_orig[c], df_impute[c], nan_policy='omit') pvals = np.append(pvals, pval) return pvals
def statistic_tests(name_of_file):
    """Read two data columns from ``name_of_file`` and print a battery of tests.

    The rows returned by ``read_file`` are distributed over two lists by
    ``fill_tables``; each test result is then printed under its heading.
    """
    first_column = []
    second_column = []
    fill_tables(first_column, second_column, read_file(name_of_file))

    def report_two_sample(heading, run):
        # Heading first, then the result of the two-column test.
        print(heading)
        print(run(first_column, second_column))

    print('Rank-Sum')
    print('ranksum column 1:', rank_sum(first_column), 'column 2:', rank_sum(second_column))

    # Lambdas keep every test name unresolved until its turn comes.
    report_two_sample('Kruskal', lambda a, b: kruskal(a, b))
    report_two_sample('ANOVA', lambda a, b: f_oneway(a, b))
    report_two_sample('Brunner', lambda a, b: brunnermunzel(a, b))
    report_two_sample('Whitney', lambda a, b: mannwhitneyu(a, b))
    report_two_sample('Barlet', lambda a, b: barlet_test(a, b))
    report_two_sample('Levene', lambda a, b: levene_test(a, b))

    print('Shapiro')
    print('shapiro column 1:', shapiro(first_column), 'column 2:', shapiro(second_column))

    report_two_sample('T-Student', lambda a, b: ttest_ind(a, b))

    print('Lilliefors')
    print('liliefors', 'column 1:', lilliefors(first_column), 'column 2:', lilliefors(second_column))
def _kruskal_wallis_test(table, response_cols, factor_col, nan_policy='propagate'):
    """Run a Kruskal-Wallis test of every response column across a factor.

    Parameters
    ----------
    table : pandas.DataFrame
    response_cols : list of str
        Columns to test, one test per column.
    factor_col : str
        Column whose values define the groups.
    nan_policy : str
        Forwarded to ``kruskal`` (it was previously accepted but ignored).
        The default matches SciPy's own default.

    Returns
    -------
    dict
        ``{'result': result}`` where ``result`` maps
        ``'<response>_<factor>'`` to its ``'Statistics'`` and ``'P value'``
        and ``'_repr_brtc_'`` to the rendered report.
    """
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Kruskal Wallis test Result""")

    groups = dict()
    for name, group in table.groupby(factor_col):
        groups[name] = group

    group_name = []
    df = [len(groups) - 1] * len(response_cols)
    stats = []
    pvals = []
    for response_col in response_cols:
        stat, pval = kruskal(*[x[response_col] for x in groups.values()],
                             nan_policy=nan_policy)
        group_name.append(response_col + ' by ' + factor_col)
        stats.append(stat)
        pvals.append(pval)

        name = response_col + '_' + factor_col
        result[name] = dict()
        result[name]['Statistics'] = stat
        result[name]['P value'] = pval

    rb.addMD(strip_margin("""
    | {table}
    """.format(table=pandasDF2MD(pd.DataFrame({'': group_name,
                                               'Degree of Freedom': df,
                                               'Test Statistics': stats,
                                               'P value': pvals})))))
    result['_repr_brtc_'] = rb.get()
    return {'result': result}
def print_KruskalWallisH(div_calc):
    """
    Compute the Kruskal-Wallis H-test for independent samples and print the
    number of groups and the p-value.

    ``div_calc`` is a sequence of samples, one per group.
    """
    h, p = stats.kruskal(*div_calc)
    # Function-call form with a single argument prints identically under
    # Python 2 and Python 3.
    print("Kruskal-Wallis H-test for {} groups:".format(str(len(div_calc))))
    print("p-value: {}".format(p))
def corr_categorical(Pred, Data_col, alpha, _Gaussian=False):
    """Test whether ``Pred`` differs between the categories of ``Data_col``.

    Parameters
    ----------
    Pred : pandas.Series
        Numeric values, indexed like ``Data_col``.
    Data_col : pandas.Series
        Categorical values; missing entries form their own ``'None'``
        category.  The caller's Series is no longer modified.
    alpha : float
        Significance level.
    _Gaussian : bool
        If True the (not yet implemented) ANOVA path is taken and 0 is
        returned.

    Returns
    -------
    int
        1 if the Kruskal-Wallis p-value is below ``alpha``, else 0.
    """
    if _Gaussian:
        # Normally distributed <--- ANOVA
        print('WIP : Anova test not implemented')
        return 0

    # Work on a filled copy instead of mutating the argument in place.
    categories = Data_col.fillna(value='None')

    dic_tags = {}
    for tag in categories.unique():
        tag_index = categories.index[categories == tag].tolist()
        # Label-based lookup; ``.ix`` was removed from pandas.
        dic_tags[tag] = Pred.loc[tag_index].values

    # Otherwise <--- Kruskal Wallis test
    kruskal_res = stats.kruskal(*dic_tags.values())
    return 1 if kruskal_res.pvalue < alpha else 0
def select_features(self, data, labels):
    """
    Pick the features (column indices) with the smallest Kruskal-Wallis p-values.

    The samples are split into three classes by ``group_by_classifier`` and
    every column is tested across those classes.  This test assumes that
    the compared groups have the same distribution.

    :param data: MxN matrix containing features as columns, and samples as rows
    :param labels: Mx1 matrix containing corresponding data labels
    :return: indices of the five features with the lowest p-values
    """
    num_features = 5
    class_samples = group_by_classifier(data, labels)
    her2_samples, hr_samples, trip_neg_samples = class_samples

    n_columns = data.shape[1]
    p_values = np.zeros((n_columns))
    for index in range(n_columns):
        try:
            outcome = stats.kruskal(her2_samples[:, index],
                                    hr_samples[:, index],
                                    trip_neg_samples[:, index])
            p_values[index] = outcome[1]
        except ValueError:
            # the test could not be computed for this column
            p_values[index] = 1

    # Multiple-testing correction left no significant variables, so the
    # lowest raw p-values are used for now.
    ranked = np.argsort(p_values)
    return np.asarray(ranked[0:num_features])
def summarize_moi(analytical_sets):
    """Compute the ``calculate_moi`` result per analytical set and compare them.

    Returns ``(moi_sets, stats)``: ``moi_sets`` maps each set label to its
    ``calculate_moi`` result; ``stats`` holds ``'test'`` and ``'stats'``
    when at least two sets are available and is empty otherwise.
    """
    moi_sets = {}
    for aset in analytical_sets:
        moi_sets[aset.label] = calculate_moi(aset.allele_df)

    # The data are not normally distributed, so only rank-based
    # (non-parametric) tests are used.
    stats = {}
    n_sets = len(moi_sets)
    if n_sets >= 2:
        values = [moi.sample_dist['MOI'] for moi in moi_sets.values()]
        if n_sets == 2:
            # Mann-Whitney / Wilcoxon rank-sum (non-paired) test
            stats['test'] = 'Wilcoxon ranksum / Mann-Whitney U-test'
            stats['stats'] = ranksums(*values)
        else:
            # Kruskal-Wallis for more than two sets
            stats['test'] = 'Kruskal-Wallis H-test'
            stats['stats'] = kruskal(*values)

    return (moi_sets, stats)
def kruskal_wallis(df, cat_col, num_col, notebook=True):
    """
    Perform a Kruskal-Wallis test of ``num_col`` across the groups of
    ``cat_col``, followed by Dunn's post-hoc test (Bonferroni correction)
    when the result is significant at 0.05.

    :param df: DataFrame holding both columns
    :param cat_col: name of the categorical (grouping) column
    :param num_col: name of the continuous column; NaN values are ignored
    :param notebook: if True, results are also printed
    :return: text report of the test results
    """
    variables = []
    for idx, cat_group in df.groupby(cat_col):
        # NAN values not included in computation
        variables.append(cat_group[num_col][cat_group[num_col].notnull()])

    kruskal_h, kruskal_p = ss.kruskal(*variables)
    # This used to test the ``print`` builtin (always truthy) rather than
    # the ``notebook`` flag.
    if notebook:
        print(f"H-Value: {kruskal_h}, p-Value: {kruskal_p}")

    output = "\tTest: Kruskal-Wallis\n"
    output += f"\tH-Value: {kruskal_h}, p-Value: {kruskal_p}\n"

    if kruskal_p <= 0.05:
        output += "\tSignificance found \n"
        output += "\tPost-Hoc Tests: Dunns with Bonferonni Correction\n"
        # Remove nan values
        selector = df[cat_col].notnull() & df[num_col].notnull()
        posthoc_data = df[selector]
        posthoc_result = sp.posthoc_dunn(posthoc_data, num_col, cat_col, p_adjust="bonferroni")
        if notebook:
            print(posthoc_result)
        output += str(posthoc_result)
        output += "\n"
    return output
def fit(self, X, y=None):
    """Compute the feature correlation matrix and the feature removal order.

    Parameters
    ----------
    X : pandas.DataFrame or array-like, shape (n_samples, n_features)
        Sample vectors; anything that is not a DataFrame is wrapped in one.
    y : array-like
        Target passed to ``f_classif`` for ``score_func == 'f-score'``; for
        ``'h-score'`` it is used as a boolean row selector (``y`` / ``~y``).

    Returns
    -------
    self
        With ``correlation_matrix_`` (Pearson) set and, for the two
        supported score functions, ``order`` holding the columns sorted by
        decreasing score.
    """
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    self.correlation_matrix_ = X.corr('pearson')

    # Rank the features by their score, best first.
    if self.score_func == 'f-score':
        F, pval = f_classif(X, y)
        self.order = X.columns[np.argsort(F)[::-1]]
    elif self.score_func == 'h-score':
        h_stat = np.asarray(
            [kruskal(X.loc[y, col], X.loc[~y, col])[0] for col in X.columns]
        )
        self.order = X.columns[np.argsort(h_stat)[::-1]]
    return self
def ttestForTwoChoiceQuestions(xValues, yValues):
    """Compare two samples with a t-test when justified, else Kruskal-Wallis.

    The independent-samples t-test is used only when both samples pass
    ``isNormal`` and Levene's test does not reject equal variances
    (``p >= 0.05``); otherwise the Kruskal-Wallis H-test is used and its
    statistic is divided by 5 to bring it onto a scale closer to the
    t statistics.

    Returns
    -------
    (parametric, t, tp)
        Whether the parametric test was used, the statistic and the p-value.
    """
    npArrayX = np.array(xValues)
    npArrayY = np.array(yValues)

    # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.normaltest.html#scipy.stats.normaltest
    xIsNormal = isNormal(npArrayX)
    yIsNormal = isNormal(npArrayY)

    if xIsNormal and yIsNormal:
        # Levene test for equal variances
        # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.levene.html#scipy.stats.levene
        l, lp = stats.levene(npArrayX, npArrayY)
        # The original ``lp >- 0.05`` parsed as ``lp > -0.05`` and was
        # therefore always true, ignoring Levene's result.
        parametric = lp >= 0.05
    else:
        parametric = False

    if parametric:
        # samples are normal with equal variances: standard t-test
        # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html#scipy.stats.ttest_ind
        t, tp = stats.ttest_ind(xValues, yValues, axis=0)
    else:
        # otherwise use the Kruskal-Wallis H-test
        # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.kruskal.html#scipy.stats.kruskal
        t, tp = stats.kruskal(npArrayX, npArrayY)
        t = t / 5.0  # these come out bigger than the t-test stats
    return parametric, t, tp
def generate_violion_plots(plot_col, group_col, group_order, ax):
    """Draw one violin per group of ``plot_col`` and test the groups.

    Groups (taken in ``group_order``) with at most two non-missing values
    are skipped.  With exactly two groups a t-test is run and the sample
    sizes required for the module-level ``alpha`` / ``power`` are added to
    the labels; otherwise a Kruskal-Wallis test is run.

    Returns ``(pval, ax)``.
    """
    kept_groups = []
    samples = []
    means = []
    spreads = []
    for group in group_order:
        in_group = plot_col[group_col == group]
        observed = in_group.dropna()
        if len(observed) <= 2:
            continue
        kept_groups.append(group)
        samples.append(observed.copy().values)
        means.append(in_group.mean())
        spreads.append(in_group.std())

    if len(samples) == 2:
        ef = abs(np.diff(means))/(np.sum(spreads))
        ratio = len(samples[1])/len(samples[0])
        n0 = tt_ind_solve_power(effect_size=ef, alpha=alpha, power=power, ratio=ratio)
        sizes = [str(int(n0)), str(int(n0*ratio))]
        _, pval = ttest_ind(*samples)
    else:
        sizes = ['']*len(samples)
        _, pval = kruskal(*samples)

    labels = []
    for title, sample, needed in zip(kept_groups, samples, sizes):
        labels.append('%s n=%i/%s' % (title, len(sample), needed))
    violinplot(samples, ax=ax, labels=labels)
    return pval, ax
def get_relevance(feat, y_class, relevance_func='mutual_info'):
    """Score every feature column of ``feat`` against the class labels.

    Parameters
    ----------
    feat : array-like, shape (n_samples, n_features)
    y_class : array-like, shape (n_samples,)
    relevance_func : str or callable
        ``'f_classif'``, ``'chi2'`` or ``'mutual_info'`` (scikit-learn),
        ``'kruskal'`` (Kruskal-Wallis H statistic per feature, NaN when the
        test fails), or a callable invoked as
        ``relevance_func(column, y_class)`` for every column.

    Returns
    -------
    numpy.ndarray
        One relevance value per feature.

    Raises
    ------
    ValueError
        If ``relevance_func`` is an unknown string.
    """
    feat = np.array(feat)

    if not isinstance(relevance_func, str):
        relevance = np.zeros(feat.shape[1])
        for i in range(feat.shape[1]):
            relevance[i] = relevance_func(feat[:, i], y_class)
        return relevance

    # scikit-learn is imported only by the branches that need it, so the
    # 'kruskal' and callable paths work without it installed.
    if relevance_func == 'f_classif':
        from sklearn.feature_selection import f_classif
        relevance, _ = f_classif(feat, y_class)
    elif relevance_func == 'chi2':
        from sklearn.feature_selection import chi2
        relevance, _ = chi2(feat, y_class)
    elif relevance_func == 'mutual_info':
        from sklearn.feature_selection import mutual_info_classif
        relevance = mutual_info_classif(feat, y_class)
    elif relevance_func == 'kruskal':
        from scipy.stats import kruskal
        # An array is required for the element-wise ``labels == value``
        # masks; a plain list would compare as a whole.
        labels = np.asarray(y_class)
        classes = np.unique(labels)
        relevance = np.zeros(feat.shape[1])
        for i, ft in enumerate(feat.T):
            try:
                relevance[i], _ = kruskal(*[ft[labels == iy] for iy in classes])
            except Exception:
                relevance[i] = np.nan
    else:
        raise ValueError('unknown relevance_func: %r' % (relevance_func,))
    return relevance
def compare_conc_kruskal(odor):
    '''Do a Kruskal-Wallis test across the three concentrations of ``odor``
    within each treatment group, and print the three results.

    The concentration columns ``<odor>01``, ``<odor>05`` and ``<odor>10``
    are read from the module-level ``comp_sorted`` table; missing values
    are omitted.
    '''
    conc_cols = ['%s01' % odor, '%s05' % odor, '%s10' % odor]
    xdf = comp_sorted[['Group'] + conc_cols]

    def _kruskal_for(group_name):
        # Compare the three concentration columns within one group.
        rows = xdf[xdf['Group'] == group_name]
        return kruskal(*[rows[col] for col in conc_cols], nan_policy='omit')

    # Run all tests before printing, so a failure prints nothing partial.
    kctrl = _kruskal_for('Control')
    kmint = _kruskal_for('Mint')
    khex = _kruskal_for('Hexanal')

    # Single-argument print calls behave the same on Python 2 and 3.
    print('Control group')
    print(kctrl)
    print('Mint group')
    print(kmint)
    print('Hexanal group')
    print(khex)
def KW_test_diversity(array1, array2, array3=None):
    """Perform the Kruskal-Wallis test on two or three arrays and print it.

    Input:
    - array1: The first numpy array
    - array2: The second numpy array
    - array3: Optional, the third numpy array

    Output:
    - Prints the test statistic with its corresponding p-value
    """
    # Identity check: ``array3 != None`` compares element-wise on NumPy
    # arrays and cannot be used as a condition.
    if array3 is not None:
        print(stats.kruskal(array1, array2, array3))
    else:
        print(stats.kruskal(array1, array2))
def study_stability(datas, stable_threshold):
    """Keep only the features that are stable across the three datasets.

    Every column of ``datas["abs64"]`` is compared across ``"abs64"``,
    ``"abs128"`` and ``"abs256"`` with a Kruskal-Wallis test; a feature is
    stable when its p-value is above ``stable_threshold``.  Features whose
    test fails are reported and treated as unstable.

    Parameters
    ----------
    datas : dict of pandas.DataFrame
        Must hold the keys ``"abs64"``, ``"abs128"`` and ``"abs256"``; these
        three entries are replaced by their stable columns.
    stable_threshold : float

    Returns
    -------
    (datas, stable, kruskal)
        The updated dict, the list of stable feature names and a dict
        mapping every successfully tested feature to its p-value.
    """
    # Announced once (the banner used to be printed twice).
    print(
        f"[INFO] 0. Analysis of stable features using Kruskal-Wallis test:\n Each features that shows a p-value below {stable_threshold} for stability Kruskal test will be discarded"
    )
    columns = datas["abs64"].columns
    stable = []
    kruskal = {}
    for c in columns:
        print(f"[RUN] Running analysis for {c}")
        try:
            s, p = stats.kruskal(datas["abs64"][c], datas["abs128"][c],
                                 datas["abs256"][c])
        except Exception as err:
            # Still best-effort, but no longer silent.
            print(f"[SKIP] Kruskal-Wallis test failed for {c}: {err}")
            continue
        kruskal[c] = p
        if p > stable_threshold:
            stable.append(c)
    print(
        f"[STABILITY] {len(stable)}/{len(columns)} features passed the stability test."
    )
    for key in ("abs64", "abs128", "abs256"):
        datas[key] = datas[key][stable]
    return datas, stable, kruskal
def calc_p_values(data, gt1_name, gt2_name, stat_colname=None, num_bins=50, bin_how='mean', ):
    """Compare two groups bin by bin along the (integer) index of ``data``.

    The index range of the ``gt1_name`` rows is cut into ``num_bins`` equal
    bins.  Within each bin ``stat_colname`` is reduced per ``FlyID`` with
    ``bin_how`` and the per-fly values of the two groups are compared with
    a Kruskal-Wallis test (p = 1.0 when the test raises ``ValueError``).

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``'group'``, ``'FlyID'`` and ``stat_colname``.
        Note: its index is cast to int64 and a ``'synced_ns'`` column is
        added in place.
    gt1_name, gt2_name : str
        Values of ``'group'`` identifying the two groups.
    stat_colname : str
        Column to compare (required).
    num_bins : int
    bin_how : str
        ``'mean'`` or ``'median'``.

    Returns
    -------
    pandas.DataFrame
        One row per bin, indexed by the bin's left edge.

    Raises
    ------
    ValueError
        If ``stat_colname`` is missing or ``bin_how`` is not supported.
    """
    if stat_colname is None:
        raise ValueError("you must explicitly set stat_colname (try 'maxWingAngle')")
    if bin_how == 'mean':
        bin_func = np.mean
    elif bin_how == 'median':
        bin_func = np.median
    else:
        raise ValueError("bin_how must be 'mean' or 'median', got %r" % (bin_how,))

    # Drop timestamps for binning: work on integer index values.
    data.index = data.index.astype(np.int64)
    data['synced_ns'] = data.index

    wanted = ['FlyID', stat_colname, 'synced_ns']
    df_ctrl = data[data.group == gt1_name][wanted]
    df_exp = data[data.group == gt2_name][wanted]

    align_start = df_ctrl.index.min()
    dalign = df_ctrl.index.max() - align_start
    bins = np.linspace(0, dalign, num_bins + 1) + align_start
    binned_ctrl = pd.cut(df_ctrl.index, bins, labels=bins[:-1])
    binned_exp = pd.cut(df_exp.index, bins, labels=bins[:-1])

    def _per_fly(frame):
        # One aggregated value per fly.
        return np.array([bin_func(fly_group[stat_colname].values)
                         for obj_id, fly_group in frame.groupby('FlyID')])

    frames = []
    # ``categories`` replaces the removed ``Categorical.levels`` attribute.
    for x in binned_ctrl.categories:
        test1_full_dataset = df_ctrl[binned_ctrl == x]
        test2_full_dataset = df_exp[binned_exp == x]
        bin_start_time = test1_full_dataset['synced_ns'].min()
        bin_stop_time = test1_full_dataset['synced_ns'].max()

        test1 = _per_fly(test1_full_dataset)
        test2 = _per_fly(test2_full_dataset)
        try:
            hval, pval = kruskal(test1, test2)
        except ValueError:
            pval = 1.0

        frames.append(DataFrame({'Bin_number': x,
                                 'P': pval,
                                 'bin_start_time': bin_start_time,
                                 'bin_stop_time': bin_stop_time,
                                 'name1': gt1_name,
                                 'name2': gt2_name,
                                 'test1_n': len(test1),
                                 'test2_n': len(test2),
                                 }, index=[x]))

    # Concatenate once instead of growing the result frame per bin.
    if not frames:
        return DataFrame()
    return pd.concat(frames)
def kruskal_wallis(data):
    """
    Non-parametric test for several independent samples.

    ``data`` is an iterable of samples; each one is handed to
    ``st.kruskal`` as a separate group.  Returns the pair
    ``(H statistic, p-value)``.
    """
    outcome = st.kruskal(*data)
    statistic = outcome[0]
    p_value = outcome[1]
    return (statistic, p_value)
def kruskalWallisOnTables(tableSet1, tableSet2, idColumn, valueColumn):
    """
    Non parametric counterpart of :py:meth:`~emzed.stats.oneWayAnovaOnTables`:
    the same table driver is used, but each pair of samples is compared with
    the Kruskal-Wallis test and only its p-value is kept.
    """
    def _kruskal_pvalue(sample1, sample2):
        # second element of the kruskal result is the p-value
        return kruskal(sample1, sample2)[1]

    result = _runStatistcsOnTables(tableSet1, tableSet2, idColumn, valueColumn,
                                   _kruskal_pvalue)
    result.title = "KRUSKAL WALLIS ANALYSIS"
    return result
def _evalstat(x, bsl, meth, n_perm, metric, maxstat, tail): """Statistical evaluation of features [x] = [xn] = (nFce, npts, nTrials) [bsl] = (nFce, nTrials) """ # Get shape of xF : nf, npts, nt = x.shape pvalues = np.ones((nf, npts)) # Permutations : if meth == 'permutation': perm = perm_swaparray(a, b, n_perm=200, axis=-1, rndstate=0) from brainpipe.xPOO.stats import permutation # Pre-define permutations : pObj = permutation(n_perm) perm = np.zeros((n_perm, nf, npts)) # For each permutation : for p in range(n_perm): # Get 1D iterations : ite = product(range(nf), range(npts)) permT = np.random.permutation(2*nt) for f, pts in ite: bs, xs = bsl[f, :], x[f, pts, :] # Reshape data : subX = np.vstack((bsl[f, :], x[f, pts, :])).reshape(2*nt,) # Shuffle data : subX = subX[permT].reshape(nt, 2) # Normalize data : subX = normalize(subX[:, 0], subX[:, 1], norm=norm) # Get mean of data : perm[p, f, pts] = np.mean(subX) # Get final pvalues : pvalues = pObj.perm2p(np.mean(xn, 2), perm, tail=tail, maxstat=maxstat) # Wilcoxon test : elif meth == 'wilcoxon': from scipy.stats import wilcoxon # Get iterations : ite = product(range(nf), range(npts)) # Compute wilcoxon : for k, i in ite: _, pvalues[k, i] = wilcoxon(x[k, i, :], bsl[k, :]) # Kruskal-Wallis : elif meth == 'kruskal': from scipy.stats import kruskal # Get iterations : ite = product(range(nf), range(npts)) # Compute Kruskal-Wallis : for k, i in ite: _, pvalues[k, i] = kruskal(x[k, i, :], bsl[k, :]) return pvalues
def calc_p_values(_data,
                  stat_colname=None,
                  num_bins=50,
                  bin_how='mean',
                  ):
    """Compare every time bin of ``stat_colname`` against the baseline.

    The baseline is made of the rows with ``Time < 10.0``.  The full ``Time``
    range is cut into ``num_bins`` equal bins and the values falling in each
    bin are compared with the baseline by a Kruskal-Wallis H-test.

    Parameters
    ----------
    _data : DataFrame
        Must provide the columns ``Time`` and ``stat_colname``.
        NOTE: its index is replaced in place by the ``Time`` column.
    stat_colname : name of the column holding the statistic (required).
    num_bins : number of equal-width time bins.
    bin_how : accepted for signature compatibility; no per-bin reduction is
        applied in this function, so the value is ignored.

    Returns
    -------
    DataFrame with one row per bin (indexed by the bin's left edge) and the
    columns ``Bin_number``, ``P``, ``bin_start_time``, ``bin_stop_time``,
    ``name1``, ``name2``, ``test1_n`` and ``test2_n``.

    Raises
    ------
    ValueError
        If ``stat_colname`` is not given.
    """
    if stat_colname is None:
        raise ValueError("you must explicitly set stat_colname (try 'maxWingAngle')")

    _data.index = _data.Time  # LAZY DANNO. DROP TIMESTAMPS FOR BINNING.
    # Sort by time through positional indexing: `DataFrame.sort` no longer
    # exists in pandas, and sorting by label would be ambiguous because
    # 'Time' now names both the index and a column.
    order = np.argsort(_data['Time'].values, kind='mergesort')
    _data = _data.iloc[order]

    df_baseline = _data[_data['Time'] < 10.0]
    align_start = _data.Time.min()
    dalign = _data.Time.max() - align_start

    bins = np.linspace(0, dalign, num_bins + 1) + align_start
    binned_data = pd.cut(_data.index, bins, labels=bins[:-1])
    # `.levels` was renamed `.categories` in pandas; support both spellings.
    if hasattr(binned_data, 'categories'):
        levels = binned_data.categories
    else:
        levels = binned_data.levels

    baseline = df_baseline[stat_colname].values
    rows = []
    for bin_number, x in enumerate(levels):
        in_bin = binned_data == x
        test = np.array(_data.loc[in_bin, stat_colname])
        bin_start_time = x
        bin_stop_time = _data.loc[in_bin, 'Time'].max()
        try:
            _, pval = kruskal(baseline, test)
        except ValueError:
            # The test is undefined for this bin: report "no difference".
            pval = 1.0

        rows.append(DataFrame({'Bin_number': bin_number,
                               'P': pval,
                               'bin_start_time': bin_start_time,
                               'bin_stop_time': bin_stop_time,
                               'name1': 'baseline',
                               'name2': stat_colname,
                               'test1_n': len(baseline),
                               'test2_n': len(test),
                               }, index=[x]))

    if not rows:
        return DataFrame()
    # Concatenate once instead of growing the frame on every iteration.
    return pd.concat(rows)
def feature_kw(feature, data):
    """Run and print a Kruskal-Wallis H-test of one feature across groups.

    Parameters
    ----------
    feature : key used to pull the sample out of every group
        (``group[feature]``).
    data : mapping of group name -> group.  Any number of groups (at least
        two) is accepted; all of them take part in the test.

    Returns
    -------
    list of ``(group name, sample)`` tuples in the iteration order of
    ``data``.

    For every group the printed summary shows the median and the standard
    deviation of the sample, both multiplied by 1e3.
    """
    feature_list = [(key, group[feature]) for key, group in data.items()]
    # Every group is passed to the test, however many there are.
    h, p = stats.kruskal(*[sample for _, sample in feature_list])
    print('Kruskal-Wallace: %s' % feature)
    print('=============')
    for name, sample in feature_list:
        print('%s: %.3f +- %.3f' % (name, np.median(sample) * 1e3, np.std(sample) * 1e3))
    print('H value: %.3f' % h)
    print('P value: %.5f \n' % p)
    return feature_list
def testRelationCorrectIncorrect():
    """Kruskal-Wallis test of ``correct[i]`` against ``incorrect[i]``.

    Reads the module-level sequences ``correct`` and ``incorrect``.

    Returns
    -------
    P : array with one p-value per entry of ``correct``.
    P_discret : array of the same length with a significance level per entry:
        3 for P < 0.01, 2 for 0.01 <= P < 0.05, 1 for 0.05 <= P < 0.1 and
        0 otherwise (including NaN).
    """
    P = np.zeros(len(correct))
    for i, sample in enumerate(correct):
        # KS, p = stats.ks_2samp(correct[i], incorrect[i])
        P[i] = stats.kruskal(sample, incorrect[i])[1]

    # Assign from the loosest to the strictest threshold so that each level
    # overrides the previous one; p-values lying exactly on a threshold no
    # longer fall between two bins.
    P_discret = np.zeros(len(correct))
    P_discret[P < 0.1] = 1
    P_discret[P < 0.05] = 2
    P_discret[P < 0.01] = 3
    return P, P_discret
def anova(x, y):
    """Compare the ``y`` values across the levels found in ``x``.

    ``x`` and ``y`` are walked in lockstep and every ``y`` value is filed
    under its ``x`` level.  With fewer than two levels the tuple
    ``(0, 0, 0, 0)`` is returned; otherwise a list
    ``[F statistic, F p-value, H statistic, H p-value]`` holding the one-way
    ANOVA result followed by the Kruskal-Wallis result.
    """
    by_level = defaultdict(list)
    for level, observation in zip(x, y):
        by_level[level].append(observation)

    samples = by_level.values()
    if len(samples) < 2:
        return (0, 0, 0, 0)

    anova_part = list(f_oneway(*samples))
    try:
        kruskal_part = list(kruskal(*samples))
    except ValueError:
        # raised when all numbers are identical
        kruskal_part = [0, 0]
    return anova_part + kruskal_part
def snr(M, list1, list2, threshold = None, significance = False):
    """
    Performs a signal-to-noise ratio test on M, assuming samples are in rows
    and genes are in columns.

    The ratio of a gene is |mean1 - mean2| / (std1 + std2); it is 0 when the
    denominator is 0.

    list1 - List of row indices for first group
    list2 - List of row indices for second group
    threshold - Minimum SNR ratio to report
    significance - Run a Kruskal-Wallis test per reported gene (needs scipy;
                   only done when the module-level flag PVAL is true)

    Returns a reverse-ordered list of (ratio, index, mean1, mean2, pvalue)
    tuples, where index is the column index of the gene, and mean1 and mean2
    correspond to the mean for that particular gene in list1 and list2,
    respectively. pvalue is '' unless a significance test was run. Beware:
    this increases processing time significantly.
    """
    ratios = []

    N1 = M.take(tuple(list1), 0)
    N2 = M.take(tuple(list2), 0)
    N1mean, N2mean = N1.mean(0), N2.mean(0)
    means = numpy.abs(N1mean - N2mean)
    stds = N1.std(0) + N2.std(0)

    # Divide only where the denominator is non-zero; the other ratios stay 0.
    # The result keeps full float precision whether or not a zero is present.
    nonzero = stds != 0
    rats = numpy.zeros(means.shape, dtype=float)
    rats[nonzero] = means[nonzero] / stds[nonzero]

    for i in range(M.shape[1]):
        rat = rats[i]
        mean1, mean2 = N1mean[i], N2mean[i]
        if threshold is None or rat >= threshold:
            # `significance` is checked first so the PVAL flag is only
            # consulted when a test was actually requested.
            if significance and PVAL:
                pval = st.kruskal(N1[:, i], N2[:, i])[1]
            else:
                pval = ''
            ratios.append((rat, i, mean1, mean2, pval))

    ratios.sort(reverse=True)
    return ratios
def kruskal_p(hit_vec, response_vec, min_size=5):
    '''
    Wrapper to do a Kruskal-Wallis H-test on pandas Series
    ------------------------------------------------
    hit_vec: Series of labels
    response_vec: Series of measurements
    min_size: kept for signature compatibility; not used by this function

    The two Series are first passed through match_series; the measurements
    are then split by label and compared.  Returns the p-value, or nan when
    anything goes wrong along the way.
    '''
    try:
        hit_vec, response_vec = match_series(hit_vec, response_vec)
        samples = [response_vec[hit_vec == num] for num in hit_vec.unique()]
        return kruskal(*samples)[1]
    except Exception:
        # Best effort by design, but let KeyboardInterrupt / SystemExit
        # propagate instead of turning them into a nan.
        return nan
def ANM_causation_score(self,train_size=0.5,independence_criterion='HSIC',metric='linear',regression_method='GP'):
    '''
    Measure how likely a given causal direction is true

    self.X is regressed onto self.Y on a training split; the residuals on the
    held-out split are then tested for independence from the held-out X.

    Parameters
    ----------
    train_size :
        Fraction of given data used to training phase
    independence_criterion :
        kruskal for Kruskal-Wallis H-test,
        HSIC for Hilbert-Schmidt Independence Criterion
    metric :
        linear, sigmoid, rbf, poly
        kernel function to compute gramm matrix for HSIC
        gaussian kernel is used in :
        Nonlinear causal discovery with additive noise models
        Patrik O. Hoyer et. al
    regression_method :
        'GP' for a pyGPs Gaussian process regression; any other value uses
        a KernelRidge regressor with a sigmoid kernel

    Returns
    -------
    dict with the single key 'causal_strength' holding the p-value of the
    selected independence test

    Raises
    ------
    KeyError if independence_criterion is neither 'kruskal' nor 'HSIC'
    '''
    # Fail before the (expensive) regression; same exception type as the
    # former dictionary lookup.
    if independence_criterion not in ('kruskal', 'HSIC'):
        raise KeyError(independence_criterion)

    Xtrain, Xtest , Ytrain, Ytest = train_test_split(self.X, self.Y, train_size = train_size)
    if regression_method == 'GP':
        _gp = pyGPs.GPR()                 # specify model (GP regression)
        _gp.getPosterior(Xtrain, Ytrain)  # fit default model with data
        _gp.optimize(Xtrain, Ytrain)      # optimize hyperparameters
        # Forward case: only the predictive mean is needed
        ym, ys2, fm, fs2, lp = _gp.predict(Xtest)
        errors_forward = ym - Ytest
    else:
        _gp = KernelRidge(kernel='sigmoid')
        _gp.fit(Xtrain, Ytrain)
        errors_forward = _gp.predict(Xtest) - Ytest

    # Independence score: run only the requested test, so the other one can
    # neither cost time nor raise.
    if independence_criterion == 'kruskal':
        forward_indep_pval = kruskal(errors_forward,Xtest)[1]
    else:
        forward_indep_pval = self.HilbertSchmidtNormIC(errors_forward,Xtest,metric=metric)[1]
    return {'causal_strength':forward_indep_pval}
def kruskal_pandas(hit_vec, response_vec, min_size=5):
    '''
    Wrapper to do a Kruskal-Wallis H-test on pandas Series
    ------------------------------------------------
    hit_vec: Series of labels
    response_vec: Series of measurements
    min_size: kept for signature compatibility; not used by this function

    The two Series are first passed through _match_series; the measurements
    are then split by label and compared.  Returns a Series indexed by
    ['H', 'p']; both entries are NaN when anything goes wrong along the way.
    '''
    try:
        hit_vec, response_vec = _match_series(hit_vec, response_vec)
        samples = [response_vec[hit_vec == num] for num in hit_vec.unique()]
        res = stats.kruskal(*samples)
        return pd.Series(res, index=['H','p'])
    except Exception:
        # Best effort by design, but let KeyboardInterrupt / SystemExit
        # propagate.  The fallback is an explicit float NaN Series so that it
        # has the same dtype as a successful result.
        return pd.Series(float('nan'), index=['H','p'])
def plot_stats( groupedData, fig_prefix, cutoff='baseline', **kwargs):
    """
    Plot -log10(p) against time for every group and save the figure.

    groupedData = output from flymad_jaaba_v6.py (rawdata_**s.pickle), with
                  synced_time column representing seconds, grouped by 'group'.
    fig_prefix = full path and filename (without extension) of plot name.
    cutoff = 'baseline': Kruskal-Wallis test of every time bin in (0, 360]
                 against the values with synced_time <= 0;
             'zero': one-sample t-test of every time bin against 0.
    **kwargs = accepted but not used.

    Reads the module-level names `parameter`, `args.binsize` and
    `colourlist`.  Writes <fig_prefix>_<parameter>.png and .svg.

    Returns the {time: p-value} dict of the LAST group iterated (an empty
    dict when there is no group).
    """
    fig = plt.figure(figsize=(4,3))
    ax = fig.add_subplot(111)

    # list() so that .index() also works where keys() is a view
    group_names = list(groupedData.groups.keys())
    pvalue_results = {}
    for GROUP, data in groupedData:
        colour = colourlist[group_names.index(GROUP)]
        pvalue_results = {}
        window = data[(data.synced_time > 0) & (data.synced_time <= 360)]
        if cutoff == 'baseline':
            ax.set_title('Kruskal Wallis: '+parameter+' vs baseline:', fontsize=12)
            baseline = data[data.synced_time <=0][parameter].values
            for time, _data in window.groupby('synced_time'):
                pvalue_results[time*args.binsize] = st.kruskal(baseline, _data[parameter])[1]
        elif cutoff == 'zero':
            # The title names the test that is actually run.
            ax.set_title('One-sample t-test: '+parameter+' vs zero:', fontsize=12)
            for time, _data in window.groupby('synced_time'):
                pvalue_results[time*args.binsize] = st.ttest_1samp(_data[parameter], 0)[1]

        pvalue_results = {k: v for k, v in pvalue_results.items() if not isnan(v)}
        ax.scatter(list(pvalue_results.keys()),
                   -np.log10(list(pvalue_results.values())),
                   label=GROUP, color=colour, linewidth=0)

        if len(pvalue_results)>=1:
            # Bonferroni-corrected 0.05 threshold
            n_comparisons = len(pvalue_results)
            ax.axhline( -np.log10(0.05/n_comparisons), color='k', lw=0.5, linestyle='--' )

    ax.set_xlim(0,360)
    ax.set_ylim(0,8)#1.1*max(-np.log10(pvalue_results.values())))
    ax.set_xlabel('Time (s)', fontsize=12)
    ax.set_ylabel('-Log10(P)', fontsize=12)

    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')
    l = plt.legend()
    l.set_zorder(1000)
    plt.tick_params(axis='both', which='major', labelsize=12)
    plt.tight_layout()
    for ext in ['.png','.svg']:
        fig_fname = fig_prefix + '_'+parameter + ext
        # the savefig keyword is `bbox_inches`, not `bbox`
        fig.savefig(fig_fname, bbox_inches='tight')
        print('saved %s' % fig_fname)
    return pvalue_results
def print_KruskalWallisH(div_calc):
    """
    Compute the Kruskal-Wallis H-test for independent samples. A typical rule
    is that each group must have at least 5 measurements.

    div_calc is a mapping of group -> mapping of sample -> value; the values
    of each group form one sample of the test.

    Prints the H statistic and the p-value and returns None.  If the input
    cannot be walked as a two-level mapping, nothing is printed and an error
    message string is returned instead.
    """
    calc = defaultdict(list)
    try:
        # .items() instead of .iteritems() so this runs on Python 3 as well
        for k1, v1 in div_calc.items():
            for k2, v2 in v1.items():
                calc[k1].append(v2)
    except Exception:
        return "Error setting up input arrays for Kruskal-Wallis H-Test. Skipping "\
               "significance testing."
    h, p = stats.kruskal(*calc.values())
    print("\nKruskal-Wallis H-test statistic for {} groups: {}".format(str(len(div_calc)), h))
    print("p-value: {}".format(p))
def show_drinking_behavior(basepath=None,compare_distributions=True,
        visualize_one_random_actor=False, visualize_all_actors=True):
    """Plot the contents of <basepath>/drinking-behavior.txt.

    The file is read with np.loadtxt (TAB delimited) into a 2-D array that is
    plotted as actors (rows) against time (columns).

    basepath : directory holding the data file and receiving the figures
        (required in practice despite the None default).
    compare_distributions : save overlaid histograms of the INITIAL and END
        columns as drinking-behavior-compare-distributions.png.
    visualize_one_random_actor : save the trace of one randomly chosen row as
        drinking-behavior-visualize-actor.png.
    visualize_all_actors : save a heat map of the whole array as
        drinking-behavior-visualize-all-actors.png.

    Figures are written under os.getcwd()/basepath at 300 dpi.  Returns None.
    """
    filename = os.path.join(basepath,'drinking-behavior.txt')
    drinking_behavior = np.loadtxt(filename,delimiter=TAB)

    if compare_distributions:
        fig = plt.figure()
        ax = fig.add_subplot(111)
        initial_distribution = drinking_behavior[:,INITIAL]
        final_distribution = drinking_behavior[:,END]

        # Common range so that both histograms share the same bins
        low = min(initial_distribution.min(),final_distribution.min())
        high = max(initial_distribution.max(),final_distribution.max())

        ax.hist(initial_distribution,color='r',alpha=0.5,bins=20,label='Initial',range=(low,high))
        ax.hist(final_distribution,color='k',alpha=0.5,bins=20,label='Final', range=(low,high))
        artist.adjust_spines(ax)
        ax.set_xlabel(artist.format('Intent to drink'))
        ax.set_ylabel(artist.format('Prevalence'))
        plt.legend(frameon=False)
        filename = os.path.join(os.getcwd(),basepath,'drinking-behavior-compare-distributions.png')
        plt.savefig(filename,dpi=300)

    if visualize_one_random_actor:
        fig = plt.figure()
        ax = fig.add_subplot(111)
        # range() instead of xrange() so this runs on Python 3 as well
        random_actor = random.choice(range(drinking_behavior.shape[0]))
        ax.plot(drinking_behavior[random_actor,:],'k--',linewidth=2)
        artist.adjust_spines(ax)
        ax.set_ylabel(artist.format('Past drinking behavior'))
        ax.set_xlabel(artist.format('Time'))
        filename = os.path.join(os.getcwd(),basepath,'drinking-behavior-visualize-actor.png')
        plt.savefig(filename,dpi=300)

    if visualize_all_actors:
        fig = plt.figure()
        ax = fig.add_subplot(111)
        cax = ax.imshow(drinking_behavior,interpolation='nearest',aspect='auto')
        artist.adjust_spines(ax)
        ax.set_ylabel(artist.format('Actor'))
        ax.set_xlabel(artist.format('Time'))
        plt.colorbar(cax)
        filename = os.path.join(os.getcwd(),basepath,'drinking-behavior-visualize-all-actors.png')
        plt.savefig(filename,dpi=300)
def stats_pairwise(_dataset, _column, _within, _between):
    """Pairwise Kruskal-Wallis tests between groups, inside every subset.

    For each distinct value of ``_dataset[_within]`` the matching rows are
    grouped by ``_between``; the ``_column`` values of every pair of groups
    (pairs come from ``get_pairs`` applied to the group positions) are then
    compared.

    Returns a DataFrame with one row per tested pair and the columns
    ``within`` (subset value), ``between`` (tuple of the two group names),
    ``measure`` (``_column``) and ``p`` (uncorrected p-value).  The frame is
    empty when there is nothing to test.
    """
    frames = []
    for grp in list(set(_dataset[_within])):
        df = _dataset[_dataset[_within] == grp]
        # Walk the group-by once, keeping each name next to its sample.
        named_samples = list(df.groupby(_between)[_column])
        p_vals = []
        pairs = []
        for pair in get_pairs(range(len(named_samples))):
            first_name, first_sample = named_samples[pair[0]]
            second_name, second_sample = named_samples[pair[1]]
            T, P = ss.kruskal(first_sample, second_sample)
            p_vals.append(P)
            pairs.append((first_name, second_name))
        frames.append(pd.DataFrame({'within':grp, 'between':pairs, 'measure':_column, 'p':p_vals}))
    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of growing the frame on every iteration.
    return pd.concat(frames, axis=0)
def calc_kruskal(x, sample_num_l, alpha):
    """Kruskal-Wallis test of one row, with pairwise post-hoc tests.

    Parameters
    ----------
    x : list whose first item is an id and whose remaining items are the
        measurements; they are cut into samples by
        ``split_list(x[1:], sample_num_l)``.
    sample_num_l : description of the samples, one entry per sample.
    alpha : significance level for the global test and for the FDR
        correction of the post-hoc tests.

    Returns
    -------
    ``x`` extended by two strings: the global p-value and a flag.  The flag
    is '1' only when the global test is significant AND every pairwise
    two-sided Mann-Whitney U test is still rejected after Benjamini-Hochberg
    correction.  When the global test cannot be computed (ValueError or NaN)
    the two strings are '1.00' and '0'.
    """
    tmp_input_l = split_list(x[1:],sample_num_l) #ignore id column
    try:
        h,p = stats.kruskal(*tmp_input_l) #run kruskal-wallis test
    except ValueError:
        return x+['1.00','0']
    if math.isnan(p):
        return x+['1.00','0']

    flag = 0
    if p < alpha:
        num = len(sample_num_l)
        pval_l = []
        for i in range(num-1):
            for j in range(i+1, num):
                try:
                    # Ask for the two-sided p-value explicitly rather than
                    # doubling whatever the default alternative returns.
                    tmp_p = stats.mannwhitneyu(tmp_input_l[i], tmp_input_l[j],
                                               alternative='two-sided')[1]
                except ValueError:
                    # test not computable for this pair: no evidence at all
                    tmp_p = 1.0
                pval_l.append(min(tmp_p, 1.0))
        rej = smm.multipletests(pval_l, alpha=alpha, method='fdr_bh')[0] # fdr correction
        flag = 1 if all(rej) else 0

    # repr(float(...)) replaces the Python 2 backtick syntax and keeps a plain
    # numeric text whatever float type the test returned.
    return x+[repr(float(p)), str(flag)]
def KruskalWallis(data):
    '''Kruskal-Wallis comparison of the weights of the three groups.

    data must give access to a 'weight' and a 'group' column; the groups
    'Control', 'TreatmentA' and 'TreatmentB' are compared and the p-value
    is printed.  Nothing is returned.
    '''

    print('\n Kruskal-Wallis test ----------------------------------------------------')

    # Select the weights of each group with a boolean mask on the group
    # labels; control first, then the two treatments.
    weights = data['weight']
    labels = data['group']
    samples = [weights[labels == name]
               for name in ('Control', 'TreatmentA', 'TreatmentB')]

    # Run the test and report its p-value (second item of the result)
    outcome = stats.kruskal(*samples)
    print('Result from Kruskal-Wallis test: p = {0}'.format(outcome[1]))
def KruskalWallis(data):
    """Print the Kruskal-Wallis p-value for the three groups' weights.

    The 'weight' values of the rows labelled 'Control', 'TreatmentA' and
    'TreatmentB' in the 'group' column are compared non-parametrically.
    Output goes to stdout only; the function returns None.
    """

    print("\n Kruskal-Wallis test ----------------------------------------------------")

    def weights_of(label):
        # weights of the rows whose group equals `label`
        return data["weight"][data["group"] == label]

    treatment_a = weights_of("TreatmentA")
    treatment_b = weights_of("TreatmentB")
    control = weights_of("Control")

    # The p-value is the second item of the test result
    p_value = stats.kruskal(control, treatment_a, treatment_b)[1]
    print("Result from Kruskal-Wallis test: p = {0}".format(p_value))
def compare_feature_groups(fg1,fg2,variance=False,name='Comparison'):
    """Compare two samples with three tests and print their p-values.

    fg1, fg2 : the two samples.
    variance : forwarded to ``stats.ttest_ind`` as ``equal_var``; with the
        default False the t-test does not assume equal variances.
    name : label shown in the printed header.

    Prints a small table with the p-values of the t-test, the Kruskal-Wallis
    test and the rank-sum test.

    Returns True when the t-test p-value is above 0.05, False otherwise.
    """
    ttest = stats.ttest_ind(fg1,fg2,equal_var = variance)
    ktest = stats.kruskal(fg1,fg2)
    rktest = stats.ranksums(fg1,fg2)
    temp = '''
    Stats Comparsion [{1}]
    ----------------------------------------------
    Tests | P-Value
    ----------------------------------------------
    Student-T | {0}
    Kruskal | {2}
    RankSum | {3}
    '''
    # print() call instead of the Python 2 print statement
    print(temp.format(ttest[1],name,ktest[1],rktest[1]))
    return ttest[1] > 0.05
def non_para(data,var,cat,method='Wilcoxon'):
    """Do non-parametric test comparing values for a given variable (specified
    by argument 'var') between data grouped by a given category (specified by
    argument 'cat'); data can be a pandas DataFrame or a dictionary.

    There are two method options: 'Wilcoxon' (rank-sum test) and 'Kruskal'.
    For any other method 'No such method' is printed and None is returned.

    Otherwise a dictionary of two DataFrames indexed by category on both axes
    is returned: 'p' contains the p-value of the test rounded to 3 decimals
    (and therefore is symmetric), 'med_diff' contains the difference between
    the medians of the two categories (row minus column)."""
    tests = {'Wilcoxon': stats.ranksums, 'Kruskal': stats.kruskal}
    test = tests.get(method)
    if test is None:
        # Reject an unknown method once, before doing any work.
        print('No such method')
        return

    cats = list(set(data[cat]))
    # Extract each category's values once instead of for every pair.
    samples = [data[var][data[cat]==c] for c in cats]
    p_value = np.zeros((len(cats),len(cats)))
    diff = np.zeros((len(cats),len(cats)))
    for i1, sample1 in enumerate(samples):
        for i2, sample2 in enumerate(samples):
            p_value[i1,i2] = round(test(sample1,sample2)[1],3)
            diff[i1,i2] = sample1.median()-sample2.median()
    p_value = pd.DataFrame(p_value,index=cats,columns=cats)
    diff = pd.DataFrame(diff,index=cats,columns=cats)
    result = {'p':p_value,'med_diff':diff}
    return result