def anova(dataname, nparray1, nparray2):
    """Run a Kruskal-Wallis H-test on two samples and print a one-line summary.

    When ``nparray1`` has more than one dimension, both inputs are first
    reduced to their NaN-ignoring row means (``axis=1``) and the test is run
    on those row means; otherwise the inputs are compared as given.

    :param dataname: label printed at the start of the summary line
    :param nparray1: first sample (reported as "wt"); must expose ``ndim``
    :param nparray2: second sample (reported as "mut")
    :return: None, the summary is written to stdout
    """
    if nparray1.ndim > 1:
        sample1 = np.nanmean(nparray1, axis=1)
        sample2 = np.nanmean(nparray2, axis=1)
        header = ': Mean of array wt, mut, H-stat, P-value: '
    else:
        sample1, sample2 = nparray1, nparray2
        # The 1-D report historically spells "P-Value" with a capital V.
        header = ': Mean of array wt, mut, H-stat, P-Value: '

    H, pval = mstats.kruskalwallis(sample1, sample2)

    # Join the fields by hand so the output is the same space-separated line
    # whether this runs under Python 2 or Python 3.
    fields = [
        "anova: ",
        "%s" % (dataname,),
        header,
        str(np.nanmean(sample1)),
        str(np.nanmean(sample2)),
        str(H),
        str(pval),
    ]
    print(" ".join(fields))
def kruskal_wallis(field, coerce=None):
    """Print Kruskal-Wallis H-test results for ``field`` under two groupings.

    The samples come from ``group_by_sentiment`` and ``group_by_nps``; the
    values of each returned mapping are passed to ``kruskalwallis`` as the
    groups to compare. Both tests are computed before anything is printed.

    :param field: name of the field to test (also shown in the heading)
    :param coerce: forwarded unchanged to both grouping helpers
    :return: None, the report is written to stdout
    """
    sentiment_groups = group_by_sentiment(field, coerce).values()
    sentiment_h, sentiment_p = kruskalwallis(*sentiment_groups)
    nps_groups = group_by_nps(field, coerce).values()
    nps_h, nps_p = kruskalwallis(*nps_groups)

    print('\nKruskal-Wallis H-Test on %s:' % field)
    sections = (
        (' - When grouped by ordinal value:', sentiment_h, sentiment_p),
        (' - When grouped by Net Promoter Score:', nps_h, nps_p),
    )
    for heading, h_stat, p_value in sections:
        print(heading)
        print(' - %s' % singificant(p_value))
        print(' - H = %s' % h_stat)
        print(' - p = %s' % p_value)
def anova(self, min_mean_expr=None):
    """
    Run a non-parametric ANOVA (Kruskal-Wallis) across the groups of self.

    Each column of ``self.data`` is split at ``self.split_indices`` and the
    resulting pieces are compared; the p-values are then corrected with
    ``multipletests`` (method ``'fdr_tsbh'``) at level ``self.alpha``. The
    result is cached in ``self._anova`` and returned directly on later calls.

    :param min_mean_expr: minimum average gene expression value that must be
        reached in at least one cluster for the gene to be considered
        (accepted but not used by this implementation)
    :return: corrected p-values, as a ``pd.Series`` indexed by ``self.index``
        when an index is set, otherwise as returned by ``multipletests``
    """
    cached = self._anova
    if cached is not None:
        return cached

    def column_pvalue(column):
        # p-value of the test across the groups of a single column
        groups = np.split(column, self.split_indices)
        return kruskalwallis(*groups)[1]

    # todo could shunt to a multiprocessing pool
    raw_pvals = np.apply_along_axis(column_pvalue, 0, self.data)

    # multipletests returns four values; the second holds the corrected p-values
    _, corrected, _, _ = multipletests(raw_pvals, self.alpha, method='fdr_tsbh')

    if self.index is None:
        self._anova = corrected
    else:
        self._anova = pd.Series(corrected, index=self.index)
    return self._anova
def run(self):
    """Run a Kruskal-Wallis test per row of the transposed dataset matrix.

    The values of each row are divided into three samples according to
    ``self.dataset.labels`` and compared with ``mstats.kruskalwallis``.

    :return: tuple ``(h_statistics, p_values)`` of two lists, one entry per
        row of the transposed matrix
    :raises Exception: if the labels do not contain exactly 3 distinct classes
    """
    # we want to compare the genes, so iterate over the transposed matrix
    gene_rows = self.dataset.matrix.transpose()
    class_ids = np.unique(self.dataset.labels)
    if len(class_ids) != 3:
        raise Exception("This implementation is for 3 classes.")

    h_statistics, p_values = [], []
    for row in gene_rows.tolist():
        # one sample per class, picked by the label at the same position
        samples = [
            np.array([value for i, value in enumerate(row)
                      if self.dataset.labels[i] == class_id])
            for class_id in class_ids
        ]
        h, p_value = mstats.kruskalwallis(*samples)
        p_values.append(p_value)
        h_statistics.append(h)
    return h_statistics, p_values
def plot_var_dist(plotargs, kkey, kw_xy=(20,20), color="muted"):
    """Overlay histograms of ``kkey`` for several frames with a Kruskal-Wallis note.

    :param plotargs: iterable of ``(df, label, color_num)`` triples;
        ``color_num`` indexes into the seaborn palette named by ``color``
    :param kkey: column whose distribution is plotted and tested
    :param kw_xy: ``(x, y)`` position of the Kruskal-Wallis annotation
    :param color: seaborn palette name
    :return: None, draws on the current matplotlib figure
    """
    fig, ax = plt.subplots(1, 1, figsize=(12, 4), sharex=True)
    palette = sb.color_palette(color)

    for frame, label, color_num in plotargs:
        series_colour = palette[color_num]

        # Summary stats shown in the legend entry:
        mean = frame[kkey].mean()
        med = frame[kkey].median()
        std = frame[kkey].std()
        skew = frame[kkey].skew()  # computed but not displayed in this variant
        label += u"\nμ={:0.2f} med={:0.2f}\nσ={:0.2f} N={}".format(
            mean, med, std, len(frame))

        plt.hist(frame[kkey].tolist(), bins=100, label=label,
                 color=series_colour, alpha=0.6, histtype='stepfilled')

    # One test across the `kkey` columns of all supplied frames.
    samples = [entry[0][kkey] for entry in plotargs]
    H, prob = kruskalwallis(*samples)
    note = "Kruskal-Wallis:\nH={H:.2f}\nprob={p:.3f}".format(H=H, p=prob)
    ax.annotate(note, xy=(kw_xy[0], kw_xy[1]))
    plt.ylabel("Frequency")
    plt.legend()
def sb_distplots(plotargs, return_key='close_return', update_type='Revisions'):
    """Plot conditional underpricing distributions. Run set_data(df) first.

    :param plotargs: iterable of ``(df, colour, label, height)`` tuples;
        ``height`` is the y coordinate the statistics annotation points at
    :param return_key: column holding the returns to plot and test
    :param update_type: text appended to the plot title
    :return: None, draws on a new matplotlib figure
    """
    fig, ax = plt.subplots(1, 1, figsize=(16, 5), sharex=True)

    for frame, colour, legend, height in plotargs:
        kde_options = {"label": legend + " Obs={N}".format(N=len(frame)),
                       "color": colour}
        hist_options = {"histtype": "stepfilled", "color": colour}
        sb.distplot(frame[return_key], ax=ax,
                    kde_kws=kde_options, hist_kws=hist_options)

        returns = frame[return_key]
        mean, std, skew, median = (returns.mean(), returns.std(),
                                   returns.skew(), returns.median())
        # The arrow is anchored just right of the median at the given height.
        ax.annotate(
            u'μ={:.2f}%, σ={:.2f}, γ={:.2f}'.format(mean, std, skew),
            xy=(median + 2, height),
            xytext=(median + 6, height + 0.01),
            arrowprops=dict(facecolor=cl.rgb2hex(colour), width=1.5,
                            headwidth=5, shrink=0.1))

    samples = [entry[0][return_key] for entry in plotargs]
    H, prob = kruskalwallis(*samples)
    ax.annotate(
        "Kruskal-Wallis: (H={H:.2f}, prob={p:.3f})".format(H=H, p=prob),
        xy=(66,0.01))

    plt.title("Conditional Underpricing Distributions %s" % update_type)
    plt.ylabel("Density")
    plt.xlim(xmin=-40,xmax=100)
    plt.xlabel("1st Day Returns (%)")
    plt.ylim((0, 0.12))
def kwallis(df, x_col=None, y_col=None, *args, **kwargs):
    """Return the Kruskal-Wallis H statistic between two columns of ``df``.

    :param df: container of the data; empty input short-circuits to ``0``
    :param x_col: key of the first sample (``df[x_col].values``); when
        ``None`` the attribute ``df.x`` is used instead
    :param y_col: key of the second sample (``df[y_col]``); when ``None``
        the attribute ``df.y`` is used instead
    :param args: ignored
    :param kwargs: ignored
    :return: ``0`` for empty input, otherwise the H statistic as a float
    :raises Exception: whatever ``kruskalwallis`` or the float conversion raise
    """
    if len(df) < 1:
        return 0

    x = df[x_col].values if x_col is not None else df.x
    y = df[y_col] if y_col is not None else df.y

    # Run the test once. The previous handler re-ran the failing call inside
    # ``except``, so its diagnostic print could never show anything useful.
    result = kruskalwallis(x, y)
    try:
        return float(result[0])
    except Exception:
        # Show the raw result that could not be converted, then re-raise
        # with the original traceback.
        print(result)
        raise
def check_kw(resid_4d):
    """
    Kruskal-Wallis tests the null hypothesis that the population medians of
    all of the groups are equal. This function runs one such test per voxel,
    comparing the voxel's residuals against a random sample drawn from a
    normal distribution with the same mean and standard deviation.

    Parameters
    ---------
    resid_4d: residual data of 4D numpy array; the last axis holds the
        residuals of one voxel

    Returns
    -------
    kw_normality: 3D array of p-values.
    """
    n_draws = resid_4d.shape[-1]
    pvals = np.zeros(resid_4d.shape[:-1])
    # np.ndindex walks the first three axes in the same order as three nested
    # loops, so the random draws are consumed in the same sequence.
    for i, j, k in np.ndindex(resid_4d.shape[0], resid_4d.shape[1],
                              resid_4d.shape[2]):
        residuals = resid_4d[i, j, k, :]
        reference = np.random.normal(np.mean(residuals), np.std(residuals),
                                     n_draws)
        pvals[i, j, k] = kruskalwallis(residuals, reference)[1]
    return pvals
def anova(dataname, nparray1, nparray2):
    """Kruskal-Wallis test between a control and a test sample, with a summary.

    When ``nparray1`` has more than one dimension both inputs are reduced to
    their NaN-ignoring row means (``axis=1``) first; otherwise they are used
    as given. The printed line reports, in order: sample sizes, means,
    variances, SSMD (difference of means over the square root of the summed
    variances), the H statistic and the p-value.

    :param dataname: label printed at the start of the summary line
    :param nparray1: control sample; must expose ``ndim``
    :param nparray2: test sample
    :return: None, the summary is written to stdout
    """
    if nparray1.ndim > 1:
        control = np.nanmean(nparray1, axis=1)
        test = np.nanmean(nparray2, axis=1)
    else:
        control, test = nparray1, nparray2

    nanmean1 = np.nanmean(control)
    nanvar1 = np.nanvar(control)
    nanmean2 = np.nanmean(test)
    # The 1-D branch used to take this variance from nparray1 by mistake.
    nanvar2 = np.nanvar(test)

    H, pval = mstats.kruskalwallis(control, test)
    ssmd = (nanmean1 - nanmean2) / math.sqrt(nanvar1 + nanvar2)

    # Sizes come from the samples themselves; the 1-D branch previously
    # called len() on a scalar mean and always raised TypeError.
    print("anova: ", dataname,
          ': N of control, test, Mean of array control, test, Variance of array control, test, SSMD, H-stat, P-value: ',
          len(control), len(test),
          str(nanmean1), str(nanmean2),
          str(nanvar1), str(nanvar2),
          str(ssmd), str(H), str(pval))
def kruskal(df, alpha=0.05):
    """Return the numeric columns whose Kruskal-Wallis p-value exceeds ``alpha``.

    For every numeric column the rows of ``df`` are grouped by the
    ``"ACTIVITY"`` column and the per-group values are compared.

    :param df: DataFrame containing an ``"ACTIVITY"`` column
    :param alpha: p-value threshold; columns with ``p > alpha`` are returned
    :return: array of the selected column names
    """
    numeric = df.select_dtypes(include=np.number)

    def column_pvalue(column):
        samples = [rows[column].values for _, rows in df.groupby("ACTIVITY")]
        return kruskalwallis(*samples).pvalue

    pvalues = np.array([column_pvalue(column) for column in numeric.columns],
                       dtype=float)
    return numeric.columns[pvalues > alpha].values
def sig_relationship_kruskalwallis(self, frame):
    """Return the Kruskal-Wallis p-value of ``value`` across ``dimension`` levels.

    :param frame: DataFrame with a ``dimension`` column (the grouping factor)
        and a ``value`` column (the measurements)
    :return: ``1`` when fewer than two distinct factors are present,
        otherwise the p-value of the test across all factor groups
    """
    factors = frame.dimension.unique().tolist()
    # ``.ix`` no longer exists in pandas; ``.loc`` with a boolean mask and a
    # column label selects the same rows. The test is handed plain arrays
    # rather than Series objects.
    data_sets = [
        frame.loc[frame.dimension == factor, 'value'].values
        for factor in factors
    ]
    if len(data_sets) < 2:
        return 1
    return kruskalwallis(*data_sets)[1]
def test(matrix, columns=-1):
    """Run a Kruskal-Wallis test across columns of ``matrix``.

    :param matrix: 2-D array; each selected column is one sample
    :param columns: iterable of column indices to compare, or ``-1``
        (the default) to use every column
    :return: whatever ``kruskalwallis`` returns for the selected columns
    """
    if columns == -1:
        columns = range(0, matrix.shape[1])
    # Make the matrix understandable for the test function: the selected
    # columns are handed over as one list of 1-D samples.
    samples = [matrix[:, column] for column in columns]
    return (kruskalwallis(samples))
def compare_groups(df, group0, group1, group0_name='group0', group1_name='group1'):
    """
    Calculate log2 fold change and test for statistical difference
    (Kruskal-Wallis + FDR correction) between two sample groups.

    :param df <pd.DataFrame>: Table with normalized read counts for each GO_ID and sample
    :param group0 <list>: Samples of group 0
    :param group1 <list>: Samples of group 1
    :param group0_name <str>: Name used in the mean column of group 0
    :param group1_name <str>: Name used in the mean column of group 1
    :return <pd.DataFrame>: Table indexed by GO_ID with the columns ``pval``,
        ``log2fc``, ``mean_<group0_name>_tpm``, ``mean_<group1_name>_tpm`` and
        ``padj``, passed through ``do_FDR_correction`` before being returned
    """
    mean0_key = 'mean_{}_tpm'.format(group0_name)
    mean1_key = 'mean_{}_tpm'.format(group1_name)

    dict_results = {}
    for GOID in df.index:
        GO_group0 = np.array(df.loc[GOID, group0])
        GO_group1 = np.array(df.loc[GOID, group1])

        # Best effort: a failing test yields NaN (turned into 0.0 below).
        # ``Exception`` rather than a bare except, so Ctrl-C still works.
        try:
            pval = mstats.kruskalwallis(GO_group0, GO_group1)[1]
        except Exception:
            pval = np.nan

        mean_group0 = np.mean(GO_group0)
        mean_group1 = np.mean(GO_group1)
        try:
            log2fc = np.log2(mean_group0 / mean_group1)
        except Exception:
            log2fc = np.nan

        # Key order fixes the column order of the resulting table.
        dict_results[GOID] = {
            'pval': pval,
            'log2fc': log2fc,
            mean0_key: mean_group0,
            mean1_key: mean_group1,
            'padj': 0.0,  # placeholder; presumably filled by do_FDR_correction
        }

    df_results = pd.DataFrame.from_dict(dict_results, orient='index')
    # Infinite and missing values are both reported as 0.0.
    df_results = df_results.replace([np.inf, -np.inf], np.nan)
    df_results = df_results.fillna(0.0)
    df_results = do_FDR_correction(df_results)
    return df_results
def test_signficance_of_relationship(monkeypatch):
    """Analyzer's 'kruskalwallis' significance equals a direct computation.

    ``Processor.dimension_value_frame`` is replaced by the mock frame
    factory, and the expected p-value is computed from the same mock data.
    """
    monkeypatch.setattr(Processor, 'dimension_value_frame', mock_dim_value_frame)
    mock_data = mock_dim_value_frame(None, None, None)

    # ``.ix`` no longer exists in pandas; ``.loc`` with a boolean mask and a
    # column label selects the same values.
    exp_arr_1 = mock_data.loc[mock_data.dimension == 'D1', 'value'].values
    exp_arr_2 = mock_data.loc[mock_data.dimension == 'D2', 'value'].values
    exp = kruskalwallis(exp_arr_1, exp_arr_2)[1]

    p = Processor()
    a = Analyzer(processor=p)
    p_val = a.significance_of_relationship('D1', 'Q1', 'kruskalwallis')
    assert p_val == exp
def kw_test(data):
    """
    Kruskal-Wallis p-value per column position across several 2-D arrays.

    data = [data2d_1,..,data2d_n]; for every column index the matching
    columns of all arrays are compared. The number of positions is taken
    from the first array.

    Returns a 1-D array with one p-value per column position.
    """
    n_pos = data[0].shape[1]
    p_values = np.zeros((n_pos,))
    for pos in range(n_pos):
        columns = [block[:, pos] for block in data]
        p_values[pos] = kruskalwallis(*columns)[1]
    return p_values
def scores(mmax, mbin, mtypes, mtype, m):
    """Print sliding-window Kruskal-Wallis p-values as ``position<TAB>p`` lines.

    For each window start ``n`` in ``range(mmax)`` the slices ``[n:n + mbin]``
    of the first four entries of ``mtypes`` are compared. The position
    printed with each p-value is the first element of the n-th smallest item
    of ``m[mtype]``:

    * ``n < 100``: two lines, at the position and at position + 10
    * ``n > 199``: two lines, at position - 100 and at position - 90
    * otherwise: one line at the position

    :param mmax: number of window starts to evaluate
    :param mbin: window length
    :param mtypes: sequence whose first four entries are the series to compare
    :param mtype: key into ``m``
    :param m: mapping from ``mtype`` to a sortable sequence of indexable items
    :return: None, results are written to stdout
    """
    # Sort once instead of on every printed line.
    ordered = sorted(m[mtype]) if mmax > 0 else []

    for n in range(0, mmax):
        a = mtypes[0][n:n + mbin]
        b = array(mtypes[1][n:n + mbin])
        c = array(mtypes[2][n:n + mbin])
        d = array(mtypes[3][n:n + mbin])
        pval = s.kruskalwallis(a, b, c, d)[1]

        position = ordered[n][0]
        if n < 100:
            shown = (position, position + 10)
        elif n > 199:
            shown = (position - 100, position - 90)
        else:
            shown = (position,)

        for value in shown:
            # Single-argument print() gives the same output on Python 2 and 3.
            print("{0}\t{1}".format(value, pval))
def non_par_test(data, clust_members):
    """Print, per lag, whether the clusters differ under a Kruskal-Wallis test.

    For every row ``i + 1`` (``i`` from 0 to ``data.shape[0] - 2``) three
    samples are compared at the 0.05 level: the values of 'Cluster 0' and
    'Cluster 1' concatenated into one sample, 'Cluster 2', and 'Cluster 3'.

    NOTE(review): clusters 0 and 1 are merged with ``+`` rather than passed
    as separate samples -- confirm this is intentional.

    :param data: table indexed as ``data[columns].ix[row]``
    :param clust_members: mapping from 'Cluster N' to the columns of that cluster
    :return: None, one line per lag is written to stdout
    """
    def row_values(cluster, row):
        # values of one cluster's columns at the given row
        return list(data[clust_members[cluster]].ix[row])

    for lag in range(data.shape[0] - 1):
        row = lag + 1
        merged = row_values('Cluster 0', row) + row_values('Cluster 1', row)
        statistic, p = sci.kruskalwallis(
            merged,
            row_values('Cluster 2', row),
            row_values('Cluster 3', row))
        verdict = ' Significant' if p < 0.05 else ' ---***---'
        print('Lag: ' + str(lag) + verdict)
def kruskalwallis_analysis(mvalues, fnames, fvalues):
    """Run a Kruskal-Wallis test per feature and return the adjusted statistics.

    For each ``(fname, frow)`` pair, ``shatter(mvalues, frow)`` yields a
    mapping from level to values. The levels are compared with
    ``kruskalwallis`` and summarised as ``level:mean`` entries (means
    formatted with ``%.4g``) joined by ``|``. Features that fail are
    reported on stderr and skipped.

    :param mvalues: measurements handed to ``shatter``
    :param fnames: feature names
    :param fvalues: per-feature rows, parallel to ``fnames``
    :return: ``adjust_stats`` applied to the ``[fname, summary, p]`` rows
    """
    stats = []
    for fname, frow in zip(fnames, fvalues):
        try:
            lists = shatter(mvalues, frow)
            summary = "|".join(
                ":".join([k, "%.4g" % (np.mean(v))]) for k, v in lists.items()
            )
            p = kruskalwallis(*lists.values())[1]
            stats.append([fname, summary, p])
        # Still best effort per feature, but ``Exception`` lets
        # KeyboardInterrupt and SystemExit propagate.
        except Exception:
            sys.stderr.write(
                "NOTE: Unable to compute Kruskal-Wallis with feature: " +
                fname + "\n")
    return adjust_stats(stats)
def kruskalTest(november):
    """Print a Kruskal-Wallis test of 'rain_intensity' against 'internet_level'.

    Each column is fetched with ``november.select(name).collect()`` and the
    collected rows are concatenated into one array. A verdict line is
    printed only when the p-value is strictly below or strictly above 0.05.

    :param november: object exposing ``select(column).collect()``
    :return: None, everything is written to stdout
    """
    def column_values(name):
        return np.concatenate(november.select(name).collect(), axis=0)

    Col_1 = column_values('rain_intensity')
    print(Col_1)
    Col_2 = column_values('internet_level')

    print("Kruskal Wallis H-test test:")
    H, pval = mstats.kruskalwallis(Col_1, Col_2)
    print("H-statistic:", H)
    print("P-Value:", pval)

    if pval < 0.05:
        print(
            "Reject NULL hypothesis - Significant differences exist between groups."
        )
    elif pval > 0.05:
        print(
            "Accept NULL hypothesis - No significant difference between groups."
        )
def kruskalWallis(df, alpha):
    """Print a Kruskal-Wallis test of every column against the 'quality' column.

    All columns except the last one are compared, one at a time, with
    ``df["quality"]``.

    :param df: DataFrame with a ``"quality"`` column; its last column is
        skipped (presumably that is ``quality`` itself -- verify against caller)
    :param alpha: significance level used for the printed verdict
    :return: None, results are written to stdout
    """
    print(" Kruskal Wallis H-test test:")
    columns = list(df.columns.values)
    for column in columns[:-1]:
        # get the H and pval
        H, pval = mstats.kruskalwallis(df[column].tolist(),
                                       df["quality"].tolist())
        # One pre-formatted string per print() call: identical output on
        # Python 2 and Python 3, including the doubled spaces the old
        # comma-separated print statements produced.
        print(" H-statistic: %s" % (H,))
        print(" P-Value: %s" % (pval,))
        # check pvalue
        if pval < alpha:
            print("Reject NULL hypothesis - Significant differences exist between  %s  and quality \n\n" % (column,))
        elif pval >= alpha:
            print("Accept NULL hypothesis - No significant difference between  %s  and quality \n\n" % (column,))
def main():
    """Kruskal-Wallis test on four fixed city samples.

    Prints whether the samples differ at the 0.05 level and returns the
    H statistic.
    """
    # Get the data
    cities = [
        array([68, 93, 123, 83, 108, 122]),
        array([119, 116, 101, 103, 113, 84]),
        array([70, 68, 54, 73, 81, 68]),
        array([61, 54, 59, 67, 59, 70]),
    ]

    # Perform the Kruskal-Wallis test
    h, p = kruskalwallis(*cities)

    # Print the results
    significant = p < 0.05
    print('There is a significant difference between the cities.'
          if significant
          else 'No significant difference between the cities.')
    return h
def get_test_kw_inner_text_length_y(data):
    """Kruskal-Wallis test of ``inner_text_length`` grouped by the ``y`` label.

    Rows are selected per label, then each group is resampled (with
    ``np.random.choice``) to half the size of the NOISY group, so all five
    samples have the same length. The result is printed.

    :param data: DataFrame with the columns ``y`` and ``inner_text_length``;
        every one of the five labels must occur at least once
    :return: ``data``, unchanged
    """
    # Order in which the samples are passed to the test.
    labels = [
        'CEML__TITLE', 'CEML__PRICE', 'CEML__DESCRIPTION',
        'CEML__PAGE__DESCRIPTION__LIST__ITEMS', 'NOISY'
    ]
    # Select rows by equality. The previous ``x in "LABEL"`` was a substring
    # test: it also matched partial labels such as "" or "CEML", and raised
    # TypeError on non-string entries.
    lengths = {
        label: data[data['y'] == label].inner_text_length for label in labels
    }

    sample_size = int(round(len(lengths['NOISY']) / 2))

    # Draw order is kept (title, description, list, price, noisy) so that a
    # seeded random generator yields the same samples as before.
    draw_order = [
        'CEML__TITLE', 'CEML__DESCRIPTION',
        'CEML__PAGE__DESCRIPTION__LIST__ITEMS', 'CEML__PRICE', 'NOISY'
    ]
    samples = {}
    for label in draw_order:
        samples[label] = np.random.choice(lengths[label], sample_size)

    H, pval = mstats.kruskalwallis(
        *[samples[label].tolist() for label in labels])

    print("Test Kruskal-Wallis for inner_text_length grouped by y")
    print("H-statistic:", H)
    print("P-Value:", pval)
    if pval < 0.05:
        print(
            "Reject NULL hypothesis - Significant differences exist between groups."
        )
    if pval > 0.05:
        print(
            "Accept NULL hypothesis - No significant difference between groups."
        )
    return data
def main():
    """Compare four fixed city samples with a Kruskal-Wallis test.

    Writes a one-line verdict (0.05 level) to stdout and returns the
    H statistic.
    """
    # Get the data
    readings = (
        [68, 93, 123, 83, 108, 122],
        [119, 116, 101, 103, 113, 84],
        [70, 68, 54, 73, 81, 68],
        [61, 54, 59, 67, 59, 70],
    )
    city1, city2, city3, city4 = [array(values) for values in readings]

    # Perform the Kruskal-Wallis test
    result = kruskalwallis(city1, city2, city3, city4)
    h, p = result

    # Print the results
    if not p < 0.05:
        print('No significant difference between the cities.')
    else:
        print('There is a significant difference between the cities.')

    return h
def run_correlation(df, feature, outcome):
    """Test whether ``feature`` differs between the levels of ``outcome``.

    The rows are grouped by the distinct values of ``df[outcome]``. If every
    group passes ``mstats.normaltest`` (p > 0.05) a parametric test is used
    (t-test for at most two outcomes, one-way ANOVA otherwise); if not, the
    non-parametric counterpart (Mann-Whitney or Kruskal-Wallis) is used.

    :param df: DataFrame holding both columns
    :param feature: name of the column being compared
    :param outcome: name of the grouping column
    :return: dict with the keys ``test``, ``statistic``, ``p`` and one
        ``mean_<n>`` per group, numbered in iteration order of the outcome set
    """
    P_SIGNIFICANT = .05
    outcomes = set(df[outcome].tolist())
    n_outcomes = len(outcomes)

    groups = [df[df[outcome] == oc][feature].tolist() for oc in outcomes]

    # Every group is tested before the verdicts are combined.
    normal_flags = [mstats.normaltest(g)[1] > P_SIGNIFICANT for g in groups]
    are_norm = all(normal_flags)
    pairwise = n_outcomes <= 2

    if are_norm and pairwise:
        (s, p) = stats.ttest_ind(groups[0], groups[1])
        test_name = 't-test'
    elif are_norm:
        (s, p) = stats.f_oneway(*groups)
        test_name = 'One-way ANOVA'
    elif pairwise:
        (s, p) = stats.mannwhitneyu(groups[0], groups[1])
        test_name = 'Mann-Whitney'
    else:
        (s, p) = mstats.kruskalwallis(*groups)
        test_name = 'Kruskal-Wallis'

    result = {'test': test_name, 'statistic': s, 'p': p}
    for n, g in enumerate(groups):
        result['mean_%d' % n] = np.mean(g)
    return result
def KWtest(Matrixs, Words, WordLists, option="CustomP", Low=0.0, High=1.0):
    """Kruskal-Wallis test per word across groups of segments.

    :param Matrixs: list of groups; each group is a list of segment rows and
        ``row[i]`` is the value for word ``Words[i - 1]`` (index 0 of a row is
        skipped -- presumably a label column, verify against caller)
    :param Words: word for each row position, offset by one
    :param WordLists: word-count data merged by ``merge_list``; the merged
        counts decide which words are tested
    :param option: filter mode forwarded to ``wordfilter``
    :param Low: lower bound forwarded to ``wordfilter``
    :param High: upper bound forwarded to ``wordfilter``
    :return: list of ``(word, p-value)`` pairs sorted by p-value; words whose
        samples are all identical get the p-value ``"Invalid"`` and sort last
    :raises ValueError: any other error raised by ``kruskalwallis``
    """
    # begin handle options
    MergeList = merge_list(WordLists)
    TotalWordCount = sum(MergeList.values())
    NumWord = len(MergeList)
    High, Low = wordfilter(option, Low, High, NumWord, TotalWordCount, MergeList)
    # end handle options

    # Length of the largest group; shorter groups are padded and masked so
    # that every sample has the same length.
    Len = max(len(matrix) for matrix in Matrixs)

    word_pvalue_dict = {}  # the result
    for i in range(1, len(Matrixs[0][0])):  # focusing on a specific word
        word = Words[i - 1]
        if not (Low < MergeList[word] < High):
            continue

        samples = []
        for group in Matrixs:  # focusing on a group
            sample = [segment[i] for segment in group]
            padding = Len - len(sample)
            samples.append(
                ma.masked_array(
                    sample + [0] * padding,
                    mask=[0] * len(sample) + [1] * padding
                )
            )

        # do the KW test
        try:
            pvalue = kruskalwallis(samples)[1]
        except ValueError as error:
            if error.args and error.args[0] == "All numbers are identical in kruskal":
                pvalue = "Invalid"
            else:
                # bare raise keeps the original traceback
                raise
        word_pvalue_dict[word] = pvalue

    # Numbers first, "Invalid" last: this is the order Python 2 produced for
    # mixed float/str values, and it also works on Python 3, where comparing
    # a float with a str raises TypeError.
    return sorted(word_pvalue_dict.items(),
                  key=lambda item: (isinstance(item[1], str), item[1]))
def kruskal_wallis(norm_df, metadata, groups):
    """
    performs the Kruskal-Wallis test between two groups and corrects the
    obtained p-values using Benjamini-Hochberg FDR correction
    --------
    norm_df
        dataframe, normalized GCs
    metadata
        dict, {sample id: metadata}
    groups
        list, names of the inputted groups; only the first two are compared
    returns
    --------
    fdr_df = dataframe, one row per GC with the columns "T/F" and "pval"
        holding the two arrays returned by ``fdrcorrection``
    """
    gc_groups = {}
    p_values = []
    group1, group2 = groups[0], groups[1]

    # Add the metadata as an extra row named "Metadata". ``DataFrame.append``
    # was removed in pandas 2.0; concatenating the row as a one-row frame is
    # its documented replacement.
    row = pd.Series(metadata, name="Metadata")
    df = pd.concat([norm_df, row.to_frame().T])
    df = df.sort_values(by=["Metadata"], axis=1)
    df = df.replace(0, float(0.0)).T
    # keep only the samples that belong to one of the requested groups
    df = df.loc[df["Metadata"].isin(groups)]

    for gc_name in df.columns:
        if "GC_DNA--" in gc_name:  # filter out the housekeeping genes
            # arrays of the groups per GC: {GC: {group1: array, group2: array}}
            gc_groups[gc_name] = {
                grp: df[gc_name][df['Metadata'] == grp].values
                for grp in df['Metadata'].unique()
            }

    # perform Kruskal Wallis test
    for gc in gc_groups:
        pval = mstats.kruskalwallis(gc_groups[gc][group1],
                                    gc_groups[gc][group2])[1]
        p_values.append(pval)

    fdr = fdrcorrection(p_values, alpha=0.05, method="i")
    fdr_df = pd.DataFrame(data=fdr, columns=gc_groups.keys(),
                          index=["T/F", "pval"]).T
    return fdr_df
def main():
    '''These data could be a comparison of the smog levels in four
    different cities.

    Prints whether the four samples differ at the 0.05 level and returns
    the Kruskal-Wallis H statistic.
    '''
    # Get the data
    smog_levels = {
        'city1': np.array([68, 93, 123, 83, 108, 122]),
        'city2': np.array([119, 116, 101, 103, 113, 84]),
        'city3': np.array([70, 68, 54, 73, 81, 68]),
        'city4': np.array([61, 54, 59, 67, 59, 70]),
    }
    ordered = [smog_levels[name]
               for name in ('city1', 'city2', 'city3', 'city4')]

    # --- >>> START stats <<< ---
    # Perform the Kruskal-Wallis test
    h, p = kruskalwallis(*ordered)
    # --- >>> STOP stats <<< ---

    # Print the results
    messages = {
        True: 'There is a significant difference between the cities.',
        False: 'No significant difference between the cities.',
    }
    print(messages[bool(p<0.05)])
    return h
def do_kruskal(self, region, depth, year, path, prefix):
    """
    apply a kruskal wallis test for a given year, region and depth

    Two text files with one fitness value per line are read from ``path``:
    ``<prefix>_gaModel_<region>_<depth>_<year>.txt`` (declustered) and
    ``gaModel_<region>_<depth>_<year>.txt`` (not declustered). The means of
    both lists and the p-value of the test between them are printed; the
    p-value is reported as ``None`` if the test raises ``ValueError``.

    :param region: region name used in the file names
    :param depth: depth used in the file names
    :param year: year used in the file names
    :param path: directory prefix, concatenated as-is (include the trailing
        separator)
    :param prefix: prefix of the declustered file name
    :return: None, results are written to stdout
    """
    tail = 'gaModel_' + region + '_' + str(depth) + '_' + str(year) + '.txt'
    # file name for the declustered results
    name_clst = path + prefix + '_' + tail
    # file name for the not declustered results
    name_no_clst = path + tail

    # ``with`` closes both files even if a line cannot be parsed
    with open(name_clst, 'r') as fp_clst, open(name_no_clst, 'r') as fp_no_clst:
        # fitness of the individuals, one value per line
        fit_clst = [float(line) for line in fp_clst]
        fit_no_clst = [float(line) for line in fp_no_clst]

    # do kruskal wallis test
    try:
        p_value = kruskalwallis(fit_no_clst, fit_clst)[1]
    except ValueError:
        # all values are identical: the test cannot be computed
        print("Todos os numeros do teste de Kruskal sao iguais, a funcao retorna um erro")
        p_value = None

    print("Mean of the undeclustered: " + str(statistics.mean(fit_no_clst)))
    print("Mean of the declustered: " + str(statistics.mean(fit_clst)))
    print("p-value for the kruskal-wallis: " + str(p_value))
def KruskalTest(Type='NbComments'):
    """Print a Kruskal-Wallis test for the luxury / non-luxury data.

    With ``Type == 'NbComments'`` the number of comments per entry is used;
    for any other value the individual sentiment scores are used, each
    paired with the group of the entry it came from. The group labels are
    mapped to the codes 1 and 2, and the test is run on the value column
    against the group-code column.

    NOTE(review): the test receives the values and the numeric group codes
    as its two samples, not the values split by group -- confirm this is
    what is wanted.

    :param Type: ``'NbComments'`` or anything else for sentiments
    :return: the DataFrame that was tested (``Groups`` plus the value column)
    """
    if Type == 'NbComments':
        Groups, NbComments = Luxury_vs_NonLuxury(False)
        value_column = 'NbComments'
        df = pd.DataFrame({'Groups': Groups, 'NbComments': NbComments})
    else:
        Groups, NbComments, Sentiments = Luxury_vs_NonLuxury(True)
        # repeat each group label once per sentiment score of that entry
        SGroups = []
        for group, sentiments in zip(Groups, Sentiments):
            SGroups.extend(repeat(group, len(sentiments)))
        GSentiments = [
            float(item) for sublist in Sentiments for item in sublist
        ]
        value_column = 'Sentiments'
        df = pd.DataFrame({'Groups': SGroups, 'Sentiments': GSentiments})

    # Assign the result back. ``df['Groups'].replace(..., inplace=True)``
    # acts on an intermediate object and is not guaranteed to update ``df``.
    df['Groups'] = df['Groups'].replace({'Luxary': 1, 'NonLuxuary': 2})
    Col_1 = df[value_column].tolist()
    Col_2 = df['Groups'].tolist()

    print("Kruskal Wallis H-test " + Type + " test:")
    H, pval = mstats.kruskalwallis(Col_1, Col_2)
    print("H-statistic:", H)
    print("P-Value:", pval)
    if pval < 0.05:
        print(
            "Reject NULL hypothesis - Significant differences exist between groups."
        )
    if pval > 0.05:
        print(
            "Accept NULL hypothesis - No significant difference between groups."
        )
    return df
def kw_test_for_means(current_climate = True, data_folder = 'data/streamflows/hydrosheds_euler9', months = list(range(1,13))):
    """
    returns p-values resulting from kruskal - wallis test on annual means

    Files in ``data_folder`` whose name, up to the first underscore, is one
    of the selected member ids are loaded. For each file the means over
    ``months`` are computed per year and stacked in time order; the test is
    then applied position by position across all files.

    :param current_climate: use ``members.all_current`` when true,
        otherwise ``members.all_future``
    :param data_folder: directory that holds the input files
    :param months: months the means are taken over
    :return: 1-D array with one p-value per position
    """
    the_ids = members.all_current if current_climate else members.all_future

    file_paths = [
        os.path.join(data_folder, the_file)
        for the_file in os.listdir(data_folder)
        if the_file.split("_")[0] in the_ids
    ]

    real_means = []
    for the_path in file_paths:
        streamflow, times, i_indices, j_indices = data_select.get_data_from_file(the_path)
        # for each year and for each gridcell get mean value for the period
        means_dict = data_select.get_means_over_months_for_each_year(times, streamflow, months = months)
        means_sorted_in_time = [means_dict[key] for key in sorted(means_dict)]
        # save modelled means
        real_means.append(np.array(means_sorted_in_time))

    n_positions = real_means[0].shape[1]
    p_values = np.zeros((n_positions,))
    for pos in range(n_positions):
        samples = [data2d[:, pos] for data2d in real_means]
        p_values[pos] = kruskalwallis(*samples)[1]
    return p_values
def plot_var_dist(plotargs, kkey='IPO_duration', kw_xy=(20,20)):
    """Overlay annotated histograms of ``kkey`` with a Kruskal-Wallis note.

    Each histogram gets an arrow annotation with its summary statistics,
    anchored at the bin with the third-highest count.

    :param plotargs: iterable of ``(df, label, color, xshift, yshift)``;
        ``color`` indexes the seaborn "muted" palette and the shifts scale
        the anchor coordinates to place the annotation text
    :param kkey: column whose distribution is plotted and tested
    :param kw_xy: ``(x, y)`` position of the Kruskal-Wallis annotation
    :return: None, draws on a new matplotlib figure
    """
    fig, ax = plt.subplots(1, 1, figsize=(12, 4), sharex=True)

    for frame, label, colour_index, xshift, yshift in plotargs:
        colour = sb.color_palette("muted")[colour_index]
        label += " Obs={}".format(len(frame))

        # Summary stats:
        mean = frame[kkey].mean()
        mode = frame[kkey].mode()  # computed but not displayed
        med = frame[kkey].median()
        std = frame[kkey].std()
        skew = frame[kkey].skew()
        stat = u"\nμ={:0.2f} med={:0.2f}\nσ={:0.2f} skew={:0.2f}".format(
            mean, med, std, skew)

        counts, edges, patches = plt.hist(
            frame[kkey].tolist(), bins=36, label=label,
            color=colour, alpha=0.6, histtype='stepfilled')

        # (count, left edge) pairs in ascending order; take the third highest
        ranked = sorted(zip(counts, edges))
        anchor_y, anchor_x = ranked[-3]
        ax.annotate(stat, xy=(anchor_x, anchor_y),
                    xytext=(anchor_x*xshift, anchor_y*yshift),
                    arrowprops=dict(facecolor=colour, width=1.6, headwidth=1.6))

    samples = [entry[0][kkey] for entry in plotargs]
    H, prob = kruskalwallis(*samples)
    ax.annotate(
        "Kruskal-Wallis: (H={H:.2f}, prob={p:.3f})".format(H=H, p=prob),
        xy=(kw_xy[0], kw_xy[1]))
    plt.ylabel("Frequency")
    plt.legend()
def check_kw(resid_4d):
    """
    Kruskal-Wallis tests the null hypothesis that the population medians of
    all of the groups are equal. Here one test is performed per voxel: the
    voxel's residuals are compared against a random sample from a normal
    distribution with the residuals' own mean and standard deviation.

    Parameters
    ---------
    resid_4d: residual data of 4D numpy array

    Returns
    -------
    kw_normality: array of p-values from the Kruskal-Wallis normality test,
        one per voxel (shape of the input without its last axis)
    """
    n_x, n_y, n_z = resid_4d.shape[0], resid_4d.shape[1], resid_4d.shape[2]
    series_length = resid_4d.shape[-1]

    def voxel_pvalue(series):
        # compare the residual series with a matching normal sample
        norm_samp = np.random.normal(np.mean(series), np.std(series),
                                     series_length)
        return kruskalwallis(series, norm_samp)[1]

    kw_3d = np.zeros(resid_4d.shape[:-1])
    for i in range(n_x):
        for j in range(n_y):
            for k in range(n_z):
                kw_3d[i,j,k] = voxel_pvalue(resid_4d[i,j,k,:])
    return kw_3d
def run_correlation(df):
    """Compare every feature between the two levels of every intention.

    For each name in ``INTENTION_COLUMNS`` and each name in ``FEATURES`` the
    feature values are split by ``intent_current_<intention>`` being 0 or 1.
    If both groups pass ``mstats.normaltest`` (p > ``P_SIGNIFICANT``) a
    one-way ANOVA is used, otherwise a Kruskal-Wallis test.

    :param df: DataFrame with the feature and ``intent_current_*`` columns
    :return: list of dicts with the keys ``feature``, ``intention``,
        ``test``, ``statistic``, ``p``, ``mean_0`` and ``mean_1``
    """
    results = []
    for intention in INTENTION_COLUMNS:
        intent_column = "intent_current_" + intention
        for feature in FEATURES:
            group1 = df[df[intent_column] == 0][feature].tolist()
            group2 = df[df[intent_column] == 1][feature].tolist()

            # both normality tests always run, as before
            p_norm1 = mstats.normaltest(group1)[1]
            p_norm2 = mstats.normaltest(group2)[1]
            are_norm = (p_norm1 > P_SIGNIFICANT) and (p_norm2 > P_SIGNIFICANT)

            if are_norm:
                test_name = 'One-way ANOVA'
                (s, p) = stats.f_oneway(group1, group2)
            else:
                test_name = 'Kruskal-Wallis'
                (s, p) = mstats.kruskalwallis(group1, group2)

            results.append({
                'feature': feature,
                'intention': intention,
                'test': test_name,
                'statistic': s,
                'p': p,
                'mean_0': np.mean(group1),
                'mean_1': np.mean(group2),
            })
    return results
def do_kruskal_old(self, region, year, depth, logbooks, subdir):
    """
    do a kruskal-wallis test between two logbook files - the year, region
    and depth must be the same

    Both logbooks are read from
    ``../catalogs/<logbook><subdir>/<region>_<year>_<depth>_logbook.txt``.
    From each file the value in column ``BEST`` of every 100th line (the
    last generation of a run) is collected, and the two resulting lists are
    compared. Results are printed and the function then waits on
    ``input()`` before returning.

    :param region: region name (string) used in the file names
    :param year: year (string) used in the file names
    :param depth: depth (string) used in the file names
    :param logbooks: pair of directory names: index 0 is the undeclustered
        catalog, index 1 the declustered one
    :param subdir: text appended to the logbook directory name
    :return: None, results are written to stdout
    """
    # defines, constants to improve legibility
    BEST = 4  # index of the column that contains the best individual in the catalog
    NGEN = 99  # number of the last generation for the gamodel
    NO_DECLUSTER = 0  # logbook index for the undeclustered catalog
    DECLUSTER = 1  # logbook index for the declustered catalog

    def logbook_name(which):
        return '../catalogs/' + logbooks[which] + subdir + '/' + region + \
            '_' + year + '_' + depth + '_logbook.txt'

    def best_of_last_generations(log):
        # keep the best individual of the last generation of every run:
        # lines 99, 199, 299, ... (0-based) of the logbook
        best = []
        for line_number, line in enumerate(log):
            if line_number % (NGEN + 1) == NGEN:
                best.append(float(line.split()[BEST]))
        return best

    # open both files read only; ``with`` closes them even if parsing fails
    with open(logbook_name(NO_DECLUSTER), 'r') as log_no_decluster, \
            open(logbook_name(DECLUSTER), 'r') as log_decluster:
        best_no_decluster = best_of_last_generations(log_no_decluster)
        best_decluster = best_of_last_generations(log_decluster)

    # header
    print("####region: " + str(region) + " year: " + str(year) + " depth: " + str(depth))

    # perform a kruskal-wallis test
    try:
        p_value = kruskalwallis(best_no_decluster, best_decluster)[1]
    except ValueError:
        # all values are identical: the test cannot be computed
        print("Todos os numeros do teste de Kruskal sao iguais, a funcao retorna um erro")
        p_value = None

    # print the result - Redirect this to a file if you want
    print("Mean of the undeclustered: " + str(statistics.mean(best_no_decluster)))
    print("Mean of the declustered: " + str(statistics.mean(best_decluster)))
    print("p-value for the kruskal-wallis: " + str(p_value))
    print(best_decluster)
    print(best_no_decluster)
    print("\n\n\n")
    # pause until the user presses enter
    input()
def kwtest(s, groupby, df):
    """Kruskal-Wallis test of column ``s`` across the strata of ``df``.

    :param s: key of the values taken from every stratum
    :param groupby: stratification argument forwarded to ``stratified``
    :param df: data forwarded to ``stratified``
    :return: whatever ``kruskalwallis`` returns for the per-stratum samples
    """
    samples = [stratum[s] for stratum in stratified(groupby, df)]
    return kruskalwallis(*samples)
def performancesForThreadingTask(self):
    """Compare novice, intermediate and expert performances and plot them.

    Makes sure the performances are calculated, collects nine metrics from
    ``self.novicePerfs + self.intermediatePerfs + self.expertPerfs``, runs a
    Kruskal-Wallis test across the three skill groups and a Mann-Whitney U
    test between non-experts and experts for every metric, then writes one
    scatter plot and two box plots per metric as PNG files and finally
    calls ``plt.show()``.

    The slicing assumes that the combined list holds 10 novices, then 10
    intermediates, then 10 experts -- NOTE(review): confirm against the
    code that fills the three lists.
    """
    if not self.perfsCalculated:
        self.calculatePerformances()

    # Output location is hard-coded, as before.
    graph_dir = "/Users/robertevans/repos/minf/keyhole_graphs/"

    all_perfs = self.novicePerfs + self.intermediatePerfs + self.expertPerfs

    def column(key):
        return [perf[key] for perf in all_perfs]

    # (file stem, values, scatter title, box-plot title, y-axis label)
    metrics = [
        ("durations", column("duration"),
         "Task Duration", "Duration", "Time (seconds)"),
        ("distances", column("angularDist"),
         "Total Angular Distance", "Distance", "Rotation (radians)"),
        ("speeds", column("averageSpeed"),
         "Average Speed", "Speed", "Speed (radians/second)"),
        ("accels", column("averageAccel"),
         "Average Acceleration", "Acceleration",
         "Acceleration (radians/second$^2$)"),
        ("smoothnesses", column("motionSmoothness"),
         "Motion Smoothness", "Smoothness", "Smoothness (radians/second$^3$)"),
        ("handednesses", column("handedness"),
         "Handedness", "Handedness",
         "Right distance minus left distance per frame (radians)"),
        ("variances", column("speedVariance"),
         "Variance of Angular Speed", "Speed Variance",
         "Variance (radians/second)"),
        # "ambidextricity" holds a pair; its first element is plotted
        ("ambidexterities", [perf["ambidextricity"][0] for perf in all_perfs],
         "Ambidexterity", "Ambidexterity",
         "Spearman correlation for left/right speeds (per frame)"),
        ("scores", column("perf"),
         "Total Task Performance", "Performance",
         "Score (radians$^{-1}$seconds$^{-1}$)"),
    ]

    colours = ["r"] * 10 + ["g"] * 10 + ["b"] * 10

    # Kruskal-Wallis tests (Non-parametric ANOVAS)
    # Between novices, intermediates and experts
    three_group_p = {}
    for stem, values, _, _, _ in metrics:
        three_group_p[stem] = kruskalwallis(
            values[0:10], values[10:20], values[20:30])[1]

    # Between experts and non-experts
    two_group_p = {}
    for stem, values, _, _, _ in metrics:
        two_group_p[stem] = mannwhitneyu(values[0:20], values[20:30])[1]

    # SCATTER PLOTS
    def save_scatter(data, colours, title, ylabel, savePath, yLimTuple=None):
        fig = plt.figure()
        ax = fig.add_subplot(111)
        plt.title(title)
        plt.xlabel("Trials")
        plt.ylabel(ylabel)

        # Sort by value, keeping each point's group colour attached
        data, colours = zip(*sorted(zip(data, colours)))

        handles = []
        for colour, marker in (("r", "o"), ("g", "^"), ("b", "*")):
            points = [(i, d) for i, (d, c) in enumerate(zip(data, colours))
                      if c == colour]
            # explicit coordinate lists: zip(...)[0] only works on Python 2
            xs = [i for i, _ in points]
            ys = [d for _, d in points]
            handles.append(
                ax.scatter(xs, ys, color=colour, marker=marker, s=60))

        plt.legend(tuple(handles), ["Novice", "Intermediate", "Expert"], loc=2)
        plt.xticks([])
        plt.gca().set_xlim(-1, len(data))
        if yLimTuple:
            # the y limits used to be applied to the x axis by mistake
            plt.gca().set_ylim(yLimTuple)
        plt.tight_layout()
        # image data must go to a binary-mode file
        with open(savePath, "wb") as figOut:
            plt.savefig(figOut)

    for stem, values, scatter_title, _, ylabel in metrics:
        save_scatter(values, colours, scatter_title, ylabel,
                     graph_dir + stem + ".png")

    # BOX PLOTS
    def save_box_plot(data, key, p_value, savePath, title, ylabel):
        plt.figure()
        plt.title("{0} - p-value: {1:.3g}".format(title, p_value))
        plt.ylabel(ylabel)
        plt.boxplot(data)
        plt.xticks(range(1, len(data) + 1), key)
        plt.tight_layout()
        with open(savePath, "wb") as figOut:
            plt.savefig(figOut)

    for stem, values, _, box_title, ylabel in metrics:
        save_box_plot(
            [values[:10], values[10:20], values[20:]],
            ("Novices", "Intermediates", "Experts"),
            three_group_p[stem],
            graph_dir + stem + "_box_three.png",
            box_title,
            ylabel,
        )

    for stem, values, _, box_title, ylabel in metrics:
        save_box_plot(
            [values[:20], values[20:]],
            ("Non-Experts", "Experts"),
            # NOTE(review): the p-value is doubled, presumably to turn a
            # one-sided result into a two-sided one -- confirm that the
            # installed mannwhitneyu still returns a one-sided p-value.
            two_group_p[stem] * 2,
            graph_dir + stem + "_box_two.png",
            box_title,
            ylabel,
        )

    plt.show()
from os import path from collections import defaultdict from util import pretty_name, median_deviation from scipy.stats.mstats import kruskalwallis if __name__ == '__main__': # Run through all of the files gathering different seeds into lists statify = defaultdict(list) active = defaultdict(list) filecount = 0 for filename in sys.argv[1:]: base = path.basename(filename) try: problem, nodes, version, seed = base.split('_') with open(filename, 'r') as f: data = json.load(f) statify[version].append(data[1]['evals']) active[version].append(data[1]['phenotype']) filecount += 1 except ValueError: print filename, "FAILED" print 'Files Successfully Loaded', filecount print 'Kruskal Wallis', kruskalwallis(statify.values()) for version, data in statify.iteritems(): print '--------- %s ---------' % pretty_name[version] print "MES, MAD", median_deviation(data) print 'Active', median_deviation(active[version]) print 'Mann Whitney U against Normal', print stats.mannwhitneyu(statify['normal'], data)
se.dzs_mean + se.dzs_std, facecolor='r', alpha=alphafill, lw=0.01) intarr = N.concatenate(tuple([dz for dz in interior.normdz]), axis=2).reshape(len(interior.normdz), 100) scarr = N.concatenate(tuple([dz for dz in sc.normdz]), axis=2).reshape(len(sc.normdz), 100) searr = N.concatenate(tuple([dz for dz in se.normdz]), axis=2).reshape(len(se.normdz), 100) sarr = N.vstack((scarr, searr)) kw = [] for i in xrange(100): kw.append(kruskalwallis(intarr[:, i], sarr[:, i])[1]) #ax.plot(interior.norme,inarr[0,:],'--k') plt.legend([l1, l2, l3], [ 'Interior N=%s' % str(len(interior.name)), 'South Central N=%s' % str(len(sc.name)), 'Southeast N=%s' % str(len(se.name)) ], loc='lower left', fontsize=9) font = matplotlib.font_manager.FontProperties(family='Arial', weight='bold', size=15) #ax.annotate('A',[0.07,0.5],horizontalalignment='center',verticalalignment='center',fontproperties=font)
# -------------------------------------------------------------------
# 1. L2 norm of the difference between the two mean vectors
sq_diff_ab = np.square(mean_vector_amp - mean_vector_bp)
sse_ab = sq_diff_ab.sum() if hasattr(sq_diff_ab, 'sum') else np.sum(sq_diff_ab)
norm_ab = np.sqrt(sse_ab)
print('the L2-Norm is %.2f' % norm_ab)

# 2. share of elements whose absolute difference is within the threshold
threshold = 0.01
print('the threshold is %.2f%%' % (threshold * 100))
row_amp = mean_vector_amp[0]
row_bp = mean_vector_bp[0]
counter = sum(
    1
    for k in range(num_elements)
    if np.abs(row_amp[k] - row_bp[k]) <= threshold
)
ratio = float(counter) / num_elements
print('the ratio is %.2f%%' % (ratio * 100))

# 3. correlation coefficient between the two mean arrays
cocoef_matrix = np.corrcoef(mean_array_amp, mean_array_bp)
cocoef = cocoef_matrix[0, 1]
print('the correlation coefficient is %0.3f' % cocoef)

# 4. Kruskal-Wallis test for a difference between the two distributions
H, pvalue = kruskalwallis(row_amp, row_bp)
print('the p-value is %.2f' % pvalue)
if not pvalue <= 0.05:
    print("accept null hypothesis: no significant difference between two groups")
# -------------------------------------------------------------------
def FindDwellTimes(concat_table):
    """Collect click dwell times per vertical, print statistics and plot them.

    The table is grouped by (user_id, task_id) and each group is replayed in
    time order.  A tap event or a click row starts a dwell interval; the
    interval ends at the first later row (before the next 'results' row)
    that is a matching page response, a non-tap event or a task response.
    Per-URL dwell times are accumulated for the current SERP and folded into
    ``vertical_stats`` when the next SERP's first result (doc_pos == 0)
    is reached.

    Afterwards the function prints mean/std of on- and off-vertical dwell
    times, pairwise Kruskal-Wallis results between verticals, per-position
    results against vertical 'o', normalises the click counts in place and
    calls ``PlotClickDist``.

    NOTE(review): the source was recovered without indentation; the nesting
    below is inferred from the logic and should be confirmed.

    :param concat_table: DataFrame with at least the columns read below
        ('user_id', 'task_id', 'time', 'type', 'doc_pos', 'doc_type',
        'doc_url', 'doc_id', 'event_type', 'element').
    :return: None; results are printed and passed to PlotClickDist.
    """
    # Per vertical ('i', 'w', 'o', 'v'): dwell times when the click was on
    # rank 0 ('on_dwell') or elsewhere ('off_dwell'), their counts, dwell
    # times and click counts for ranks 0-4, and clicks per SERP.
    vertical_stats = { 'i' : { 'on_dwell': [], 'off_dwell':[] ,\
        'on_count':0.0, 'off_count':0.0, \
        'pos_dwell' : {0:[],1:[], 2:[], 3:[], 4:[]},\
        'all_clicks':[], \
        'clicks':{0:0.0,1:0.0, 2: 0.0, 3:0.0, 4:0.0}}, \
        'w' : { 'on_dwell': [], 'off_dwell':[],'on_count':0.0,\
        'off_count':0.0 ,'all_clicks':[],\
        'pos_dwell' : {0:[],1:[], 2:[], 3:[], 4:[]} ,\
        'clicks':{0:0.0,1:0.0, 2: 0.0, 3:0.0, 4:0.0}}, \
        'o' : { 'on_dwell': [], 'off_dwell':[],'on_count':0.0,\
        'off_count':0.0,'pos_dwell' : {0:[],1:[], 2:[], 3:[], 4:[]},\
        'all_clicks':[],\
        'clicks':{0:0.0,1:0.0, 2: 0.0, 3:0.0, 4:0.0} }, \
        'v' : { 'on_dwell': [], 'off_dwell':[],'on_count':0.0,\
        'off_count':0.0 ,'all_clicks':[], \
        'pos_dwell' : {0:[],1:[], 2:[], 3:[], 4:[]},\
        'clicks':{0:0.0,1:0.0, 2: 0.0, 3:0.0, 4:0.0}, \
        } \
        }

    # Group by user_id and task_id and sort by time within each group.
    grouped_table = concat_table.groupby(['user_id','task_id'])
    for name, group in grouped_table:
        group = group.sort('time')
        rows = []
        # doc_pos -> result row of the SERP seen so far in this group
        results = {}
        serp_clicks = 0
        vert_type = None
        # click url -> {'rank', 'type', 'time': [dwell seconds]} for the current SERP
        recorded_clicks = {}
        # Materialise the rows so that later rows can be looked ahead by index.
        for index, row in group.iterrows():
            rows.append(row)

        for i in range(len(rows)):
            row = rows[i]
            # Store results.
            if row['type'] == 'results':
                results[row['doc_pos']] = row

            if row['type'] == 'results' and row['doc_pos'] == 0:
                # A new SERP starts: fold the clicks recorded for the
                # previous SERP into the per-vertical statistics.
                # For each page find time it was tapped or clicked.
                # Take the min for dwell time.
                vtype = None
                for curl, stats in recorded_clicks.items():
                    vtype = stats['type']
                    if stats['rank'] < 5:
                        vertical_stats[vtype]['pos_dwell'][stats['rank']].append(min(stats['time']))
                        vertical_stats[vtype]['clicks'][stats['rank']]+=1
                    if stats['rank'] == 0:
                        vertical_stats[vtype]['on_dwell'].append(min(stats['time']))
                        vertical_stats[vtype]['on_count']+=1.0
                    else:
                        vertical_stats[vtype]['off_dwell'].append(min(stats['time']))
                        vertical_stats[vtype]['off_count']+=1.0
                if vtype and serp_clicks > 0:
                    vertical_stats[vtype]['all_clicks'].append(serp_clicks)
                recorded_clicks = {}
                serp_clicks = 0
                # Vertical of the new SERP is the type of its first result.
                vert_type = str(row['doc_type']).strip()

            # Found a tap or a click
            start_time = row['time']
            end_time = None
            found = False
            click_rank = None
            click_url = None
            if (row['type'] == 'event' and 'tap' in row['event_type'] and\
                row['element'] > -1) or (row['type'] == 'click') :
                if row['type'] == 'event':
                    # Tap events carry the tapped result position in 'element'.
                    click_url = results[row['element']]['doc_url']
                    click_rank = int(row['element'])
                if (row['type'] == 'click'):
                    # Click rows carry the rank after the first '_' of doc_id.
                    click_rank = int(row['doc_id'][row['doc_id'].find('_')+1:])
                    click_url = row['doc_url']

                # Check if page response for this url has been submitted.
                # Scan forward until the next 'results' row.
                j = i+1
                while (j < len(rows)) and (not (rows[j]['type'] == 'results')):
                    if rows[j]['type'] == 'page_response':
                        # Accept a substring match or a small edit distance
                        # between the clicked url and the response url.
                        if (rows[j]['doc_url'] in click_url) or\
                            editdistance.eval(click_url, rows[j]['doc_url']) < 20:
                            found = True
                            end_time = rows[j]['time']
                            break
                    if rows[j]['type'] == 'event' and 'tap' not in rows[j]['event_type']:
                        found = True
                        end_time = rows[j]['time']
                        break
                    if rows[j]['type'] == 'task_response':
                        # user did not provide page or serp feedback :(
                        found = True
                        end_time = rows[j]['time']
                        break
                    if found :
                        break
                    j+=1

                if found and end_time:
                    if click_url not in recorded_clicks:
                        recorded_clicks[click_url] ={'rank':None, 'type':None,'time':[]}
                    recorded_clicks[click_url]['time'].append((end_time-start_time).total_seconds())
                    recorded_clicks[click_url]['rank']= click_rank
                    recorded_clicks[click_url]['type']= vert_type
                    # Number of distinct urls clicked on the current SERP.
                    serp_clicks = len(recorded_clicks)
                else:
                    print 'Cannot find in responses', click_url,\
                        row['user_id'], row['task_id'], row['type']

        # NOTE(review): the block below is a disabled copy of the flush above.
        # With it disabled, clicks recorded after the last SERP of a group
        # appear never to reach vertical_stats -- confirm this is intended.
        '''
        vtype = None
        for curl, stats in recorded_clicks.items():
            vtype = stats['type']
            if stats['rank'] < 5:
                vertical_stats[vtype]['pos_dwell'][stats['rank']].append(min(stats['time']))
                vertical_stats[vtype]['clicks'][stats['rank']]+=1
            if stats['rank'] == 0:
                vertical_stats[vtype]['on_dwell'].append(min(stats['time']))
                vertical_stats[vtype]['on_count']+=1.0
            else:
                vertical_stats[vtype]['off_dwell'].append(min(stats['time']))
                vertical_stats[vtype]['off_count']+=1.0
        if vtype and serp_clicks > 0:
            vertical_stats[vtype]['all_clicks'].append(serp_clicks)
        '''

    # Mean and standard deviation of on/off dwell times per vertical.
    for vertical, val_dict in vertical_stats.items():
        print vertical, 'on-dwell',np.mean(val_dict['on_dwell']),\
            np.std(val_dict['on_dwell']),'off-dwell', np.mean(val_dict['off_dwell']), \
            np.std(val_dict['off_dwell']), val_dict['on_count'],\
            val_dict['off_count']

    # Kruskal-Wallis test for every unordered pair of distinct verticals.
    verticals = ['i','o','w','v']
    for i in range(len(verticals)):
        v1 = verticals[i]
        for j in range(i+1, len(verticals)):
            v2 = verticals[j]
            for attribute in ['on_dwell','off_dwell', 'all_clicks']:
                print 'Krusk wallis',v1, v2,attribute, \
                    kruskalwallis(vertical_stats[v1][attribute], \
                    vertical_stats[v2][attribute])

    for vert_type, stats in vertical_stats.items():
        # Per-position dwell times compared against vertical 'o'.
        for pos , array in stats['pos_dwell'].items():
            print 'Man pos dwell ',vert_type, pos, np.median(array), \
                kruskalwallis(array,vertical_stats['o']['pos_dwell'][pos])
        # Turn click counts into a share of all recorded clicks of the vertical.
        # NOTE(review): divides by zero when a vertical has no clicks -- confirm inputs.
        for pos in stats['clicks'].keys():
            vertical_stats[vert_type]['clicks'][pos]/= (stats['on_count']+\
                stats['off_count'])

    PlotClickDist(vertical_stats)
def FindPageMetricsPerVertical(result_table, page_table): # Concat result and page tables result_table['type'] = 'results' page_table['type'] = 'page_response' concat_table = pd.concat([result_table, page_table], ignore_index = True) concat_table.to_csv('concat_result_page',encoding='utf-8', index = False) # Group by user_id and task_id and sort by time within each group grouped_table = concat_table.sort(['time']).groupby(['user_id','task_id']) # Set vertical type for each serp. vert_type = None concat_table['first_result_type'] = '' # Iterate over all users and tasks for name, group in grouped_table: # Iterate over all page response results # for a specific user and a task for pindex, prow in group.iterrows(): if prow['type'] == 'results' and prow['doc_pos'] == 0: vert_type = prow['doc_type'] # Skip if the row is not a page_response # Skip if its an invalid page # serp pages are invalid pages # because no doc_pos for serp if prow['type'] != 'page_response' or IsSerpPage(prow['doc_url']): continue ptime = prow['time'] purl = prow['doc_url'] ppos = -1 # Find the doc_pos from result entry for rindex, rrow in group.iterrows(): # Get the doc_pos from the result entry # whose url matches with the page response url if rrow['type'] == 'results' and (purl in rrow['doc_url']): ppos = rrow['doc_pos'] # Search only those result entries which # has timestamp lower than the page response time if rrow['time'] > ptime: break # if ppos = -1 that means we did not find the match # TODO: handle ppos=-1 case #prow['doc_pos'] = ppos concat_table.set_value(pindex,'doc_pos',ppos) concat_table.set_value(pindex,'first_result_type',vert_type) # Filter rows with page responses. 
page_responses = concat_table[concat_table['type'] == 'page_response'] page_responses = page_responses[page_responses['first_result_type'].str.len() == 1] first_rel_group = page_responses[page_responses['doc_pos'] ==0].groupby(\ ['first_result_type', 'response_type']) last_rel_group = page_responses[page_responses['doc_pos']>0].groupby(\ ['first_result_type','response_type']) print page_responses[page_responses['doc_pos'] ==0].groupby(\ ['first_result_type', 'response_type']).agg({ # Find the mean and std rel and satisfaction. 'response_value' : { 'mean': 'mean', 'std-dev' : 'std', 'count' : 'count' } }) print page_responses[page_responses['doc_pos']>0].groupby(\ ['first_result_type','response_type']).agg({ # Find the mean and std rel and satisfaction. 'response_value' : { 'mean': 'mean', 'std-dev' : 'std', 'count' : 'count' } }) verticals = ['i','o','w','v'] for i in range(len(verticals)): v1 = verticals[i] for j in range(i, len(verticals)): v2 = verticals[j] for attribute in ['relevance','satisfaction']: print 'Krusk wallis',v1, v2,attribute,'first_rank', \ kruskalwallis(first_rel_group.get_group((v1,attribute))['response_value'],\ first_rel_group.get_group((v2, attribute))['response_value']) print 'Krusk wallis', v1, v2, attribute, 'rel_off_rank',\ kruskalwallis(last_rel_group.get_group((v1,attribute))['response_value'],\ last_rel_group.get_group((v2,attribute))['response_value']) print 'Man sat_first_rank v-o',\ pearsonr(first_rel_group.get_group(('v','satisfaction'))['response_value'],\ first_rel_group.get_group(('v','relevance'))['response_value']) print 'Man sat_first_rank w-o',\ (first_rel_group.get_group(('w','satisfaction'))['response_value'],\ first_rel_group.get_group(('w','relevance'))['response_value']) print 'Man sat_first_rank o-o',\ kendalltau(first_rel_group.get_group(('o','satisfaction'))['response_value'],\ first_rel_group.get_group(('o','relevance'))['response_value']) # Find the variation in page satisfaction and relevance for # each position 
per vertical. rank_level_rel_and_sat = page_responses[page_responses['doc_pos']< 3].groupby(\ ['first_result_type', 'response_type', 'doc_pos']).agg({ # Find the mean and std rel and satisfaction. 'response_value' : { 'mean': 'mean', 'std-dev' : 'std', 'count' : 'count' } }) rank_level_rel_and_sat.reset_index().to_csv('vert_level_pos_level_rel_and_sat.csv',index='False') PlotSatAndRelBoxPlotPerVertical(first_rel_group, 'Page Response',\ 'rel_sat_first_pos.png')
def FindVisiblityMetricsPerVertical(result_table,vis_event_table):
    """Compute card visibility counts and viewport times per vertical.

    Result rows and visibility-event rows are concatenated, grouped by
    user_id and replayed in time order.  A 'results' row starts a new
    session; the rows in between update the visibility and timing state of
    up to 10 cards through ``UpdateCardVisibility`` and ``UpdateCardTime``.
    At every session boundary the accumulated values are stored under the
    vertical (doc_type of the 'results' row) of the session that ended.

    The function prints the per-vertical visibility arrays, mean and median
    visible times, pairwise Kruskal-Wallis results for positions 0-3 and
    finally calls ``PlotMultipleBoxPlotsPerVertical``.

    NOTE(review): the source was recovered without indentation; the nesting
    below (in particular where the per-card times are appended) is inferred
    and should be confirmed.

    :param result_table: DataFrame of SERP result rows.
    :param vis_event_table: DataFrame of visibility event rows.
    :return: None.
    """
    concat_table = pd.concat([result_table, vis_event_table], ignore_index = True)

    # Initialize visiblity metric
    # Stores #sessions in which
    # each card was visible
    visibility = {}
    visibility['i'] = np.zeros(10)
    visibility['v'] = np.zeros(10)
    visibility['w'] = np.zeros(10)
    visibility['o'] = np.zeros(10)

    # Initialize time metric
    # Stores the total time
    # for which each card was visible
    visible_time = {}
    visible_time['i'] ={ 0: [], 1: [] , 2: [], 3: [], 4: [] ,5:[], 6:[], 7:[], 8:[],9:[]}
    visible_time['v'] ={ 0: [], 1: [] , 2: [], 3: [], 4: [] ,5:[], 6:[], 7:[], 8:[],9:[]}
    visible_time['w'] ={ 0: [], 1: [] , 2: [], 3: [], 4: [] ,5:[], 6:[], 7:[], 8:[],9:[]}
    visible_time['o'] ={ 0: [], 1: [] , 2: [], 3: [], 4: [] ,5:[], 6:[], 7:[], 8:[],9:[]}

    grouped_table = concat_table.groupby(['user_id'])
    for name, group in grouped_table:
        group = group.sort('time')

        # Top vertical in the session
        top_vert = None
        # time of the previous event in the session
        prev_time = None

        # card visibility for the session
        # 1: visible 0: invisible
        card_vis = np.zeros(10)

        # card status and time for the session
        # card_status stores time when it became visible or None if its invisible
        # card_time stores time in seconds
        card_status = 10*[None]
        card_time = np.zeros(10)

        # Process sessions for this user
        for index, row in group.iterrows():
            # Row type 'results' indicates the start of a new session
            if row['type'] == 'results':
                # Save results of a previous session
                if top_vert != None:
                    visibility[top_vert] = visibility[top_vert] + card_vis
                    card_vis = np.zeros(10)

                    # Compute the time for cards that were visible
                    # at the end of the previous session
                    for cid in range(0,10):
                        if card_status[cid] != None:
                            # First try the gap up to the new session's start ...
                            time_diff = (row['time']-card_status[cid]).total_seconds()
                            if time_diff < MAX_CARD_DWELL_TIME:
                                card_time[cid] = card_time[cid] + time_diff
                            else:
                                # ... then the gap up to the last row seen ...
                                time_diff = (prev_time-card_status[cid]).total_seconds()
                                if time_diff < MAX_CARD_DWELL_TIME:
                                    card_time[cid] = card_time[cid] + time_diff
                                else:
                                    # setting the dwell time to default
                                    card_time[cid] = card_time[cid] + DEFAULT_CARD_DWELL_TIME
                        visible_time[top_vert][cid].append(card_time[cid])

                    card_status = 10*[None]
                    card_time = np.zeros(10)

                top_vert = row['doc_type']

            # Otherwise it is the event row
            # Update stats of the current session
            else:
                card_vis = UpdateCardVisibility(row['event_value'],card_vis)
                card_status, card_time = UpdateCardTime(row['event_value'],card_status,card_time,row['time'])

            prev_time = row['time']

        # Save results of the last session of this user
        if top_vert != None:
            visibility[top_vert] = visibility[top_vert] + card_vis

            # This is the last session of this user
            # so we do not have enough information
            # to compute dwell time for the cards
            # that were visible at the end of the session
            # We simply add default card dwell time
            for cid in range(0,10):
                if card_status[cid] != None:
                    card_time[cid] = card_time[cid] + DEFAULT_CARD_DWELL_TIME
                visible_time[top_vert][cid].append(card_time[cid])

    # Python 2: keys() is a list, so it can be indexed below.
    verticals = visible_time.keys()
    for vertical in verticals:
        print vertical ,visibility[vertical]

    # Total visible time per card divided by the visibility count of card 0.
    for vertical in verticals:
        print vertical,' '.join([str(round(sum(card_times)/visibility[vertical][0],3))\
            for card_times in visible_time[vertical].values()])

    # Median visible time per card.
    for vertical in verticals:
        print vertical,' '.join([str(round(np.median(card_times),3))\
            for card_times in visible_time[vertical].values()])

    for vertical in verticals:
        print 'Median time',vertical, np.median(visible_time[vertical][0])

    # Kruskal-Wallis test per pair of verticals for the first four positions.
    # NOTE(review): j starts at i, so each vertical is also compared with itself.
    for i in range(len(verticals)):
        v1 = verticals[i]
        for j in range(i, len(verticals)):
            v2 = verticals[j]
            for pos in range(4):
                print 'Man visibilit time', v1, v2, pos,\
                    kruskalwallis(visible_time[v1][pos],visible_time[v2][pos])

    PlotMultipleBoxPlotsPerVertical(visible_time, [1,2,3,4,5],'Document Positions',\
        'Viewport Time (sec)','','view_port_time.png')
100.0, 98.507462686567166, 98.507462686567166, 97.761194029850742, 97.761194029850742],\ [90.370370370370367, 97.037037037037038, 94.074074074074076, 95.555555555555557, 91.111111111111114,\ 95.555555555555557, 97.037037037037038, 97.777777777777771, 95.555555555555557, 94.81481481481481],\ [93.814432989690715, 91.75257731958763, 93.814432989690715, 93.814432989690715, 93.814432989690715,\ 89.69072164948453, 96.907216494845358, 92.783505154639172, 95.876288659793815, 98.969072164948457]\ ]) ############### Tool's MAX.ACC PER CORPUS ################# #max_accs_array = np.array([ [ 95.26, 95.56, 95.70, 95.18, 95.85, 98.13, 94.88, 94.12 ],\ # [ 91.53, 90.84, 89.16, 90.23, 91.03, 91.99, 87.09, 90.81 ],\ # [ 91.47, 92.00, 91.73, 92.27, 90.67, 87.43, 88.27, 91.23 ],\ # [ 92.00, 93.60, 92.27, 93.33, 92.27, 95.27, 89.33, 92.57 ],\ # ]) print spm.kruskalwallis(max_accs_array[0],max_accs_array[1],max_accs_array[2],max_accs_array[3],\ max_accs_array[4],max_accs_array[6],max_accs_array[7]) print spm.kruskalwallis(max_accs_array[5],max_accs_array[2]) #print sps.wilcoxon(max_accs_array[1],max_accs_array[4]) print spm.mannwhitneyu(max_accs_array[5],max_accs_array[2]) #, use_continuity=True) 0/0 #print max_accs_array #import matplotlib.pyplot as plt #plt.hist(max_accs_array[1], 10) #bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, hold)) #plt.figure() #plt.plot([1,2,3,4,5,6,7,8,9,10],max_accs_array[7] ) #, histtype='bar', rwidth=0.8) #plt.show() print max_accs_array, "\n"
print 'AUC: ' + str(auc) from statsmodels.stats.multicomp import pairwise_tukeyhsd from scipy.stats.mstats import kruskalwallis, friedmanchisquare Array = ['NL','BPH','HGPIN','G3','G4','G5'] multiarea = magi_area['NL'].append(magi_area['BPH']) multiarea = multiarea.append(magi_area['HGPIN']) multiarea = multiarea.append(magi_area['G3']) multiarea = multiarea.append(magi_area['G4']) multiarea = multiarea.append(magi_area['G5']) multiarea = multiarea.dropna() multilesion = list() a = 0 while a < 6: column = Array[a] coldata = magi_area[column] coldata = coldata.dropna() for deet in coldata: multilesion.append(a) a = a + 1 print pairwise_tukeyhsd(multiarea, multilesion) print kruskalwallis(magi_area['NL'].dropna(), magi_area['BPH'].dropna(), magi_area['HGPIN'].dropna(), magi_area['G3'].dropna(), magi_area['G4'].dropna(), magi_area['G5'].dropna()) print kruskalwallis(magi_stain['NL'].dropna(), magi_stain['BPH'].dropna(), magi_stain['HGPIN'].dropna(), magi_stain['G3'].dropna(), magi_stain['G4'].dropna(), magi_stain['G5'].dropna())
def ClusterAssociations(Raw, Symbols, Types, Labels, Tau=0.05):
    """
    Examines associations between cluster assignments of samples and
    copy-number and mutation events.

    Parameters
    ----------
    Raw : array_like
        Numpy array containing raw, unnormalized feature values. These are
        used to examine associations between feature values and cluster
        assignments. Features are in columns and samples are in rows.
    Symbols : array_like
        List containing strings describing features. See Notes below for
        restrictions on symbol names.
    Types : array_like
        List containing strings describing feature types (e.g. CNV, Mut,
        Clinical). Only features typed 'Mut' or 'CNV' are tested.
    Labels : array_like
        Numpy array of cluster labels for the samples in 'Raw'. Clusters are
        taken to be labelled 1, 2, ..., max(Labels).
    Tau : scalar
        Threshold for statistical significance when examining cluster
        associations.

    Returns
    -------
    Significant : array_like
        List of copy number and mutation features from 'Raw' that are
        significantly associated with the clustering 'Labels'.
    SigTypes : array_like
        List of types for significant features.

    Notes
    -----
    Types like 'Mut' and 'CNV' that are generated as suffixes to feature
    names by the package tcgaintegrator are required for analysis.

    Mutations are tested with a chi-square test of independence on the
    2 x K table of (not mutated / mutated) by cluster. Copy number values
    are tested with a Kruskal-Wallis test across the K clusters; with fewer
    than two clusters there is nothing to compare and no CNV is reported.

    See Also
    --------
    RiskCohort, RiskCluster
    """

    # initialize list of symbols with significant associations and their types
    Significant = []
    SigTypes = []

    # identify mutations and CNVs
    Mutations = [index for index, tpe in enumerate(Types) if tpe == "Mut"]
    CNVs = [index for index, tpe in enumerate(Types) if tpe == "CNV"]

    # number of clusters and their labels
    K = int(np.max(Labels))
    Clusters = np.arange(1, K + 1)

    # test mutation associations
    for Index in Mutations:

        # build contingency table - row 0 counts wild-type samples, row 1
        # counts mutated samples, one column per cluster
        Observed = np.zeros((2, K))
        for j in Clusters:
            Values = Raw[Labels == j, Index]
            Observed[0, j - 1] = np.sum(Values == 0)
            Observed[1, j - 1] = np.sum(Values == 1)

        # expected counts under independence
        ClusterTotals = np.sum(Observed, axis=0)
        StatusTotals = np.sum(Observed, axis=1)
        Expected = np.outer(StatusTotals, ClusterTotals) / np.sum(Observed)

        # perform test - with axis=None chisquare uses 2K - 1 - ddof degrees
        # of freedom, so ddof=K yields the (2 - 1) * (K - 1) = K - 1 degrees
        # of freedom of a 2 x K independence test
        stat, p = chisquare(Observed, Expected, ddof=K, axis=None)
        if p < Tau:
            Significant.append(Symbols[Index])
            SigTypes.append(Types[Index])

    # copy number associations - a comparison needs at least two clusters
    if K >= 2:
        for Index in CNVs:

            # separate out CNV values by cluster; kruskalwallis accepts any
            # number of samples as positional arguments
            Groups = [Raw[Labels == j, Index] for j in Clusters]
            stat, p = kruskalwallis(*Groups)
            if p < Tau:
                Significant.append(Symbols[Index])
                SigTypes.append(Types[Index])

    # return names of features with significant associations
    return Significant, SigTypes
# output = open(args["index"], "w")

# One (image file name, H statistic, p-value) tuple per image.
kw = []

# Visit every JPEG in the dataset directory.
# for imagePath in glob.glob(args["dataset"] + "/*.jpg"):
for imagePath in glob.glob(dataset + "/*.jpg"):
    # File name = everything after the last "/" (the whole path if none).
    imageID = imagePath.rsplit("/", 1)[-1]
    image = cv2.imread(imagePath)

    # Describe the image and run the Kruskal-Wallis test on its features.
    features = cd.describe(image)
    result = kruskalwallis(features)
    h_stat = result[0]
    p_value = result[1]
    kw.append((imageID, h_stat, p_value))

    # kw = [str(f) for f in kw]
    # output.write("%s, %s\n" % (imageID, ",".join(kw)))

    # features = [str(f) for f in features]
    # output.write("%s, %s\n" % (imageID, ",".join(features)))

# output = open(args["index"], "w")

# # order by
# kw = sorted(kw, key=lambda l:l[1])

# with open(args["index"], 'wb') as f:
#     wr = csv.writer(f)
#     wr.writerows(kw)
def ComputePreClickDistributions(merged_table):
    """Analyse what happened before the first click on each SERP.

    The table is grouped by (user_id, task_id) and replayed in time order.
    For every SERP (started by a 'results' row) the function records the
    time from the first event to the first tap/click, the position of that
    click and the largest result index that was visible before it.  The
    tuples are collected per vertical (doc_type of the 'results' row).

    It then prints Kruskal-Wallis results comparing every vertical with
    vertical 'o' (last viewed rank, and time-to-click per click rank) and
    calls ``PlotVerticalLevelAttributeBoxPlot``.

    NOTE(review): the source was recovered without indentation; the nesting
    below is inferred from the logic and should be confirmed.  The code also
    compares None with integers (max(None, n), None > -1), which only works
    under Python 2 ordering rules.

    :param merged_table: DataFrame with at least the columns read below
        ('user_id', 'task_id', 'time', 'type', 'doc_type', 'doc_id',
        'event_type', 'element', 'visible_elements').
    :return: None.
    """
    # Compute before each click the scatter plot of
    # time before click and max rank.
    # vertical -> list of (time to first click, click pos, last visible pos)
    first_click_time_and_pos = {'i':[], 'v':[], 'w':[], 'o':[]}
    grouped_table = merged_table.groupby(['user_id','task_id'])
    for name, group in grouped_table:
        group = group.sort('time')
        first_click_pos = None
        first_click_time = None
        first_result_type = None
        first_event_time = None
        last_result_before_click = None
        for index, row in group.iterrows():
            # Get the time and max visible result pos for each
            # vertical.
            if row['type'] == 'results':
                # A new SERP starts: store what was collected for the
                # previous one, then reset the state.
                if first_click_time and last_result_before_click > -1 \
                    and first_click_pos > -1 and first_result_type:
                    first_click_time_and_pos[first_result_type].append(\
                        (first_click_time, first_click_pos, last_result_before_click))
                first_click_pos = None
                first_click_time = None
                first_event_time = None
                first_result_type = None
                last_result_before_click = None
                first_result_type = row['doc_type']

            # Events are only considered until the first click is found.
            if row['type'] == 'event' and (first_click_pos == None):
                # Record time of first event.
                if not first_event_time:
                    first_event_time = row['time']
                # Find the maximum result visible.
                # The index is the number after the last '_' of each entry.
                if len(row['visible_elements']) > 0:
                    for entry in row['visible_elements'].split():
                        last_result_before_click = max(last_result_before_click,\
                            int(entry[entry.rfind('_')+1:]))

                event_type = row['event_type']
                if 'tap' in event_type and (first_click_pos == None):
                    # set the time.
                    first_click_time = (row['time'] - first_event_time).total_seconds()
                    first_click_pos = int(row['element'])

            if (row['type'] == 'click') and (first_click_pos == None):
                # NOTE(review): first_event_time is None if no event row
                # preceded this click -- confirm that cannot happen.
                first_click_time = (row['time'] - first_event_time).total_seconds()
                first_click_pos = int(row['doc_id'][row['doc_id'].rfind('_')+1:])

        # Store what was collected for the last SERP of the group.
        if first_click_time and last_result_before_click > -1 \
            and first_click_pos > -1 and first_result_type:
            first_click_time_and_pos[first_result_type].append(\
                (first_click_time, first_click_pos, last_result_before_click))

    # Scatter Plots did not lead to any information or significant difference
    # between different verticals.
    # Plot the following scatter plots:
    # 1. Time to rank viewed.
    scatter1 = {}
    last_viewed_result = {}
    for result_type, time_and_pos_array in first_click_time_and_pos.items():
        # format vert_type : {pos : [time list (sec)]}
        scatter1[result_type] ={}
        last_viewed_result[result_type] =[]
        # Sort by last viewed result (the format is time, pos, last_viewd_rank)
        sorted_tuple_by_view_rank = sorted(time_and_pos_array, key = lambda x : x[2])
        for sorted_tuple in sorted_tuple_by_view_rank:
            # Convert 0-based positions to 1-based ranks.
            click_rank = sorted_tuple[1] +1
            view_rank = sorted_tuple[2] +1
            last_viewed_result[result_type].append(view_rank)
            if click_rank not in scatter1[result_type]:
                scatter1[result_type][click_rank] = []
            # Time should be less than 100 seconds
            # NOTE(review): no such filter is applied here -- confirm.
            scatter1[result_type][click_rank].append(sorted_tuple[0])

    # Compare every vertical (including 'o' itself) against vertical 'o'.
    for vert, dictionary in scatter1.items():
        print 'krusk walllis ', vert, kruskalwallis(last_viewed_result[vert],last_viewed_result['o'])
        for rank, array in dictionary.items():
            if rank in scatter1['o']:
                print 'krusk walllis ', vert,rank, kruskalwallis(array,scatter1['o'][rank])

    PlotVerticalLevelAttributeBoxPlot(last_viewed_result,'', 11, ['Lowest Examined Snippet'],\
        'Snippet Rank', '', 'last_viewed_snippet.png')
# Determine the settings from the filename problem, dup, ordering, nodes, mut, seed = base.split('_') with open_file_method(filename)(filename, 'r') as f: data = json.load(f) version = dup, ordering, nodes, mut if (dup, ordering) == ('skip', 'normal'): control_group = version statify[version].append(data[1]['evals']) active[version].append(data[1]['phenotype']) best = data[1]['bests'][-1] test = data[1]['test_inputs'] individual = Individual.reconstruct_individual(best, test) simplified = individual.new(Individual.simplify) reduced[version].append(len(simplified.active)) filecount += 1 except ValueError: print filename, "FAILED" # Kruskal's requires a rectangular matrix rect = make_rectangular(statify.values(), 10000001) print 'Files Successfully Loaded', filecount print 'Kruskal Wallis', kruskalwallis(rect) for version, data in statify.iteritems(): print '--------- %s ---------' % str(version) print "MES, MAD", median_deviation(data) print 'Active', median_deviation(active[version]) print 'Reduced', median_deviation(reduced[version]) print 'Mann Whitney U against Control', print mannwhitneyu(statify[control_group], data)
def uw_tier_histplots():
    """Plot Kaplan-Meier survival curves of IPO duration per underwriter tier.

    Reads the module-level ``sample`` DataFrame (columns
    ``lead_underwriter_tier`` and ``IPO_duration``), adds the display columns
    ``'Underwriter Tier'`` and ``'IPO Duration'`` to it, runs a Kruskal-Wallis
    test of IPO duration across the four tiers, and draws one Kaplan-Meier
    curve per tier.  Quartile call-outs are drawn for the "-1" and "9" tiers
    only.  The figure is written to ``IPO_tiers_KP_survival.pdf``.

    :return: None
    """
    sample['Underwriter Tier'] = sample['lead_underwriter_tier']
    sample['IPO Duration'] = sample['IPO_duration']

    ranks = ["-1", "0+", "7+", "9"]
    ranklabels = ['No Underwriter', 'Low Rank', 'Mid Rank', 'Rank 9 (elite)']

    def uw_tier_duration(x):
        return sample[sample.lead_underwriter_tier == x]['IPO_duration']

    # (H statistic, p-value) of IPO duration across the four tiers.
    kwstat = kruskalwallis(*[uw_tier_duration(x) for x in ranks])

    from lifelines.estimation import KaplanMeierFitter
    import matplotlib.pyplot as plt

    kmf = KaplanMeierFitter()
    f, ax = plt.subplots(1, 1, figsize=(12, 4), sharex=True)

    line_width = 1  # annotation line thickness

    # Call-out text and the survival level it points at, ordered to match
    # ``quartiles`` below (75th, 50th, 25th percentile of the durations).
    quartile_marks = [
        ("75%: {} days", 0.25),
        ("50%: {} days", 0.50),
        ("25%: {} days", 0.75),
    ]
    # Per-tier text offsets (days, survival probability) for each call-out;
    # tiers without an entry get no call-outs.
    text_offsets = {
        "-1": [(145, .04), (145, .04), (145, 0.04)],
        "9": [(415, .1), (290, .1), (165, 0.1)],
    }

    for rank, rlabel, color in zip(ranks, ranklabels, cp_four("cool_r")):
        uw = sample[sample.lead_underwriter_tier == rank]

        kmf.fit(uw['IPO_duration'],
                label='{} N={}'.format(rlabel, len(uw)),
                alpha=0.9)
        kmf.plot(ax=ax, c=color, alpha=0.7)

        quartiles = [int(np.percentile(kmf.durations, x)) for x in [25, 50, 75]][::-1]
        aprops = dict(facecolor=color, width=line_width, headwidth=line_width)

        for (text, level), days, (dx, dy) in zip(
                quartile_marks, quartiles, text_offsets.get(rank, ())):
            plt.annotate(text.format(days),
                         (days, level),
                         xytext=(days + dx, level + dy),
                         arrowprops=aprops)

    plt.annotate("Kruskal-Wallis\nH: {:.3f}\nprob: {:.3f}".format(*kwstat), (960, 0.1))
    plt.ylim(0, 1)
    plt.xlim(0, 1095)
    plt.title("Kaplan-Meier survival times by bank tier")
    plt.xlabel("IPO Duration (days)")
    plt.ylabel(r"$S(t)=Pr(T>t)$")
    plt.savefig("IPO_tiers_KP_survival.pdf", format='pdf', dpi=200)
# Plot the data:
my_colors = 'rgbkymc'  # red, green, blue, black, yellow, magenta, cyan
s.plot(
    kind='bar',
    color=my_colors,
)
plt.show()

from scipy.stats import mstats

Col_1 = [1, 2, 3]
Col_2 = [1000, 20000, 1000]
Col_3 = [100, 203, 109]
Col_4 = [1, 3, 5]

print("Kruskal Wallis H-test test:")

H, pval = mstats.kruskalwallis(Col_1, Col_2, Col_3, Col_4)

print("H-statistic:", H)
print("P-Value:", pval)

# Exactly one verdict is printed; a p-value of exactly 0.05 is not significant.
if pval < 0.05:
    print("Reject NULL hypothesis - Significant differences exist between groups.")
else:
    print("Accept NULL hypothesis - No significant difference between groups.")
"""Kruskal-Wallis test example, for data that are not normally distributed."""

# Author: Thomas Haslwanter
# Date:   March-2013
# Ver:    1.0

from scipy.stats.mstats import kruskalwallis
from numpy import array

# Example data taken from
# http://www.brightstat.com/index.php?option=com_content&task=view&id=41&Itemid=1&limit=1&limitstart=2
city1, city2, city3, city4 = (
    array(values)
    for values in (
        [68, 93, 123, 83, 108, 122],
        [119, 116, 101, 103, 113, 84],
        [70, 68, 54, 73, 81, 68],
        [61, 54, 59, 67, 59, 70],
    )
)

# Run the test: h is the H statistic, p the corresponding p-value.
result = kruskalwallis(city1, city2, city3, city4)
h, p = result

# Report the outcome at the 5% significance level.
verdict = (
    'There is a significant difference between the cities.'
    if p < 0.05
    else 'No significant difference between the cities.'
)
print(verdict)
stepsize = arg_map["stepsize"] log("Computing Kruskal-Wallis test on windows of size %d and step %d" % (windowsize, stepsize)) testresults = [] for window in BSWindowGen(positions, windowsize, stepsize): covcheck = lambda c: c >= arg_map["mincov"] filt_pos = filter(lambda position: all(map(covcheck, map(baseCoverage, position.samples))), window.positions) if not len(filt_pos) >= arg_map["minwinsites"]: continue pos_by_sample = zip(*map(attrgetter("samples"), filt_pos)) methyl_by_sample = map(lambda bases: map(methValue, bases), pos_by_sample) try: (h, p) = kruskalwallis(*methyl_by_sample) except Exception as e: sys.stderr.write("Error: %s \n" % e) continue testresults.append(KWResult(p, h, window)) log("Sorting Results") testresults.sort() log("Writing Output") m = float(len(testresults)) q = arg_map["q"] k = 0 for result in testresults:
def KWtest(Matrixs, Words, WordLists, option='CustomP', Low=0.0, High=1.0):
    """
    Run the Kruskal-Wallis test on every word that passes the frequency filter.

    :param Matrixs: every element is a group matrix that contains the word counts; each row of a group matrix
                    represents a segment. Column ``i`` holds the counts of ``Words[i - 1]``; column 0 is skipped
                    (presumably a label column -- confirm against the caller).
    :param Words: all the words (Matrixs and Words are parallel)
    :param WordLists: a list of dictionaries mapping each word to its word count; each dictionary represents the
                      information inside a segment
    :param option: preset for High and Low (see the documentation of High and Low)
                    1. using standard deviation to find outliers
                        TopStdE: only analyze the right outliers, determined by standard deviation
                                    (word frequency > average + 2 * Standard_Deviation)
                        MidStdE: only analyze the non-outliers, determined by standard deviation
                                    (average + 2 * Standard_Deviation > word frequency > average - 2 * Standard_Deviation)
                        LowStdE: only analyze the left outliers, determined by standard deviation
                                    (average - 2 * Standard_Deviation > word frequency)

                    2. using IQR to find outliers *THIS METHOD DOES NOT WORK WELL, BECAUSE THE DATA ARE USUALLY HIGHLY SKEWED*
                        TopIQR: only analyze the top outliers, determined by IQR
                                    (word frequency > median + 1.5 * Standard)
                        MidIQR: only analyze the non-outliers, determined by IQR
                                    (median + 1.5 * Standard > word frequency > median - 1.5 * Standard)
                        LowIQR: only analyze the left outliers, determined by IQR
                                    (median - 1.5 * Standard > word frequency)
    :param Low: only words with a frequency higher than this value are analyzed
                (this parameter will be overwritten if the option is not 'Custom')
    :param High: only words with a frequency lower than this value are analyzed
                (this parameter will be overwritten if the option is not 'Custom')

    :return: a list of (word, p-value) tuples sorted by p-value; the p-value is the string 'Invalid' when all
             counts of a word are identical and the test cannot be computed
    :raises ValueError: any other ValueError raised by the test is propagated unchanged
    """
    # begin handle options
    MergeList = merge_list(WordLists)
    TotalWordCount = sum(MergeList.values())
    NumWord = len(MergeList)

    High, Low = wordfilter(option, Low, High, NumWord, TotalWordCount, MergeList)
    # end handle options

    # Size of the largest group; shorter groups are padded with masked entries
    # so that all sample sets have the same length.
    longest = max(len(matrix) for matrix in Matrixs)

    word_pvalue_dict = {}  # word -> p-value

    for i in range(1, len(Matrixs[0][0])):  # focusing on a specific word
        word = Words[i - 1]
        try:
            frequency = MergeList[word]
        except KeyError:
            continue
        if not Low < frequency < High:
            continue

        samples = []
        for group in Matrixs:  # focusing on a group
            # counts of this word in every segment of the group
            sample = [segment[i] for segment in group]
            padding = longest - len(sample)
            samples.append(ma.masked_array(sample + [0] * padding,
                                           mask=[0] * len(sample) + [1] * padding))

        # do the KW test
        try:
            pvalue = kruskalwallis(samples)[1]
        except ValueError as error:
            if error.args[:1] == ('All numbers are identical in kruskal',):
                pvalue = 'Invalid'
            else:
                # re-raise as is, keeping the original traceback
                raise

        # put the result in the dict
        word_pvalue_dict[word.decode('utf-8')] = pvalue

    return sorted(word_pvalue_dict.items(), key=itemgetter(1))
# open output # output = open(args["index"], "w") kw = [] # loop over images # for imagePath in glob.glob(args["dataset"] + "/*.jpg"): for imagePath in glob.glob(dataset + "/*.jpg"): imageID = imagePath[imagePath.rfind("/") + 1:] image = cv2.imread(imagePath) # describe image features = cd.describe(image) temp = kruskalwallis(features) kw.append((imageID, temp[0], temp[1])) # kw = [str(f) for f in kw] # output.write("%s, %s\n" % (imageID, ",".join(kw))) # features = [str(f) for f in features] # output.write("%s, %s\n" % (imageID, ",".join(features))) # output = open(args["index"], "w") # # order by # kw = sorted(kw, key=lambda l:l[1]) # with open(args["index"], 'wb') as f: # wr = csv.writer(f) # wr.writerows(kw)
def KWtest(Matrixs, Words, WordLists, option='CustomP', Low=0.0, High=1.0):
    """
    Run the Kruskal-Wallis test on every word that passes the frequency filter.

    :param Matrixs: every element is a group matrix that contains the word counts; each row of a group matrix
                    represents a segment. Column ``i`` holds the counts of ``Words[i - 1]``; column 0 is skipped
                    (presumably a label column -- confirm against the caller).
    :param Words: all the words (Matrixs and Words are parallel)
    :param WordLists: a list of dictionaries mapping each word to its word count; each dictionary represents the
                      information inside a segment
    :param option: preset for High and Low (see the documentation of High and Low)
                    1. using standard deviation to find outliers
                        TopStdE: only analyze the right outliers, determined by standard deviation
                                    (word frequency > average + 2 * Standard_Deviation)
                        MidStdE: only analyze the non-outliers, determined by standard deviation
                                    (average + 2 * Standard_Deviation > word frequency > average - 2 * Standard_Deviation)
                        LowStdE: only analyze the left outliers, determined by standard deviation
                                    (average - 2 * Standard_Deviation > word frequency)

                    2. using IQR to find outliers *THIS METHOD DOES NOT WORK WELL, BECAUSE THE DATA ARE USUALLY HIGHLY SKEWED*
                        TopIQR: only analyze the top outliers, determined by IQR
                                    (word frequency > median + 1.5 * Standard)
                        MidIQR: only analyze the non-outliers, determined by IQR
                                    (median + 1.5 * Standard > word frequency > median - 1.5 * Standard)
                        LowIQR: only analyze the left outliers, determined by IQR
                                    (median - 1.5 * Standard > word frequency)
    :param Low: only words with a frequency higher than this value are analyzed
                (this parameter will be overwritten if the option is not 'Custom')
    :param High: only words with a frequency lower than this value are analyzed
                (this parameter will be overwritten if the option is not 'Custom')

    :return: a list of (word, p-value) tuples sorted by p-value; the p-value is the string 'Invalid' when all
             counts of a word are identical and the test cannot be computed
    :raises ValueError: any other ValueError raised by the test is propagated unchanged
    """
    # begin handle options
    MergeList = merge_list(WordLists)
    TotalWordCount = sum(MergeList.values())
    NumWord = len(MergeList)

    High, Low = wordfilter(option, Low, High, NumWord, TotalWordCount, MergeList)
    # end handle options

    # Size of the largest group; shorter groups are padded with masked entries
    # so that all sample sets have the same length.
    longest = max(len(matrix) for matrix in Matrixs)

    word_pvalue_dict = {}  # word -> p-value

    for i in range(1, len(Matrixs[0][0])):  # focusing on a specific word
        word = Words[i - 1]
        try:
            frequency = MergeList[word]
        except KeyError:
            continue
        if not Low < frequency < High:
            continue

        samples = []
        for group in Matrixs:  # focusing on a group
            # counts of this word in every segment of the group
            sample = [segment[i] for segment in group]
            padding = longest - len(sample)
            samples.append(ma.masked_array(sample + [0] * padding,
                                           mask=[0] * len(sample) + [1] * padding))

        # do the KW test
        try:
            pvalue = kruskalwallis(samples)[1]
        except ValueError as error:
            if error.args[:1] == ('All numbers are identical in kruskal',):
                pvalue = 'Invalid'
            else:
                # re-raise as is, keeping the original traceback
                raise

        # put the result in the dict
        word_pvalue_dict[word.decode('utf-8')] = pvalue

    return sorted(word_pvalue_dict.items(), key=itemgetter(1))
def classifiers(self): # Get performance data. A less busy person would put this in its own function. if not self.perfsCalculated: self.calculatePerformances() accels = map(lambda x: x["averageAccel"], self.novicePerfs + self.intermediatePerfs + self.expertPerfs) durations = map(lambda x: x["duration"], self.novicePerfs + self.intermediatePerfs + self.expertPerfs) smooths = map(lambda x: x["motionSmoothness"], self.novicePerfs + self.intermediatePerfs + self.expertPerfs) distances = map(lambda x: x["angularDist"], self.novicePerfs + self.intermediatePerfs + self.expertPerfs) ambidexterities = map( lambda x: x["ambidextricity"][0], self.novicePerfs + self.intermediatePerfs + self.expertPerfs ) # Make feature and target vectors X = np.array(zip(accels, durations, smooths, distances, ambidexterities)) y = np.array( [0] * 10 + [0.25, 1.0 / 3, 1.0 / 3, 0.5, 0.5, 0.5, 2, 2, 2, 2] + [6, 6, 6, 8, 8, 8, 10, 10, 10, 6] ) # Experience levels in years # y = np.array([0]*10 + [1]*10 + [2]*10) """ paired = zip(X,y) shuffle(paired) X,y = map(np.array, zip(*paired)) """ def resolveClass(y): if y <= 0.1: return "r" if y <= 2: return "g" else: return "b" # Cross validation all_colours = [] all_regressions = [] indexes = [] skf = cross_validation.StratifiedKFold(map(resolveClass, y), n_folds=10) for train_index, test_index in skf: indexes.extend(test_index) X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] linearRegressor = LinearRegression(normalize=True).fit(X_train, y_train) regressions = map(linearRegressor.decision_function, X_test) class_colours = map(resolveClass, y_test) all_regressions.extend(regressions) all_colours.extend(class_colours) print linearRegressor.intercept_, linearRegressor.coef_ """ # Plot classes = zip(regressions, class_colours) plt.figure() plt.title("Least Squares Fitted Performace") plt.xlabel("Trials") plt.ylabel("Score") zippedAndSorted = sorted(zip(regressions,class_colours)) unzipped = 
zip(*zippedAndSorted) plt.scatter(range(len(unzipped[0])), unzipped[0], c=unzipped[1], s=60) nov = plt.scatter([], [], color='r') inter = plt.scatter([], [], color='g') exp = plt.scatter([], [], color='b') plt.legend((nov,inter,exp),['Novice','Intermediate','Expert'], loc=2) plt.xticks( [] ) #plt.gca().set_xlim(-1,15) #plt.gca().set_ylim(-4,6) plt.tight_layout() plt.show() """ # Make test and training set # X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,y,test_size=0.5) # Fit linear regressor to weights # linearFit = LinearRegression().fit(X_train,y_train) # regressions = map( lambda x: linearFit.intercept_ + sum(np.array(x) * linearFit.coef_), X_test) # colours = map( resolveClass, y_test ) classes = zip(all_regressions, all_colours) anova_perfs = kruskalwallis( [t[0] for t in classes if t[1] == "r"], [t[0] for t in classes if t[1] == "g"], [t[0] for t in classes if t[1] == "b"], ) fig = plt.figure() ax = fig.add_subplot(111) plt.title("Linear Performance Score") plt.xlabel("Trials") plt.ylabel("Score") # Sort by value all_regressions, all_colours = zip(*sorted(zip(all_regressions, all_colours))) # Plot each experience level with different markers and colours nov_data = [(i, d) for (i, d, c) in zip(range(len(all_regressions)), all_regressions, all_colours) if c == "r"] inter_data = [ (i, d) for (i, d, c) in zip(range(len(all_regressions)), all_regressions, all_colours) if c == "g" ] exp_data = [(i, d) for (i, d, c) in zip(range(len(all_regressions)), all_regressions, all_colours) if c == "b"] nov = ax.scatter(zip(*nov_data)[0], zip(*nov_data)[1], color="r", marker="o", s=60) inter = ax.scatter(zip(*inter_data)[0], zip(*inter_data)[1], color="g", marker="^", s=60) exp = ax.scatter(zip(*exp_data)[0], zip(*exp_data)[1], color="b", marker="*", s=60) plt.legend((nov, inter, exp), ["Novice", "Intermediate", "Expert"], loc=2) plt.xticks([]) # plt.gca().set_xlim(-1,15) # plt.gca().set_ylim(-4,6) plt.tight_layout() with 
open("/Users/robertevans/repos/minf/keyhole_graphs/linFit.png", "w") as figOut: plt.savefig(figOut) plt.figure() plt.title("Kruskal-Wallis p-value: {0:.3g}".format(anova_perfs[1])) plt.ylabel("Score") plt.boxplot( [ [t[0] for t in classes if t[1] == "r"], [t[0] for t in classes if t[1] == "g"], [t[0] for t in classes if t[1] == "b"], ] ) plt.xticks(range(1, 4), ("Novices", "Intermediates", "Experts")) # plt.gca().set_ylim(0,9) plt.tight_layout() with open("/Users/robertevans/repos/minf/keyhole_graphs/linFit_box.png", "w") as figOut: plt.savefig(figOut) plt.show()
# Determine the settings from the filename problem, dup, ordering, nodes, mut, seed = base.split('_') with open_file_method(filename)(filename, 'r') as f: data = json.load(f) version = dup, ordering, nodes, mut if (dup, ordering) == ('skip', 'normal'): control_group = version statify[version].append(data[1]['evals']) active[version].append(data[1]['phenotype']) best = data[1]['bests'][-1] test = data[1]['test_inputs'] individual = Individual.reconstruct_individual(best, test) simplified = individual.new(Individual.simplify) reduced[version].append(len(simplified.active)) filecount += 1 except ValueError: print(filename, "FAILED") # Kruskal's requires a rectangular matrix rect = make_rectangular(list(statify.values()), 10000001) print('Files Successfully Loaded', filecount) print('Kruskal Wallis', kruskalwallis(rect)) for version, data in statify.items(): print('--------- %s ---------' % str(version)) print("MES, MAD", median_deviation(data)) print('Active', median_deviation(active[version])) print('Reduced', median_deviation(reduced[version])) print('Mann Whitney U against Control', end=' ') print(mannwhitneyu(statify[control_group], data))
def kruskal_wallis(reduced_dataframe, populations_prefix='',
                   populations=('AFR', 'AMR', 'EAS', 'SAS', 'NFE', 'FIN')):
    """Calculate H-statistic and p-value using Kruskal-Wallis test (non-parametric ANOVA).

    :param reduced_dataframe: table with one column per population, named
        ``populations_prefix + population``
    :param populations_prefix: string prepended to every population name to
        build its column name
    :param populations: names of the populations to compare; each one is a group
    :return: the result of ``kruskalwallis`` (H statistic, p-value)
    """
    # The default is an immutable tuple so it cannot be modified between calls.
    populations_data = [reduced_dataframe[populations_prefix + p] for p in populations]
    return kruskalwallis(populations_data)
def plot_statistics_pair(mydf, feature2_name, name1, name2, nsamples):
    """Plot two paired features against each other and print a matching statistic.

    The plot and the statistic depend on ``feature2_name``:

    * 'Gene Expression', 'Somatic Copy Number', 'Clinical Numeric',
      'MicroRNA Expression': scatter plot and Spearman correlation of the
      rows where both values are numeric.
    * 'Somatic Mutation t-test': violin plot and Welch's t-test between the
      rows with mutation value 0 and 1.
    * 'Somatic Mutation': violin plot and Pearson correlation between the
      ranked expression and the mutation value.
    * 'Clinical Categorical': violin plot and Kruskal-Wallis test across the
      categories that have more than ``nsamples`` rows.

    Any other value does nothing.  ``mydf`` is never modified.

    :param mydf: DataFrame with the columns 'data1' (gene expression) and
        'data2' (the second feature)
    :param feature2_name: kind of the second feature (see above)
    :param name1: display name of the first feature
    :param name2: display name of the second feature
    :param nsamples: a category is only tested if it has more than this many rows
    :return: None
    """
    numeric_features = ('Gene Expression', 'Somatic Copy Number',
                        'Clinical Numeric', 'MicroRNA Expression')

    if feature2_name in numeric_features:
        label1 = name1.strip() + " (gene expression)"
        label2 = name2.strip() + " (" + feature2_name + ")"

        # Non-numeric entries become NaN and their rows are dropped.
        new_df = pd.DataFrame()
        new_df[label1] = pd.to_numeric(mydf['data1'], errors='coerce')
        new_df[label2] = pd.to_numeric(mydf['data2'], errors='coerce')
        new_df.dropna(axis=0, how='any', inplace=True)

        new_df.plot.scatter(x=label1, y=label2)
        print(stats.spearmanr(new_df[label1], new_df[label2]))

    elif feature2_name == 'Somatic Mutation t-test':
        label1 = name1.strip() + " (gene expression)"
        label2 = name2.strip() + " (Somatic Mutation)"

        # Rename on a copy so the caller's frame keeps its 'data1'/'data2' columns.
        renamed = mydf.rename(columns={"data1": label1, "data2": label2})

        sns.violinplot(x=renamed[label2], y=renamed[label1], palette="Pastel1")
        print(renamed.groupby(label2).agg(['mean', 'count']))

        Set1 = renamed[renamed[label2] == 0]
        Set2 = renamed[renamed[label2] == 1]
        print('\nT-test statistics : ')
        print(stats.ttest_ind(Set1[label1], Set2[label1], equal_var=False))

    elif feature2_name == 'Somatic Mutation':
        label1 = name1.strip() + " (gene expression)"
        label2 = name2.strip() + " (Somatic Mutation)"

        newdf = mydf.rename(columns={"data1": label1, "data2": label2})

        sns.violinplot(x=newdf[label2], y=newdf[label1], palette="Pastel1")
        print(newdf.groupby(label2).agg(['mean', 'count']))

        # Pearson correlation on the ranked expression values.
        print('\nSpearman correlation : ')
        newdf['rnkdata'] = newdf[label1].rank(method='average')
        print(stats.pearsonr(newdf['rnkdata'], newdf[label2]))

    elif feature2_name == 'Clinical Categorical':
        label1 = name1.strip() + " (gene expression)"
        label2 = name2.strip() + " (clinical)"

        # Drop rows whose category is missing or fully wrapped in [...].
        bracketed = mydf.data2.str.contains(r'^\[.*\]$', na=True, regex=True)
        new_data = mydf[~bracketed].rename(columns={"data1": label1, "data2": label2})

        sns.violinplot(x=new_data[label2], y=new_data[label1], palette="Pastel1")
        print(new_data.groupby(label2).agg(['median', 'count']))

        # Expression values of every category that is large enough to test.
        CategoryData = [
            group[label1].values
            for _, group in new_data.groupby(label2)
            if len(group) > nsamples
        ]

        print('\nKruskal-Wallis test for groups with more than ' + str(nsamples) + ' patients : ')
        if len(CategoryData) > 1:
            print(mstats.kruskalwallis(*CategoryData))
        else:
            print('Number of groups less than 2 \n')

    return