コード例 #1
0
ファイル: store.py プロジェクト: gree-gorey/losc
    def test(self, arr1, arr2):
        """Return the p-value of the two-sample test chosen by ``self.statistics``.

        ``self.statistics`` may be ``"student"``, ``"welch"``, ``"mann"`` or
        ``"auto"``.  In ``"auto"`` mode the test is picked from the data:
        Welch's t-test when Levene's test rejects equal variances, otherwise
        Student's t-test when both samples pass the Shapiro-Wilk normality
        check, otherwise the Mann-Whitney U test.

        Args:
            arr1: first sample.
            arr2: second sample.

        Returns:
            The p-value of the selected test, or 0 when ``self.statistics``
            is not one of the recognised settings.
        """
        mode = self.statistics
        if mode == "auto":
            # Levene's test for equality of variances: equal if p > 0.05.
            if stats.levene(arr1, arr2)[1] > 0.05:
                # Shapiro-Wilk test for normality of both samples.
                if stats.shapiro(arr1)[1] > 0.05 and stats.shapiro(arr2)[1] > 0.05:
                    mode = "student"
                else:
                    mode = "mann"
            else:
                mode = "welch"

        if mode == "student":
            return stats.ttest_ind(arr1, arr2)[1]
        if mode == "welch":
            # equal_var must be passed by keyword: the third positional
            # parameter of ttest_ind is ``axis``, so a positional False
            # would silently run Student's test instead of Welch's.
            return stats.ttest_ind(arr1, arr2, equal_var=False)[1]
        if mode == "mann":
            # NOTE(review): ``equal`` is defined elsewhere; presumably it
            # detects samples for which the U test is meaningless - confirm.
            if equal(arr1, arr2):
                return 1
            return stats.mannwhitneyu(arr1, arr2)[1]
        return 0
コード例 #2
0
def rankTest(arg):
    """Return rank-test p-values for the three groups stored under ``data[arg]``.

    Groups are read from ``data[arg][1]``, ``data[arg][2]`` and
    ``data[arg][3]`` (``data`` is a module-level container).

    Returns:
        A list of four p-values: Kruskal-Wallis across all three groups,
        followed by Mann-Whitney U for the pairs (1, 2), (1, 3) and (2, 3).
    """
    first, second, third = data[arg][1], data[arg][2], data[arg][3]
    p_values = [stats.kruskal(first, second, third)[1]]
    for left, right in ((first, second), (first, third), (second, third)):
        p_values.append(stats.mannwhitneyu(left, right)[1])
    return p_values
コード例 #3
0
def target_analysis(mirna2age, mirna2disease, mirna2target, gene2age):
    """Compare mean target-gene ages between two sets of miRNAs and print the result.

    Loads the binary miRNA x target matrix from a hard-coded path, computes
    for each miRNA the mean age of its targets (targets flagged 1 that have
    an entry in ``gene2age``), and prints the Mann-Whitney U result for the
    two resulting lists.

    Args:
        mirna2age: unused here; kept for interface compatibility.
        mirna2disease: container of disease-associated miRNAs (membership
            is all that is used).
        mirna2target: unused here; kept for interface compatibility.
        gene2age: mapping from target gene name to an age convertible to float.
    """
    mir_targetdb = pd.read_csv('/Users/virpatel/Desktop/pub_stuff/relevant_data/mir_target_vectordb.txt', sep='\t',index_col=[0], encoding='utf-8')
    target_lst = list(mir_targetdb.columns.values)

    def mean_target_age(mir):
        # Mean age over the targets flagged 1 in this miRNA's row.
        bintarlt = mir_targetdb.loc[mir].tolist()
        tarages = [float(gene2age[target])
                   for target, flag in zip(target_lst, bintarlt)
                   if target in gene2age and flag == 1]
        return mean(tarages)

    # NOTE(review): this list is named "dis" but is filled for EVERY miRNA in
    # the matrix, not only those in mirna2disease - confirm this is intended.
    mir_avg_tar_age_dis = [mean_target_age(mir) for mir in mir_targetdb.index]
    mir_avg_tar_age_nondis = [mean_target_age(mir)
                              for mir in mir_targetdb.index
                              if mir not in mirna2disease]

    print(mannwhitneyu(mir_avg_tar_age_dis, mir_avg_tar_age_nondis))
コード例 #4
0
def violin_nocomp(lst_for_exclusion, binary_data_frame, tipo,xentry,df_name):
	yes = []
	datalst = []
	no = []

	for alpha in binary_data_frame.index:
		if alpha in lst_for_exclusion:
			datalst.append([sum(binary_data_frame.loc[alpha].tolist()),'%s miRNAs' %(tipo)])
			yes.append(sum(binary_data_frame.loc[alpha].tolist()))
		else:
			datalst.append([sum(binary_data_frame.loc[alpha].tolist()),'Non-%s miRNAs' %(tipo)])
			no.append(sum(binary_data_frame.loc[alpha].tolist()))


	print mean(yes), mean(no)
	print median(yes), median(no)
	print mannwhitneyu(yes, no)


	data_master = pd.DataFrame(datalst,columns=[xentry, 'miRNA Class'])
	sns.violinplot(x='miRNA Class',y=xentry,data=data_master, cut=0)
	if 'tis' in df_name:
		plt.gca().set_ylim([0,20])
	if 'tar' in df_name:
		plt.gca().set_ylim([0,1000])
	plt.savefig('figures/nocomp_violin_%s.pdf' %(df_name),bbox_inches='tight')
	plt.close()
コード例 #5
0
ファイル: ABTestRunner.py プロジェクト: boskaiolo/A_B_tests
def mann_whitneyu(data, alternative):
    """Mann-Whitney U test for two groups of samples of any length.

    Tests whether the two groups come from the same population without
    assuming a particular distribution (non-parametric).

    Args:
        data (List[numpy.array]): exactly two arrays of observations, A and B.
        alternative (String): the alternative hypothesis about A relative to
            B: ``">"`` (A greater), ``"<"`` (A less) or ``"<>"`` (two-sided).

    Returns:
        float: the p-value for the requested alternative.

    Raises:
        ValueError: if ``data`` does not hold exactly two groups or
            ``alternative`` is not one of ``">"``, ``"<"``, ``"<>"``.
    """
    if len(data) != 2:
        raise ValueError("2 groups are needed")

    # Let SciPy compute the requested tail directly instead of deriving it
    # from the default p-value (whose sidedness differs between SciPy
    # versions, which made the doubling / complement arithmetic unreliable).
    scipy_alternatives = {"<>": "two-sided", ">": "greater", "<": "less"}
    if alternative not in scipy_alternatives:
        raise ValueError("alternative must be one of '>', '<' or '<>'")

    a = data[0]
    b = data[1]
    _, p = stats.mannwhitneyu(a, b, alternative=scipy_alternatives[alternative])
    return p
コード例 #6
0
ファイル: pvc_raw_mappings.py プロジェクト: catfishy/jagust
def _mean_and_std(values_by_region):
    """Map each region to the (mean, std) of its collected values."""
    return {region: (np.mean(values), np.std(values))
            for region, values in values_by_region.items()}


def _collect_uptakes_and_ranks(subj_group, data, index_lookup, value_of):
    """Gather per-region uptake values and within-subject ranks for a group.

    ``value_of`` extracts the numeric uptake from one stored region entry.
    Rank 0 is the region with the highest uptake within a subject.
    """
    uptakes = {region: [] for region in index_lookup}
    ranks = {region: [] for region in index_lookup}
    for rid in subj_group:
        if rid not in data:
            continue
        entries = data[rid]
        for region, entry in entries.items():
            uptakes[region].append(float(value_of(entry)))
        ordered = sorted(entries.items(),
                         key=lambda item: value_of(item[1]),
                         reverse=True)
        for rank, (region, _entry) in enumerate(ordered):
            ranks[region].append(rank)
    return uptakes, ranks


def _rank_biserial_effect(first, second):
    """Return the Mann-Whitney p-value and rank-biserial value for two samples."""
    u_stat, pvalue = mannwhitneyu(first, second, use_continuity=True)
    u_max = len(first) * len(second)
    return {'pvalue': pvalue,
            'rank_biserial': 1.0 - (2.0 * u_stat / u_max)}


def regionalEffectSizes(subj_group, data_prior, data_post, index_lookup):
    """Compare per-region uptake and uptake rank before and after for one group.

    Args:
        subj_group: iterable of subject ids; ids missing from a data dict
            are skipped for that dict.
        data_prior: ``{rid: {region: uptake}}``.
        data_post: ``{rid: {region: (uptake, years)}}``; only the uptake
            is used.
        index_lookup: iterable of the region keys to report on.

    Returns:
        A tuple ``(uptakes_prior, ranks_prior, uptakes_post, ranks_post,
        group_effects)``.  The first four map each region to a
        ``(mean, std)`` pair; ``group_effects`` maps each region to
        ``{'uptake_effect': {...}, 'rank_effect': {...}}`` where each inner
        dict holds the Mann-Whitney ``'pvalue'`` and the ``'rank_biserial'``
        value computed as ``1 - 2U / (n1 * n2)``.
    """
    def post_value(entry):
        # Post entries are (uptake, years) pairs.
        value, _years = entry
        return value

    group_uptakes_prior, group_ranks_prior = _collect_uptakes_and_ranks(
        subj_group, data_prior, index_lookup, lambda entry: entry)
    group_uptakes_post, group_ranks_post = _collect_uptakes_and_ranks(
        subj_group, data_post, index_lookup, post_value)

    uptakes_prior = _mean_and_std(group_uptakes_prior)
    ranks_prior = _mean_and_std(group_ranks_prior)
    uptakes_post = _mean_and_std(group_uptakes_post)
    ranks_post = _mean_and_std(group_ranks_post)

    # calculate effect sizes
    group_effects = {}
    for k in index_lookup:
        group_effects[k] = {
            'uptake_effect': _rank_biserial_effect(group_uptakes_prior[k],
                                                   group_uptakes_post[k]),
            'rank_effect': _rank_biserial_effect(group_ranks_prior[k],
                                                 group_ranks_post[k]),
        }

    return (uptakes_prior, ranks_prior, uptakes_post, ranks_post, group_effects)
コード例 #7
0
def target_gene_expression_analysis(mirna2age, mirna2disease,mirna2family,gene2age):
    """Compare mean target-gene ages of disease vs. non-disease miRNAs.

    Loads the binary miRNA x target matrix from a hard-coded path, computes
    the mean age of each miRNA's targets (targets flagged 1 that have an
    entry in ``gene2age``), shows a box plot of the non-disease values and
    prints the Mann-Whitney U result for disease vs. non-disease miRNAs.

    NOTE(review): despite the function name, no expression data takes part
    in the computation.

    Args:
        mirna2age: unused here; kept for interface compatibility.
        mirna2disease: container of disease-associated miRNAs; iterated and
            used for membership tests.
        mirna2family: unused here; kept for interface compatibility.
        gene2age: mapping from target gene name to an age convertible to float.
    """
    mir_targetdb = pd.read_csv('/Users/virpatel/Desktop/pub_stuff/relevant_data/mir_target_vectordb.txt', sep='\t',index_col=[0], encoding='utf-8')
    target_lst = list(mir_targetdb.columns.values)

    def mean_target_age(mir):
        # Mean age over the targets flagged 1 in this miRNA's row.
        bintarlt = mir_targetdb.loc[mir].tolist()
        tarages = [float(gene2age[target])
                   for target, flag in zip(target_lst, bintarlt)
                   if target in gene2age and flag == 1]
        return mean(tarages)

    mir_avg_tar_age_dis = [mean_target_age(mir)
                           for mir in mirna2disease
                           if mir in mir_targetdb.index]
    mir_avg_tar_age_nondis = [mean_target_age(mir)
                              for mir in mir_targetdb.index
                              if mir not in mirna2disease]

    sns.boxplot(x=mir_avg_tar_age_nondis)
    # Call pyplot directly: the ``sns.plt`` alias is not available in
    # newer seaborn releases.
    plt.show()
    plt.close()

    print(mannwhitneyu(mir_avg_tar_age_dis, mir_avg_tar_age_nondis))
コード例 #8
0
ファイル: store.py プロジェクト: gree-gorey/losc
    def return_test_results(self, arr1, arr2):
        """Run the two-sample test chosen by ``self.statistics`` and describe it.

        ``self.statistics`` may be ``"student"``, ``"welch"``, ``"mann"`` or
        ``"auto"``.  In ``"auto"`` mode the test is picked from the data:
        Welch's t-test when Levene's test rejects equal variances, otherwise
        Student's t-test when both samples pass the Shapiro-Wilk normality
        check, otherwise the Mann-Whitney U test.

        Args:
            arr1: first sample.
            arr2: second sample.

        Returns:
            ``[test_name, t_value, p_value, df, levene]`` where ``test_name``
            is "Student", "Welch", "Mann" or "" (unrecognised setting, in
            which case ``t_value`` and ``p_value`` are 0), ``t_value`` is
            the test statistic (None when the Mann-Whitney test is skipped),
            ``df`` is ``len(arr1) + len(arr2) - 2`` and ``levene`` is the
            p-value of Levene's test.
        """
        levene = stats.levene(arr1, arr2)[1]

        mode = self.statistics
        if mode == "auto":
            # Levene's test for equality of variances: equal if p > 0.05.
            if levene > 0.05:
                # Shapiro-Wilk test for normality of both samples.
                if stats.shapiro(arr1)[1] > 0.05 and stats.shapiro(arr2)[1] > 0.05:
                    mode = "student"
                else:
                    mode = "mann"
            else:
                mode = "welch"

        test_name = ""
        t_value = 0
        p_value = 0
        if mode == "student":
            test_name = "Student"
            result = stats.ttest_ind(arr1, arr2)
            t_value, p_value = result[0], result[1]
        elif mode == "welch":
            test_name = "Welch"
            # equal_var must be passed by keyword: the third positional
            # parameter of ttest_ind is ``axis``, so a positional False
            # would silently run Student's test instead of Welch's.
            result = stats.ttest_ind(arr1, arr2, equal_var=False)
            t_value, p_value = result[0], result[1]
        elif mode == "mann":
            test_name = "Mann"
            # NOTE(review): ``equal`` is defined elsewhere; presumably it
            # detects samples for which the U test is meaningless - confirm.
            if equal(arr1, arr2):
                t_value = None
                p_value = 1
            else:
                result = stats.mannwhitneyu(arr1, arr2)
                t_value, p_value = result[0], result[1]

        df = len(arr1) + len(arr2) - 2

        return [test_name, t_value, p_value, df, levene]
コード例 #9
0
ファイル: new_stats.py プロジェクト: janesma/sixonix
def determine_significance(mesa1, mesa2):
    """ Determines if two sets of values are statistically significant.

    In the best case, we can determine a normal distribution, and equal
    variance. Once determined we can use the independent t-test function if the
    values are of equal variance.  If we have normal data, but the variance is
    unequal, the welch t-test is used.
    http://en.wikipedia.org/wiki/Student%27s_t-test#Independent_two-sample_t-test
    http://en.wikipedia.org/wiki/Student%27s_t-test#Equal_or_unequal_sample_sizes.2C_unequal_variances

    In the case where we cannot determine normality the mann-whitney u-test is
    desired to be used, but this test is only effective when there are greater
    than 20 samples.
    http://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test

    The command-line flags ``args.ttest`` / ``args.mannwhitney`` force the
    corresponding test regardless of the normality check.

    Returns:
        A tuple ``(p, flag, test_name)`` where ``p`` is the two-sided
        p-value and ``test_name`` is "t-test", "Welch's" or "Mann-Whitney".
        For the Mann-Whitney test ``flag`` is True when either sample has
        fewer than 20 values.
    """
    # FIXME: Is it possible to determine these things with fewer samples?
    Distribution = Enum('Distribution', 'Normal, Non_normal Unknown')
    normality = Distribution.Normal
    try:
        for sample in (mesa1, mesa2):
            k2, normal = stats.normaltest(sample)
            # FIXME: Unhardcode
            if (normal < NORMAL_CI):
                normality = Distribution.Non_normal
    except ValueError:
        # normaltest could not be evaluated for these samples.
        normality = Distribution.Unknown

    equal_variance = is_equal_variance(mesa1, mesa2)

    def mann_whitney():
        # Ask SciPy for the two-sided p-value explicitly rather than doubling
        # the default result, whose sidedness differs between SciPy versions.
        u, p = stats.mannwhitneyu(mesa1, mesa2, alternative='two-sided')
        flawed = len(mesa1) < 20 or len(mesa2) < 20
        return (p, flawed, "Mann-Whitney")

    if args.ttest:
        t, p = stats.ttest_ind(mesa1, mesa2, equal_var=equal_variance)
        # NOTE(review): every other branch returns a "flawed" flag in this
        # position; here the value is True when the data ARE normal, which
        # looks inverted - confirm against the callers.
        return (p, normality == Distribution.Normal,
                "t-test" if equal_variance else "Welch's")
    elif args.mannwhitney:
        return mann_whitney()

    if normality == Distribution.Normal:
        error_handler='raise'
        # A zero-variance sample makes the t statistic divide by zero;
        # tolerate that only when the variances were judged equal.
        if np.var(mesa1) == 0 and equal_variance:
            error_handler='ignore'
        with np.errstate(divide=error_handler):
            t, p = stats.ttest_ind(mesa1, mesa2, equal_var=equal_variance)
        return (p, False, "t-test" if equal_variance else "Welch's")
    else:
        return mann_whitney()
コード例 #10
0
def violin_comp_rel_ratio(gen_exlus_dic, hamming_df, tipo, xentry, df_name, new_df):
	yes = []
	no = []

	datalst = []


	flipped_exlus = map_relatives(gen_exlus_dic)

	genmirtar = [str(a) for a in list(new_df.index)]




	for alpha in hamming_df.index:
		print alpha
		for beta in hamming_df.index:
			if alpha == beta: continue
			if alpha in genmirtar and beta in genmirtar:
				if alpha in flipped_exlus:
					if beta in flipped_exlus[alpha]: 
						datalst.append([float(hamming_df[alpha][beta]), '%s miRNAs' %(tipo)])
						yes.append(	float(hamming_df[alpha][beta]) / (float(100) / float(sum(new_df.loc[alpha]))))
					else:
						datalst.append([float(hamming_df[alpha][beta]), 'Non-%s miRNAs' %(tipo)])
						no.append(	float(hamming_df[alpha][beta]) / (float(100) / float(sum(new_df.loc[alpha]))))
				else:
					datalst.append([float(hamming_df[alpha][beta]), 'Non-%s miRNAs' %(tipo)])
					no.append(float(hamming_df[alpha][beta]) / (float(100) / float(sum(new_df.loc[alpha]))))







	print mean(yes), mean(no)
	print median(yes), median(no)
	print mannwhitneyu(yes, no)

	data_master = pd.DataFrame(datalst,columns=[xentry, 'miRNA Class'])

	if 'tis' in df_name:
		sns.boxplot(x='miRNA Class',y=xentry,data=data_master)
		plt.savefig('figures/comp_rel_boxplot_%s.pdf' %(df_name),bbox_inches='tight')
		plt.close()

	if 'tar' in df_name:
		sns.violinplot(x='miRNA Class',y=xentry,data=data_master, cut=0)
		plt.savefig('figures/comp_rel_violinratio_%s.pdf' %(df_name),bbox_inches='tight')
		plt.close()
コード例 #11
0
ファイル: mannu.py プロジェクト: szha/surprise-models
def evaluate(x, y):
    """Mann-Whitney U test on the x values with the lowest vs. highest y.

    Pairs in which either value is NaN are dropped.  The remaining pairs are
    ordered by ``y`` and the ``x`` values of the bottom 30% are compared with
    those of the top 30%.

    Args:
        x: sequence of values to compare.
        y: sequence of values, parallel to ``x``, used for ordering.

    Returns:
        The result of ``mannwhitneyu(low_tail, high_tail)``.

    Raises:
        ValueError: if fewer than 4 NaN-free pairs remain, so that the 30%
            tails would be empty.
    """
    pairs = [(xv, yv) for xv, yv in zip(x, y)
             if not isnan(xv) and not isnan(yv)]
    # The tail size must come from the filtered data, not the raw input.
    ssize = int(floor(0.3 * len(pairs)))
    if ssize == 0:
        # With ssize == 0, x[-0:] would select the whole sample.
        raise ValueError("need at least 4 NaN-free (x, y) pairs")
    # Stable sort: input already ordered by y keeps its original order.
    pairs.sort(key=lambda pair: pair[1])
    ordered_x = [xv for xv, _yv in pairs]
    return mannwhitneyu(ordered_x[:ssize], ordered_x[-ssize:])
コード例 #12
0
ファイル: report.py プロジェクト: chenshuo/benchmark
def calc_utest(timings_cpu, timings_time):
    """Run two-sided Mann-Whitney U tests on paired CPU and wall-time timings.

    Args:
        timings_cpu: pair of CPU-time samples (index 0 and 1 are compared).
        timings_time: pair of real-time samples (index 0 and 1 are compared).

    Returns:
        ``(False, None, None)`` when any of the four samples has fewer than
        UTEST_MIN_REPETITIONS entries; otherwise
        ``(have_optimal_repetitions, cpu_pvalue, time_pvalue)`` where the
        first item tells whether every sample reaches
        UTEST_OPTIMAL_REPETITIONS entries.
    """
    sample_sizes = [len(timings_time[0]), len(timings_time[1]),
                    len(timings_cpu[0]), len(timings_cpu[1])]
    min_rep_cnt = min(sample_sizes)

    # Every sample needs at least UTEST_MIN_REPETITIONS repetitions.
    if min_rep_cnt < UTEST_MIN_REPETITIONS:
        return False, None, None

    def two_sided_pvalue(samples):
        outcome = mannwhitneyu(samples[0], samples[1], alternative='two-sided')
        return outcome.pvalue

    time_pvalue = two_sided_pvalue(timings_time)
    cpu_pvalue = two_sided_pvalue(timings_cpu)

    have_optimal_repetitions = min_rep_cnt >= UTEST_OPTIMAL_REPETITIONS
    return have_optimal_repetitions, cpu_pvalue, time_pvalue
コード例 #13
0
ファイル: ENCODE.py プロジェクト: grbot/tc9
def stats(d_lengths, dn):
    """Compare distances for even vs. odd length differences; append to 'stats'.

    ``d_lengths`` maps ``dist_min -> {len_diff: count}``.  Each ``dist_min``
    is repeated ``count`` times in the "even" or "odd" sample depending on
    the parity of ``len_diff``.  The comparison is done twice: with all
    entries, and with ``len_diff == 1`` skipped.  For each pass three lines
    are appended to the file 'stats' in the working directory: the
    Mann-Whitney U result, the rank-sum result and the two sample averages.

    Args:
        d_lengths: nested mapping described above.
        dn: label written at the end of every output line.
    """
    # Imported under an alias because this function is itself named
    # ``stats``; done once instead of on every pass of the loop.
    from scipy import stats as scipy_stats

    for bool_skip in [False, True]:
        even = []
        odd = []
        for dist_min, counts in d_lengths.items():
            for len_diff, count in counts.items():
                if bool_skip and len_diff == 1:
                    continue
                if len_diff % 2 == 0:
                    even += count * [dist_min]
                else:
                    odd += count * [dist_min]

        # One context-managed handle per pass; lines are written as soon as
        # each result is available and the file is closed even on error.
        with open('stats', 'a') as fd:
            u, p = scipy_stats.mannwhitneyu(even, odd)
            fd.write('mannwhitneyu u %s p %s %s %s\n' % (u, p, dn, bool_skip))

            z, p = scipy_stats.ranksums(even, odd)
            fd.write('ranksums z %s p %s %s %s\n' % (z, p, dn, bool_skip))

            # Float division so integer inputs are not truncated on Python 2.
            average_even = sum(even) / float(len(even))
            average_odd = sum(odd) / float(len(odd))
            fd.write('average even %s odd %s %s %s\n'
                     % (average_even, average_odd, dn, bool_skip))

    return
コード例 #14
0
ファイル: p300_fda.py プロジェクト: BrainTech/openbci
 def compareDistributions(self, target, nontarget):
     """
     Run the Mann-Whitney U test on (nontarget, target), in that order,
     and return the first element of its result (the U statistic).
     """
     outcome = st.mannwhitneyu(nontarget, target)
     u_statistic = outcome[0]
     return u_statistic
コード例 #15
0
def _PValues(arguments):
  """Simulates a growing two-sample comparison and returns its p-values.

  Two samples are grown one value at a time: one drawn from a standard
  normal distribution, the other from a normal distribution whose mean is
  shifted by distance_stddev. After each pair of draws, the two-sided
  Mann-Whitney U p-value of the samples collected so far is recorded.

  Arguments:
    arguments: A (distance_stddev, max_sample_size) pair, where
        distance_stddev is the distance between the means of the two
        distributions and max_sample_size is the number of values to draw
        per sample.

  Returns:
    A list of p-values, from N=1 to N=max_sample_size.
  """
  distance_stddev, max_sample_size = arguments

  baseline = []
  shifted = []
  running_p_values = []
  for _ in xrange(max_sample_size):
    baseline.append(stats.norm.rvs())
    shifted.append(stats.norm.rvs(distance_stddev))
    outcome = stats.mannwhitneyu(baseline, shifted, alternative='two-sided')
    running_p_values.append(outcome.pvalue)
  return running_p_values
コード例 #16
0
ファイル: pvc_raw_mappings.py プロジェクト: catfishy/jagust
def regionEffectSizesBetweenGroups(group_prefix, group_one, group_two, data, index_lookup):
    """Per-region Mann-Whitney effect sizes between two subject groups.

    Args:
        group_prefix: prefix used in the output column names.
        group_one: iterable of subject ids forming the first group.
        group_two: iterable of subject ids forming the second group.
        data: ``{rid: {region: uptake}}``; ids missing from ``data`` are
            skipped.
        index_lookup: iterable of the region keys to report on.

    Returns:
        A DataFrame indexed by region (index name 'Region') with the columns
        '<group_prefix>_uptake_effect_pvalue' and
        '<group_prefix>_uptake_effect_rank_biserial', the latter computed as
        ``1 - 2U / (n1 * n2)``.
    """
    def collect_uptakes(group):
        # Per-region list of uptake values for the subjects of one group.
        uptakes = {k: [] for k in index_lookup}
        for rid in group:
            if rid not in data:
                continue
            for k in data[rid]:
                uptakes[k].append(float(data[rid][k]))
        return uptakes

    group_uptakes_one = collect_uptakes(group_one)
    group_uptakes_two = collect_uptakes(group_two)

    # calculate effect sizes
    line_data = defaultdict(dict)
    for k in index_lookup:
        one_uptake = group_uptakes_one[k]
        two_uptake = group_uptakes_two[k]

        u_uptake, pvalue_uptake = mannwhitneyu(one_uptake, two_uptake, use_continuity=True)
        u_max_uptake = len(one_uptake) * len(two_uptake)
        rank_biserial_uptake = 1.0 - (2.0 * u_uptake / u_max_uptake)

        effects = {'pvalue': pvalue_uptake, 'rank_biserial': rank_biserial_uptake}
        # .items() works on both Python 2 and Python 3.
        for eff_k, eff_v in effects.items():
            line_data[k]['%s_uptake_effect_%s' % (group_prefix, eff_k)] = eff_v

    # Regions become rows after the transpose.
    df = pd.DataFrame(dict(line_data)).T
    df.index.name = 'Region'
    return df
コード例 #17
0
ファイル: myutils.py プロジェクト: cxrodgers/Working-memory
def ranksum(samp1, samp2):
    ''' Rank-sum (Mann-Whitney U) comparison of two independent samples.

    The test is non-parametric, so it can be used when the samples are not
    Gaussian.  The null hypothesis is that both samples come from the same
    distribution; a p value below the chosen cutoff is evidence that one
    sample tends to rank higher than the other.

    When both samples hold at most 30 observations the work is delegated to
    ranksum_small; otherwise mannwhitneyu is used.

    Parameters
    ----------
    samp1 : array like
        A 1D array of the sample data
    samp2 : array like
        A 1D array of the sample data

    Returns
    -------
    Whatever the selected routine returns; for mannwhitneyu this is the
    U statistic followed by the p value.
    '''
    first_is_small = len(samp1) <= 30
    second_is_small = len(samp2) <= 30
    if not (first_is_small and second_is_small):
        return mannwhitneyu(samp1, samp2)
    return ranksum_small(samp1, samp2)
コード例 #18
0
ファイル: stats.py プロジェクト: kaylanb/orion2_yt
def steady_state_test_rw_to_end(sink,corr=True):
    """Compare every moving-window pdf against the approximate steady pdf.

    For each window returned by moving_window_pdfs (dt=0.1, wind_dt=1.) the
    window pdf is compared with the pdf from approx_steady_pdf using Welch's
    t-test, the two-sample KS test and the Mann-Whitney U test.  When either
    pdf contains non-finite values, -1 is stored for both the statistic and
    the p-value of all three tests.

    Returns:
        (time_LHS, welch, ks, U) where each of the last three is a dict with
        numpy arrays under the keys 't' (statistic) and 'p' (p-value).
    """
    steady_pdf = approx_steady_pdf(sink, corr=corr)
    pdfs, time_LHS = moving_window_pdfs(sink, dt=0.1, wind_dt=1., corr=corr)
    assert(pdfs.shape[0] == len(time_LHS))

    welch = dict(t=[], p=[])
    ks = dict(t=[], p=[])
    U = dict(t=[], p=[])
    # Result store paired with the comparison that feeds it, in run order.
    comparisons = (
        (welch, lambda a, b: ttest_ind(a, b, equal_var=False)),
        (ks, ks_2samp),
        (U, mannwhitneyu),
    )

    for i in range(len(time_LHS)):
        window_pdf = pdfs[i, :]
        usable = np.all(np.isfinite(window_pdf)) and np.all(np.isfinite(steady_pdf))
        for store, compare in comparisons:
            if usable:
                outcome = compare(window_pdf, steady_pdf)
                statistic, p_value = outcome[0], outcome[1]
            else:
                # no data found between tbeg and tbeg+twid
                statistic, p_value = -1, -1
            store['t'].append(statistic)
            store['p'].append(p_value)

    for store in (welch, ks, U):
        for key in ('t', 'p'):
            store[key] = np.array(store[key])
    return time_LHS, welch, ks, U
コード例 #19
0
ファイル: minimize.py プロジェクト: dornja/goa2
    def _test( self, deltas ):
        """Classify ``deltas`` as PASS, FAIL or UNRESOLVED.

        "Passing" behavior is more like the original (slower, more energy);
        "failing" behavior is more optimized (faster, less energy).

        The fitness rows for ``deltas`` are compared column by column with
        ``self.optimized`` using the Mann-Whitney U test.  PASS is returned
        as soon as one column differs with p < options.alpha and has a mean
        below ``self.mean``; UNRESOLVED when there are no fitness rows or
        any fitness value is 0; FAIL otherwise.
        """
        fitness = np.array( self.get_fitness( deltas ) )
        if len( fitness ) == 0 or np.any( fitness == 0 ):
            return self.UNRESOLVED

        m = np.mean( fitness, axis = 0 )
        s = np.std( fitness, axis = 0 )
        sqrtn = np.sqrt( fitness.shape[ 0 ] )
        columns = range( fitness.shape[ 1 ] )

        # Log mean +/- 1.96 standard errors for every column first.
        for i in columns:
            infomsg( "   ", m[ i ], "+/-", 1.96 * s[ i ] / sqrtn )

        for i in columns:
            optimized_col = self.optimized[ ::, i ]
            fitness_col = fitness[ ::, i ]
            # Optimized and fitness are all the same value, likely because
            # we are comparing the optimized variant to itself. This counts
            # as a fail, since they are clearly drawn from the same distro.
            same_constant = np.ptp( optimized_col ) == 0 and \
                    np.ptp( fitness_col ) == 0 and \
                    self.optimized[ 0, i ] == fitness[ 0, i ]
            if same_constant:
                continue
            pval = mannwhitneyu( optimized_col, fitness_col )[ 1 ]
            if pval < options.alpha and m[ i ] < self.mean[ i ]:
                return self.PASS
        return self.FAIL
コード例 #20
0
 def fit(self, df_X, df_y):
     """Score every column of ``df_X`` per cluster with a one-sided U test.

     For each distinct label in the single column of ``df_y``, the rows of
     ``df_X`` carrying that label are compared with all other rows, column
     by column, using ``mannwhitneyu(..., alternative="greater")``.  A
     column whose test fails is reported on stderr and given p = 1.  The
     p-values are corrected with the Benjamini-Hochberg procedure, and
     ``-log10`` of the corrected values is stored in ``self.act_`` (rows:
     columns of ``df_X``; columns: clusters).

     Raises:
         ValueError: if ``df_X`` and ``df_y`` differ in row count or
             ``df_y`` does not have exactly one column.
     """
     if not df_y.shape[0] == df_X.shape[0]:
         raise ValueError("number of regions is not equal")
     if df_y.shape[1] != 1:
         raise ValueError("y needs to have 1 label column")

     # calculate Mann-Whitney U p-values
     pvals = []
     clusters  =  df_y[df_y.columns[0]].unique()
     for cluster in clusters:
         pos = df_X[df_y.iloc[:,0] == cluster]
         neg = df_X[df_y.iloc[:,0] != cluster]
         p = []
         for m in pos:
             try:
                 p.append(mannwhitneyu(pos[m], neg[m], alternative="greater")[1])
             except Exception as e:
                 # Best effort: a failing column must not abort the fit.
                 sys.stderr.write(str(e) + "\n")
                 sys.stderr.write("motif {} failed, setting to p = 1\n".format(m))
                 p.append(1)
         pvals.append(p)

     # correct for multiple testing
     pvals = np.array(pvals)
     fdr = multipletests(pvals.flatten(),
             method="fdr_bh")[1].reshape(pvals.shape)

     # create output DataFrame from the corrected values (previously the
     # correction was computed but the raw p-values were stored)
     self.act_ = pd.DataFrame(-np.log10(fdr.T),
             columns=clusters, index=df_X.columns)
    def runCompare(self, objId, labelToAdd, expression1, expression2):
        """Compare one label between two subgroups and write 'report.txt'.

        The experiment is read from ``self.inputExperiment``; the values of
        ``self.labelToCompare`` are collected for the subgroups selected by
        ``self.expression1`` and ``self.expression2``.  Student's t-test,
        Welch's t-test and the Mann-Whitney U test are then reported through
        ``self.doublePrint``.  A test that cannot be computed is reported as
        such instead of being silently dropped.

        The positional arguments are not read here; the values are taken
        from the corresponding attributes on ``self``.
        """
        # The context manager closes the report even if a step below fails.
        with open(self._getPath("report.txt"),'w') as fh:
            self.experiment = self.readExperiment(self.inputExperiment.get().fnPKPD)
            x1 = [float(x) for x in self.experiment.getSubGroupLabels(self.expression1.get(),self.labelToCompare.get())]
            x2 = [float(x) for x in self.experiment.getSubGroupLabels(self.expression2.get(),self.labelToCompare.get())]
            self.doublePrint(fh,"Values in SubGroup 1: %s"%str(x1))
            self.doublePrint(fh,"Values in SubGroup 2: %s"%str(x2))
            self.doublePrint(fh,"Testing H0: mu1=mu2")
            self.doublePrint(fh," ")

            def report(description, statisticName, compute):
                # Run one test and print its result line; failures are
                # reported rather than swallowed.
                try:
                    statistic, pval = compute()
                    line = "%s: %s=%f p-value=%f"%(description, statisticName, statistic, pval)
                except Exception as e:
                    line = "%s: could not be computed (%s)"%(description, e)
                self.doublePrint(fh, line)

            # equal_var must be passed by keyword: the third positional
            # parameter of ttest_ind is ``axis``, not ``equal_var``.
            report("T-test two independent samples (same variance)",
                   "t-statistic",
                   lambda: stats.ttest_ind(np.asarray(x1,np.double),np.asarray(x2,np.double),equal_var=True))
            report("T-test two independent samples (different variance, Welch's test)",
                   "t-statistic",
                   lambda: stats.ttest_ind(x1,x2,equal_var=False))
            report("Mann-Whitney U test for two independent samples",
                   "u-statistic",
                   lambda: stats.mannwhitneyu(x1, x2, use_continuity=True))
コード例 #22
0
ファイル: stats.py プロジェクト: kaylanb/orion2_yt
def steady_state_test_rw_to_rw(sink,dt=0.1,wind_dt=1.,corr=True):
    """Compare each moving-window pdf with the next one.

    Consecutive window pdfs from moving_window_pdfs are compared using
    Welch's t-test, the two-sample KS test and the Mann-Whitney U test.
    When either pdf of a pair contains non-finite values, -1 is stored for
    both the statistic and the p-value of all three tests.

    Returns:
        (time_LHS[:-1], welch, ks, U) where each of the last three is a dict
        with lists under the keys 't' (statistic) and 'p' (p-value).
    """
    pdfs, time_LHS = moving_window_pdfs(sink, dt=dt, wind_dt=wind_dt, corr=corr)
    assert(pdfs.shape[0] == len(time_LHS))

    welch = dict(t=[], p=[])
    ks = dict(t=[], p=[])
    U = dict(t=[], p=[])
    # Result store paired with the comparison that feeds it, in run order.
    comparisons = (
        (welch, lambda a, b: ttest_ind(a, b, equal_var=False)),
        (ks, ks_2samp),
        (U, mannwhitneyu),
    )

    for i in range(len(time_LHS)-1):
        current_pdf = pdfs[i, :]
        next_pdf = pdfs[i+1, :]
        usable = np.all(np.isfinite(current_pdf)) and np.all(np.isfinite(next_pdf))
        for store, compare in comparisons:
            if usable:
                outcome = compare(current_pdf, next_pdf)
                statistic, p_value = outcome[0], outcome[1]
            else:
                # no data found between tbeg and tbeg+twid
                statistic, p_value = -1, -1
            store['t'].append(statistic)
            store['p'].append(p_value)
    return time_LHS[:-1], welch, ks, U
コード例 #23
0
ファイル: plots.py プロジェクト: mpschr/cnvkit
def test_loh(bins, alpha=0.0025):
    """Test each chromosome's SNP shifts against the combined others'.

    The statistical test is a one-sided Mann-Whitney U test (non-parametric):
    a chromosome is reported when its shifts are significantly greater than
    those of the other bins.  Chromosomes with fewer than 20 points, or whose
    mean shift does not exceed the others' mean, are not tested.

    Args:
        bins: mapping ``chrom -> {'thisbin': [...], 'otherbins': [...]}``.
        alpha: significance threshold for the p-value.

    Returns:
        List of chromosomes whose p-value is below ``alpha``; an empty list
        if SciPy is not installed.
    """
    # TODO - this doesn't work right if there are many shifted regions
    try:
        from scipy import stats
    except ImportError:
        # SciPy not installed; can't test for significance
        return []

    significant_chroms = []
    for chrom, partitions in iteritems(bins):
        # np.float64 instead of np.float_, which NumPy 2.0 removed.
        these_shifts = np.array(partitions['thisbin'], np.float64)
        other_shifts = np.array(partitions['otherbins'], np.float64)
        if len(these_shifts) < 20:
            logging.info("Too few points (%d) to test chrom %s",
                         len(these_shifts), chrom)
        elif these_shifts.mean() > other_shifts.mean():
            logging.debug("\nThese ~= %f (N=%d), Other ~= %f (N=%d)",
                          these_shifts.mean(), len(these_shifts),
                          other_shifts.mean(), len(other_shifts))
            # Request the one-sided alternative explicitly; SciPy's default
            # sidedness differs between versions.
            u, prob = stats.mannwhitneyu(these_shifts, other_shifts,
                                         alternative='greater')
            logging.info("Mann-Whitney - %s: u=%s, p=%s", chrom, u, prob)
            if prob < alpha:
                significant_chroms.append(chrom)

    return significant_chroms
コード例 #24
0
ファイル: report.py プロジェクト: padenis/attelo
    def significance(self, fun, other, test="wilcoxon"):
        """Compute the statistical significance of the difference between
        two sets of scores.

        Args:
            fun: scoring function handed to ``map_doc_scores`` on both
                reports.
            other: the report to compare against.
            test: "wilcoxon" (paired), "ttest" (paired t-test),
                "mannwhitney" (independent samples) or "all".

        Returns:
            Dict mapping "wilcoxon", "paired ttest" and/or "mannwhitney" to
            the p-value of the corresponding test.

        Raises:
            ValueError: if a paired test is requested and the two score
                lists differ in length.
        """
        scores1 = self.map_doc_scores(fun)
        scores2 = other.map_doc_scores(fun)
        if not isinstance(scores1[0], (float, int)):
            # TODO: this is suspicious
            # Scores arrive as pairs; keep only the first component.
            scores1 = [x for x, _ in scores1]
            scores2 = [x for x, _ in scores2]

        run_wilcoxon = test in ("wilcoxon", "all")
        run_ttest = test in ("ttest", "all")
        run_mannwhitney = test in ("mannwhitney", "all")

        # Only the paired tests need matching lengths; the original check
        # compared scores1 with itself and could never fail.
        if (run_wilcoxon or run_ttest) and len(scores1) != len(scores2):
            raise ValueError("paired tests need score lists of equal length")

        results = {}
        if run_wilcoxon:
            results["wilcoxon"] = wilcoxon(scores1, scores2)[1]
        if run_ttest:
            results["paired ttest"] = ttest_rel(scores1, scores2)[1]
        if run_mannwhitney:
            results["mannwhitney"] = mannwhitneyu(scores1, scores2)[1]
        return results
コード例 #25
0
def mann_whitney_plus_means(turnstile_weather):
    '''
    Compare hourly entries on rainy and non-rainy records.

    The 'ENTRIESn_hourly' column of the turnstile_weather dataframe is split
    by the 'rain' column (1 = rain, 0 = no rain), and the two groups are
    compared with scipy's Mann-Whitney U-test.

    Returns, in order:
        1) the mean of entries with rain
        2) the mean of entries without rain
        3) the Mann-Whitney U-statistic
        4) the Mann-Whitney p-value
    '''
    hourly_entries = 'ENTRIESn_hourly'
    rainy_rows = turnstile_weather['rain'] == 1
    dry_rows = turnstile_weather['rain'] == 0

    with_rain_data = turnstile_weather.loc[rainy_rows, hourly_entries]
    without_rain_data = turnstile_weather.loc[dry_rows, hourly_entries]

    with_rain_mean = with_rain_data.mean()
    without_rain_mean = without_rain_data.mean()
    outcome = stats.mannwhitneyu(with_rain_data, without_rain_data)
    U, p = outcome[0], outcome[1]

    return with_rain_mean, without_rain_mean, U, p # leave this line for the grader
コード例 #26
0
ファイル: genre_on_time.py プロジェクト: cligs/toolbox
def do_significance_test(tpx_feature, test="Wilcoxon Ranksum"):
	"""
	Do significance testing to see if the two distributions differ significantly.
	If p <= 0.05, we are highly confident that the distributions differ significantly.

	The metadata table (md_csv) and "tpx-corpus-counts.csv" are read from wdir
	and joined on their index; the values of tpx_feature are then split into
	novels whose "subgenre_hist" is "historical" and those where it is
	"not_historical".

	Arguments:
	tpx_feature (string): Name of the temporal expression feature to test
	test (string): which test to do: "Mann Whitney" (or "Mann Whitney U")
		runs stats.mannwhitneyu; any other value runs stats.ranksums

	Returns:
	The result of the selected test as returned by the stats function.
	"""

	# DataFrame.from_csv was removed in pandas 1.0. read_csv with index_col=0
	# and parse_dates=True reproduces the defaults from_csv used.
	md_table = pd.read_csv(os.path.join(wdir, md_csv), header=0, index_col=0, parse_dates=True)
	ht_table = pd.read_csv(os.path.join(wdir, "tpx-corpus-counts.csv"), header=0, index_col=0, parse_dates=True)
	working_table = ht_table.join(md_table)

	# get data points
	data = copy.copy(working_table[tpx_feature])

	# get ids of historical novels
	idnos_hist = md_table[md_table["subgenre_hist"] == "historical"].index.tolist()
	# get ids of non-historical novels
	idnos_not_hist = md_table[md_table["subgenre_hist"] == "not_historical"].index.tolist()

	# split data into subgroups
	data_hist = data[idnos_hist]
	data_not_hist = data[idnos_not_hist]

	# Accept the spelling advertised in the docstring as well as the original one.
	if test in ("Mann Whitney", "Mann Whitney U"):
		test_stat = stats.mannwhitneyu(data_hist, data_not_hist)
	else:
		# do Wilcoxon Ranksum by default
		test_stat = stats.ranksums(data_hist, data_not_hist)
	return test_stat
コード例 #27
0
 def mannwhitneyu(var1, var2):
     """Run STATS.mannwhitneyu on two named samples and print a one-line report.

     var1 and var2 are keys into allvals_dict. On success the raw result and
     an 'NE' / ' E' flag for each of the two p-value thresholds are written
     to stderr. Any exception is caught and reported on stdout instead.
     """
     try:
         res = STATS.mannwhitneyu(allvals_dict[var1], allvals_dict[var2])
         p = res[1]
         # Thresholds are the ones the report labels a=10% and a=5%.
         flag_10 = 'NE' if p < .01116 else ' E'
         flag_5 = 'NE' if p < .00568 else ' E'
         report = ('%4s vs %s  u,p=%r => \t%s @a=10%%, %s @a=5%%'
                   % (var1, var2, res, flag_10, flag_5))
         print(report, file=sys.stderr)
     except Exception as e:
         print('%4s vs %s  failed: %r' % (var1, var2, e))
コード例 #28
0
def analyze_pairwise_mi_dict(mi_dict):
    """Given an mi_dict as returned by compute_motif_pairwise_mis,
    pretty print the results"""
    motif_width = max(j for (i,j) in mi_dict) + 1
    tests = len(mi_dict)
    positives = 0
    adjacents = 0
    positive_adjacents = 0
    obs_p_dict = {}
    adjacent_mis = []
    non_adjacent_mis = []
    for i,j in sorted(mi_dict):
        mi_obs,p_val = mi_dict[(i,j)]
        positive = p_val < 0.05
        positives += positive
        adjacent = (j == i + 1)
        adjacents += adjacent
        if adjacent:
            adjacent_mis.append(mi_obs)
        else:
            non_adjacent_mis.append(mi_obs)
        positive_adjacents += positive * adjacent
        mi_test_string = "POSITIVE" if positive else "negative"
        obs_p_dict[(i,j)] = (mi_obs,p_val)
        #print i,j,mi_obs,p_val,mi_test_string,("adjacent" if adjacent else "")
    print "Motif had width:",motif_width
    print "tests:",tests
    print "positives:",positives
    print "positive_rate:",positives/float(tests)
    print "adjacents:",adjacents
    print "positive_adjacents",positive_adjacents
    print "positive_adjacents/positives:",positive_adjacents/float(positives) if positives else 0
    print "adjacencts/tests:",adjacents/float(tests)
    print "Adjacent mis higher:",mean(adjacent_mis),mean(non_adjacent_mis),mannwhitneyu(adjacent_mis,non_adjacent_mis)
    return obs_p_dict
コード例 #29
0
ファイル: ROCalyzer.py プロジェクト: ostrokach/biskit
    def utest( self, score ):
        """
        Gives the Mann-Whitney U test probability that the score is
        random.  See:

        Mason & Graham (2002) Areas beneath the relative operating
        characteristics (ROC) and relative operating levels (ROL)
        curves: Statistical significance and interpretation

        The scores are split into those of positive items (as flagged by
        self.positives) and those of all remaining items; the two samples
        are then compared.

        Note (1): P-values below ~1e-16 are reported as 0.0.
        See zprob() in Biskit.Statistics.stats!

        Note (2): the P-value does not distinguish between positive
        and negative deviations from random -- a ROC area of 0.1 will
        get the same P-value as a ROC area of 0.9.

        @param score: the score predicted for each item
        @type  score: [ float ]

        @return: 1-tailed P-value
        @rtype: float
        """
        is_positive = self.positives

        # select, reverse the order and convert to plain lists in one go
        pos_scores = N.compress( is_positive, score )[::-1].tolist()
        neg_scores = N.compress( N.logical_not( is_positive ), score )[::-1].tolist()

        result = stats.mannwhitneyu( pos_scores, neg_scores )
        return result[1]
コード例 #30
0
ファイル: stat.py プロジェクト: jtmnf/TP2_OX_PMX
def mann_whitney(data1, data2):
    """
    Mann-Whitney U test: non-parametric comparison of two independent samples.

    Returns the result of st.mannwhitneyu(data1, data2) unchanged.
    """
    outcome = st.mannwhitneyu(data1, data2)
    return outcome
コード例 #31
0
ファイル: distribution1kbins.py プロジェクト: luzgomez/DUNE
# p-values of the two equal-variance tests (Levene and Fligner), one per column
pvarianzalev = []
pvarianzaflig = []

## statistical tests
# One pass per column label, starting at 1 (label 0 is skipped). Each test's
# p-value is appended to its list; the test statistic itself is discarded.
# NOTE(review): df[i] selects by label, so this presumably relies on integer
# column labels -- confirm against where df is built.
for i in range(1, len(df.iloc[0])):

    stat, p = normaltest(df[i])  # normality of the column
    pvnormal.append(p)

    stat, p = normaltest(df0[i])  # normality of the column, class 0
    pvnormalclass0.append(p)

    stat, p = normaltest(df1[i])  # normality of the column, class 1
    pvnormalclass1.append(p)

    stat, p = mannwhitneyu(df0[i],
                           df1[i])  # test whether both classes share the same distribution
    pvdistribution.append(p)

    stat, p = ttest_ind(df0[i], df1[i])  # different means
    pvmean.append(p)

    stat, p = levene(df0[i], df1[i])  # equal-variance test (Levene)
    pvarianzalev.append(p)

    stat, p = fligner(df0[i], df1[i])  # equal-variance test (Fligner)
    pvarianzaflig.append(p)

# Notes
# The Shapiro test is not used because there are indications that it
# performs poorly with large amounts of data
コード例 #32
0
def makeBoxPlots(comparison, Map):
    root = '/space/jazz/1/users/gwarner/histograms/' + Map + '/'
    if comparison == 'gesiemens1000':
        files = [
            root + x for x in [
                'histdist_results_GE_manufacturers_1000_%s_data_points.txt' %
                Map,
                'histdist_results_SIEMENS_manufacturers_1000_%s_data_points.txt'
                % Map
            ]
        ]
    elif comparison == 'between':
        files = [
            root + x for x in [
                'GE_data_points.txt',
                'histdist_results_SIEMENS_%s_data_points.txt' % Map
            ]
        ]  #'siemens_data_points.txt']]
    elif comparison == '7001000':
        files = [
            root + x for x in [
                'histdist_results_SIEMENS_manufacturers_700_%s_data_points.txt'
                % Map,
                'histdist_results_SIEMENS_manufacturers_1000_%s_data_points.txt'
                % Map
            ]
        ]
    elif comparison == 'between1.5T':
        files = [
            root + x for x in [
                'histdist_results_GE_manufacturers_1.5_%s_data_points.txt' %
                Map,
                'histdist_results_SIEMENS_manufacturers_1.5_%s_data_points.txt'
                % Map
            ]
        ]
    elif comparison == 'Siemens3Tvs1.5T':
        files = [
            root + x for x in [
                'histdist_results_SIEMENS_manufacturers_1.5_%s_data_points.txt'
                % Map,
                'histdist_results_SIEMENS_manufacturers_3.0_%s_data_points.txt'
                % Map
            ]
        ]
    elif comparison == '1.5vs3.0SiemensDirsBval':
        files = [
            root + x for x in [
                'histdist_results_SIEMENS_1000_bval_30_dirs_1.5T_%s_data_points.txt'
                % Map,
                'histdist_results_SIEMENS_1000_bval_30_dirs_3.0T_%s_data_points.txt'
                % Map
            ]
        ]

#files = [root+x for x in ['histdist_results_1.5_30_Directions_1000_bval_%s_data_points.txt'%Map,'histdist_results_3.0_30_Directions_1000_bval_%s_data_points.txt'%Map]]
    elif comparison == 'GESiemensBvalFieldDirControlled':
        files = [
            root + x for x in [
                'histdist_results_GE_1000_bval_30_dirs_1.5T_%s_data_points.txt'
                % Map,
                'histdist_results_SIEMENS_1000_bval_30_dirs_1.5T_%s_data_points.txt'
                % Map
            ]
        ]
    elif comparison == 'GE6vs25DirBvalFieldControlled':
        files = [
            root + x for x in [
                'histdist_results_GE_6_Dirs_1.5T_1000_bval_%s_data_points.txt'
                % Map,
                'histdist_results_GE_25_Dirs_1.5T_1000_bval_%s_data_points.txt'
                % Map
            ]
        ]
    else:
        sys.exit('Bad comparison')
    metrics = [
        "canberra", "cityblock", "euclidean", "canberra", "chebyshev",
        "hellinger"
    ]
    data = {}
    for x in metrics:
        data[x] = {}
    for spreadsheet in files:
        f = open(spreadsheet, 'r')
        lines = f.readlines()
        oneline = ' '.join(lines).replace('\n', '')
        dataDict = ast.literal_eval(oneline)
        for x in metrics:
            vals = dataDict[x][1]  #zeroth index is x values, we want y values
            vals.sort()
            numpyarray = np.array(vals)
            data[x][spreadsheet] = numpyarray
    for metric in ['hellinger']:  #data:
        #fig = plt.figure()
        #ax = fig.add_subplot(111)
        #ax.set_ylabel('Distance', fontsize=14)
        info, labels = [], []
        Keys = data[metric].keys()
        Keys.sort()
        for f in Keys:
            info.append(data[metric][f])
            if comparison == '7001000':
                man = f.split('_')[4]
                labels.append(man + ' (n=' + str(len(data[metric][f])) + ')')
            elif comparison == 'gesiemens1000':
                man = f.split('_')[2]
                labels.append(man + ' (n=' + str(len(data[metric][f])) + ')')
            elif comparison == 'between':
                if 'SIEMENS' in f:
                    labels.append('Siemens (n=' + str(len(data[metric][f])) +
                                  ')')
                else:
                    labels.append('GE (n=' + str(len(data[metric][f])) + ')')
            elif comparison == 'between1.5T':
                man = f.split('_')[2]
                labels.append(man + ' (n=' + str(len(data[metric][f])) + ')')
            elif comparison == 'Siemens3Tvs1.5T':
                man = f.split('_')[4] + 'T'
                labels.append(man + ' (n=' + str(len(data[metric][f])) + ')')
            elif comparison == '1.5vs3.0SiemensDirsBval':
                man = f.split('_')[7]
                labels.append(man + ' (n=' + str(len(data[metric][f])) + ')')
            elif comparison == 'GESiemensBvalFieldDirControlled':
                man = f.split('_')[2]
                labels.append(man + ' (n=' + str(len(data[metric][f])) + ')')
            elif comparison == 'GE6vs25DirBvalFieldControlled':
                man = f.split('_')[3] + ' Directions'
                labels.append(man + ' (n=' + str(len(data[metric][f])) + ')')
            else:
                sys.exit('Bad comparison ', comparison)
        print '\n\n' + Map
        print comparison + ' ' + metric
        print info[0]
        print len(info[0]), len(info[1])
        #print 'Moods Median: '+str(median_test(info[0], info[1]))
        #print 'Medians (left, right)'
        median1, median2 = np.median(info[0]), np.median(info[1])
        print str(labels[0]) + ': ' + str(median1)
        print str(labels[1]) + ': ' + str(median2)
        #print 'Moods p-value'
        #pprint ([(i, median_test(info[0],info[1][:i])[1])
        #for i in range(200,len(info[1]),200)])
        #print median_test(info[0],info[1])
        print 'Mann-Whitney U: ' + str(mannwhitneyu(info[0], info[1]))
        pprint([(i, mannwhitneyu(info[0], info[1][:i])[1])
                for i in range(200, len(info[1]), 200)])

        m = plt.boxplot(x=info, labels=labels)
        whiskers = [item.get_ydata() for item in m['whiskers']]
        print 'Whiskers for ' + str(labels[0])
        med0 = np.median(info[0])
        med1 = np.median(info[1])
        print whiskers[:2]
        print 'Whiskers for ' + str(labels[1])
        print whiskers[-2:]
        print 'Max minus median for ' + str(
            labels[0]) + ': ' + str(float(whiskers[1][1]) - float(median1))
        print 'Max minus median for ' + str(
            labels[1]) + ': ' + str(float(whiskers[3][1]) - float(median2))
        sys.exit()
        plt.xticks(fontsize=14)
        plt.yticks(fontsize=14)
        if comparison == 'gesiemens1000':
            plt.title('GE 1000 B-value vs Siemens 1000 B-value ' +
                      metric.title() + ' ' + Map)
            plt.savefig('/space/jazz/1/users/gwarner/boxplots/' + Map +
                        '/Siemens_vs_GE_1000_bval_' + metric + '_' + Map +
                        '_boxplot.png')
        elif comparison == '7001000':
            plt.title('Siemens 1000 B-value vs Siemens 700 B-value ' +
                      metric.title() + ' ' + Map)
            plt.savefig('/space/jazz/1/users/gwarner/boxplots/' + Map +
                        '/Siemens_700_vs_Siemens_1000_bval_' + metric + '_' +
                        Map + '_boxplot.png')
        elif comparison == 'between1.5T':
            plt.title('GE 1.5T vs Siemens 1.5T ' + metric.title() + ' ' + Map)
            plt.savefig('/space/jazz/1/users/gwarner/boxplots/' + Map +
                        '/GE_1.5T_vs_Siemens_1.5T_' + metric + '_' + Map +
                        '_boxplot.png')
        elif comparison == 'Siemens3Tvs1.5T':
            plt.title('Siemens 1.5T vs Siemens 3.0T ' + metric.title() + ' ' +
                      Map)
            plt.savefig('/space/jazz/1/users/gwarner/boxplots/' + Map +
                        '/Siemens_1.5T_vs_Siemens_3.0T_' + metric + '_' + Map +
                        '_boxplot.png')
            #		elif comparison == '1.5vs3.0SiemensDirsBval':
            #			plt.title('Siemens 1.5T 30 Gradient Directions 1000 B-Value vs\nSiemens 3.0T 30 Gradient Directions 1000 B-Value '+metric.title()+' '+Map)
            plt.savefig(
                '/space/jazz/1/users/gwarner/boxplots/' + Map +
                '/Siemens_1.5T_30_Directions_1000_Bval_vs_Siemens_3.0T_30_Directions_1000_Bval_'
                + metric + '_' + Map + '_boxplot.png')
        elif comparison == 'GESiemensBvalFieldDirControlled':
            plt.title(
                'GE vs Siemens 1000 B-Value 30 Gradient Directions 1.5T ' +
                metric.title() + ' ' + Map)
            plt.savefig(
                '/space/jazz/1/users/gwarner/boxplots/' + Map +
                '/GE_vs_Siemens_1000_B-Value_30_Gradient_Directions_1.5T_' +
                metric + '_' + Map + 'boxplot.png')
        elif comparison == 'GE6vs25DirBvalFieldControlled':
            plt.title(
                'GE 6 Gradient Directions 1000 b-Value 1.5T vs\nGE 25 Gradient Directions 1000 b-Value 1.5T '
                + metric.title() + ' ' + Map)
            plt.savefig(
                '/space/jazz/1/users/gwarner/boxplots/' + Map +
                '/GE_6_Gradient_Directions_1000_Bval_1.5T_vs_GE_25_Gradient_Directions_1000_Bval_1.5T_'
                + metric + '_' + Map + '_boxplot.png')
        else:
            plt.title('All GE vs All Siemens ' + metric.title() + ' ' + Map)
コード例 #33
0
# Box-Cox transform the iris petal lengths (the second return value is
# discarded) and run a Shapiro test on the transformed values. The Shapiro
# result is not stored, so it is only visible in an interactive session.
xt, _ = stats.boxcox(iris["Petal.Length"])
stats.shapiro(xt)

# Visualize the difference: probability plot of x (top) versus the
# Box-Cox-transformed values (bottom), both against the normal distribution.
# NOTE(review): x is defined outside this excerpt -- confirm what it holds.
fig = plt.figure()
ax1 = fig.add_subplot(211)
prob = stats.probplot(x, dist=stats.norm, plot=ax1)
ax1.set_xlabel('')
ax1.set_title('Prob plot against normal distribution')  # Log transformed

ax2 = fig.add_subplot(212)
prob = stats.probplot(xt, dist=stats.norm, plot=ax2)
ax2.set_title('Prob plot after BoxCox transformation')

# Q40 - Test whether median temp of beaver1 and median temp of beaver2 are equal or not.
# Uses the Mann-Whitney U test and prints the statistic and the p-value.
u1, p_value1 = stats.mannwhitneyu(beaver1.temp, beaver2.temp)
print("Test statistic: ", u1, "\np-value: ", p_value1)

# Q41 -  Single loop on iris data

datalist = []  # note that this is a list and not a dataframe

# subset the data by getting the columns you need from iris, then split it
# into one sub-frame per species
d9 = iris[["Species", "Petal.Length"]]
ds = [rows for _, rows in d9.groupby('Species')]

# Pretty-print the list of per-species frames for inspection; ds itself can
# be used directly.

pprint(ds)

# Number of per-species frames (result is only shown interactively)
len(ds)
コード例 #34
0
# Plot the 'stim' values per condition: a swarm plot of the individual
# points, followed by a box plot call on the same data (outliers hidden).
total = list(freeze_figure['stim'])
condition = list(freeze_figure['condition'])
ax = sns.swarmplot(x="condition", y="stim", data=freeze_figure, color='black')
ax.set_title('Percent freezing during stimulus (all animals included)')
# NOTE: ax is rebound here to whatever sns.boxplot returns.
ax = sns.boxplot(x=condition, y=total, palette="Set2", showfliers=False)
#ax.legend(bbox_to_anchor=(1,1))

#%%

# Transpose each per-condition table so that 'stim' can be selected by
# label below.
# NOTE(review): presumably 'stim' is a row label before the transpose --
# confirm against where these tables are built.
freeze_t_shock = freeze_t_shock.T
freeze_t_loom = freeze_t_loom.T
freeze_tl_filtered = freeze_tl_filtered.T
freeze_tone = freeze_tone.T
#%%

# Pairwise Mann-Whitney U tests on the 'stim' values of the three groups
T_SvsT = ss.mannwhitneyu(freeze_t_shock['stim'], freeze_tone['stim'])
T_SvsT_L = ss.mannwhitneyu(freeze_t_shock['stim'], freeze_t_loom['stim'])

TvsT_L = ss.mannwhitneyu(freeze_tone['stim'], freeze_t_loom['stim'])

#%%
# Report the raw test results
print('T_SvsT =', T_SvsT)
print('T_SvsT_L =', T_SvsT_L)
print('TvsT_L =', TvsT_L)

#%%

# Stack the filtered tone+loom table with the shock and tone tables
# row-wise (axis=0) for the next figure.
freeze_figure_fil = pd.concat(
    [freeze_tl_filtered, freeze_t_shock, freeze_tone], axis=0)

freeze_figure_fil[
コード例 #35
0
ファイル: MethylC.py プロジェクト: RitataLU/MethylC-analyzer
def Find_DMR2(context, cutoff, test_method):
    """Test fixed-width windows for differential methylation and write the hits.

    For every chromosome in the module-level ``union`` table, the rows of the
    requested ``context`` are grouped into windows of width ``region`` along
    ``pos``. A window is tested only if every column from the fourth onward
    has at least ``qualifiedSite`` non-missing values. Three p-values are
    computed per window:

    * ``pTTest`` -- ``stats.ttest_ind`` on the per-column window means of
      ``expgroup`` versus ``ctrlgroup`` (set to 1.0 when all means sum to 0),
    * ``pKS``    -- ``stats.kstest`` on the pooled non-NaN site values,
    * ``pMWU``   -- ``stats.mannwhitneyu`` on the same pooled values.

    Windows whose selected p-value is <= the module-level ``pvalue`` and
    whose absolute ``DeltaMean`` is >= ``cutoff`` are written, tab-separated,
    to ``DMR_<context>_all_<cutoff>.txt``; the subsets with positive and
    negative ``DeltaMean`` go to the ``_hyper_`` and ``_hypo_`` files.

    Parameters
    ----------
    context :
        Value matched against ``union['context']``; also used in file names.
    cutoff :
        Minimum absolute difference of group means (``DeltaMean``).
    test_method : int
        Which p-value to filter on: 0 -> pTTest, 1 -> pKS, 2 -> pMWU.
    """
    #    union=pd.read_csv('Unionsite.txt',sep='\t',na_values='-')
    #    expgroup = samples[samples[2] == 'WT'][0].to_list()
    #    ctrlgroup = samples[samples[2] == 'met1'][0].to_list()
    data_holder = []
    for chromosome in union['chr'].unique():
        subset = union[(union['context'] == context)
                       & (union['chr'] == chromosome)]
        maxPos = subset['pos'].max()
        if pd.isnull(maxPos):
            # No site of this context on this chromosome: max() is NaN and
            # range() below would raise.
            continue
        # Extend the edges one step past maxPos. Stopping at maxPos itself
        # drops every position after the last multiple of `region`.
        bins = range(0, int(maxPos) + region, region)
        groups = subset.groupby(pd.cut(subset['pos'], bins))
        for sRange, sValues in groups:
            minDepth = sValues.iloc[:, 3:].count().min()
            if minDepth >= qualifiedSite:
                # Pool all non-NaN site values of each group in this window.
                expValue = sValues.loc[:, expgroup]
                expValue2 = [
                    x for sublist in expValue.values for x in sublist
                    if not math.isnan(x)
                ]
                ctrlValue = sValues.loc[:, ctrlgroup]
                ctrlValue2 = [
                    x for sublist in ctrlValue.values for x in sublist
                    if not math.isnan(x)
                ]
                pKS = stats.kstest(expValue2, ctrlValue2)[1]
                pMWU = stats.mannwhitneyu(expValue2, ctrlValue2)[1]
                meanMeth = sValues.iloc[:, 3:].astype(float).mean()
                meanMeth2 = [("%.3f" % x) for x in meanMeth]
                # A tiny random jitter is added to the means before the t-test.
                # NOTE(review): presumably to avoid zero variance -- confirm.
                meanMeth3 = meanMeth + [
                    random.random() * 0.00001 for _ in meanMeth
                ]
                pTTest = 1.0 if meanMeth.sum() == 0 else stats.ttest_ind(
                    meanMeth3.loc[expgroup], meanMeth3.loc[ctrlgroup])[1]
                start = sRange.left
                end = sRange.right
                deltaMean = meanMeth.loc[expgroup].mean(
                ) - meanMeth.loc[ctrlgroup].mean()
                out = [chromosome, start, end
                       ] + meanMeth2 + [deltaMean, pTTest, pKS, pMWU]
                data_holder.append(out)

    merge = pd.DataFrame(data_holder,
                         columns=['Chr', 'Start', 'End'] +
                         samples[0].tolist() +
                         ['DeltaMean', 'pTTest', 'pKS', 'pMWU'])
    tests_methods = {0: 'pTTest', 1: 'pKS', 2: 'pMWU'}
    pvals = merge.loc[:, tests_methods[test_method]]
    sig = merge[pvals <= pvalue]
    sig_all = sig[(sig.DeltaMean >= cutoff) | (sig.DeltaMean <= -1 * cutoff)]
    sig_all.to_csv('DMR_' + context + '_all_' + str(cutoff) + '.txt',
                   sep='\t',
                   index=False)
    sig_all[sig_all.DeltaMean > 0].to_csv('DMR_' + context + '_hyper_' +
                                          str(cutoff) + '.txt',
                                          sep='\t',
                                          index=False)
    sig_all[sig_all.DeltaMean < 0].to_csv('DMR_' + context + '_hypo_' +
                                          str(cutoff) + '.txt',
                                          sep='\t',
                                          index=False)
コード例 #36
0
ファイル: plot.py プロジェクト: hwang-happy/cohorts
def mann_whitney_plot(data,
                      condition,
                      distribution,
                      ax=None,
                      condition_value=None,
                      alternative="two-sided",
                      skip_plot=False,
                      **kwargs):
    """
    Run a Mann Whitney test on `distribution` split by `condition` and,
    unless told otherwise, draw the matching strip/box plot.

    Parameters
    ----------
    data: Pandas dataframe
        Dataframe to retrieve information from

    condition: str
        Column used to split the rows into the two groups

    distribution: str
        Column whose values are compared (Y-axis of the plot)

    ax : Axes, default None
        Axes to plot on

    condition_value:
        If `condition` is not a binary column, split on =/!= to condition_value

    alternative:
        Sidedness of the Mann-Whitney test: "two-sided", "less"
        or "greater"

    skip_plot:
        Calculate the test statistic and p-value, but don't plot.

    Returns
    -------
    MannWhitneyResults holding U, the p-value, the sidedness string, the
    two compared series and the plot (None when `skip_plot` is set).
    """
    condition_mask = get_condition_mask(data, condition, condition_value)
    with_condition = data[condition_mask][distribution]
    without_condition = data[~condition_mask][distribution]

    U, p_value = mannwhitneyu(with_condition,
                              without_condition,
                              alternative=alternative)

    if skip_plot:
        plot = None
    else:
        # The plot is flagged as significant at the 0.05 level.
        plot = stripboxplot(x=condition,
                            y=distribution,
                            data=data,
                            ax=ax,
                            significant=p_value <= 0.05,
                            **kwargs)

    sided_str = sided_str_from_alternative(alternative, condition)
    message = "Mann-Whitney test: U={}, p-value={} ({})".format(U, p_value, sided_str)
    print(message)
    return MannWhitneyResults(U=U,
                              p_value=p_value,
                              sided_str=sided_str,
                              with_condition_series=with_condition,
                              without_condition_series=without_condition,
                              plot=plot)
コード例 #37
0
    ####### Fig 05 b #######
    complexity = pd.read_csv('saved/graph_complexity.csv', header=None).values

    coastalComplexity = complexity[coastalIndices[0], coastalIndices[1] + 1]
    coastalCompelxity = RejectOutliers(
        np.hstack((coastalComplexity, complexity[:, 0])), 2)
    landComplexity = RejectOutliers(
        complexity[landIndices[0], landIndices[1] + 1], 2)
    coralComplexity = RejectOutliers(
        complexity[coralIndices[0], coralIndices[1] + 1], 2)

    highPlanComplexity = RejectOutliers(
        complexity[highPlanIndicies[0], highPlanIndicies[1] + 4], 2)

    PCoastalL = mannwhitneyu(coastalCompelxity, landComplexity)[1]
    PCoralL = mannwhitneyu(landComplexity, coralComplexity)[1]
    PCC = mannwhitneyu(coastalCompelxity, coralComplexity)[1]

    fig, axs = plt.subplots(1, 1, figsize=(3, 7))
    plt.setp(axs.spines.values(), linewidth=2)

    yconferror = np.array([
        list(sms.DescrStatsW(coastalComplexity).tconfint_mean(0.05)),
        list(sms.DescrStatsW(landComplexity).tconfint_mean(0.05)),
        list(sms.DescrStatsW(coralComplexity).tconfint_mean(0.05))
    ])
    ysem = np.array(
        [sem(coastalComplexity),
         sem(landComplexity),
         sem(coralComplexity)])
コード例 #38
0
# Number of rows per value of the 'clase' column
print(dataset.groupby('clase').size())
# Dump every element of Datos, then stop.
# NOTE(review): the sys.exit() below ends the script here, so nothing after
# it runs. It looks like a debugging stop -- confirm.
for i in Datos:
    print i
sys.exit()
# Take the x values: every element except the id (first column) and the
# class (last column)
X = array[:, 1:len(nombres) - 1]
resultados = []
for i in range(len(X)):
    print "###################"
    try:
        X[i][np.isnan(X[i])] = 0
        Y[i][np.isnan(Y[i])] = 0
        Test = [["t estudent", stats.ttest_ind(X[i], Y[i])],
                ["mannwhitneyu",
                 stats.mannwhitneyu(X[i], Y[i])],
                ["kruskal", stats.kruskal(X[i], Y[i])]]
        #        print atributosX[i]
        #        print "valor clase 1 : "+str(X[i])
        #        print "valor clase 2 : "+str(Y[i])

        data_to_plot = [X[i], Y[i]]

        for nombre, test in Test:
            print "datos"
            print "para hipotesis " + str(nombre) + " los datos son :"
            stat, p = test
            print "stat : " + str(stat)
            print "p : " + str(p)
        fig = plt.figure(1, figsize=(9, 6))
        ax = fig.add_subplot(111)
コード例 #39
0


import math

import scipy.stats as st

# Two samples to compare; final_test contains missing values (nan).
final_test=[165.11899394, 167.42154615, 192.18840315, 188.70493079,166.0677011 ,
200.83600747, 186.27117725, 199.42551454, 217.20806414,201.86719385,
 95.62154468, 159.49015417, 126.02467042,nan,nan,
194.70626691, 193.76326486, 181.54441614, 186.17321256,nan]
ephys_test = [149.24336357, 160.1565535 , 127.11240186, 149.11904847,
        127.45482092,
        81.52236443,  94.86429959, 145.35673907, 137.44541069,
        126.06250692,
        60.97663738,  82.70175177,  84.6588944 ,  69.37222471,
         87.05228826,
        81.77171595,  78.89516761,  40.03542436,  85.09368774,
         89.08363123,
        88.02874388, 115.49876754, 118.60709819, 134.69194744,
        117.82368202,
       108.3018049 , 136.78586032, 111.54460503, 132.41798123,
        134.87473783]

# Independent two-sample t-test, ignoring the missing values.
test= st.ttest_ind(final_test,ephys_test,nan_policy='omit')

# Drop the missing values before the rank test as well, so that both tests
# run on the same observations. Filtering by hand does not depend on the
# installed scipy supporting nan_policy for mannwhitneyu.
test1=st.mannwhitneyu([v for v in final_test if not math.isnan(v)],
                      [v for v in ephys_test if not math.isnan(v)])
# NOTE(review): test1 is computed but not written to the report below.


# Write the summary line; the context manager closes the file even if
# building or writing the text raises.
with open(main_folder +"level_3_comparison_reaction_time.txt", 'w') as target:
    target.write(str(mean) +str(stds)+str(test)+ ' LEVEL 3: dst to poke / time to reward in secons (frame*120)*360 mean +- std, reaction_speed_ephys.py')
コード例 #40
0
    if prop_inf > .5:
        ER_ep = ER_ep + 1
        ER_prop_infected.append(prop_inf)

# Run 100 SIR simulations on the PA graph, each seeded with one randomly
# chosen node. Count the runs whose returned value exceeds 0.5 and keep
# those values.
# NOTE(review): presumably SIRmodel returns the proportion of infected
# nodes -- confirm in problem4a.
PA_ep = 0
PA_prop_infected = []
for k in range(100):
    prop_inf = problem4a.SIRmodel(PA, [choice(PA.nodes())], beta, delta)
    if prop_inf > .5:
        PA_ep = PA_ep + 1
        PA_prop_infected.append(prop_inf)

# Number of runs above the 0.5 threshold for each graph
print "ER", ER_ep
print "PA", PA_ep
print "jazz", jazz_ep
print
# ER vs jazz: chi-square test on the 2x2 table of (above-threshold runs,
# remaining runs out of 100), then a Mann-Whitney U test on the retained
# proportions.
chi2 = stats.chi2_contingency([[ER_ep, 100 - ER_ep], [jazz_ep, 100 - jazz_ep]])
print "ER vs jazz X = ", chi2[0], "p = ", chi2[1]
print "ER mean proportion = ", np.mean(ER_prop_infected),
print "jazz mean proportion = ", np.mean(jazz_prop_infected)
mw = stats.mannwhitneyu(ER_prop_infected, jazz_prop_infected)
print "U-statistic = ", mw[0],
print "p = ", mw[1]
print
# PA vs jazz: same two tests
chi2 = stats.chi2_contingency([[PA_ep, 100 - PA_ep], [jazz_ep, 100 - jazz_ep]])
print "PA vs jazz X = ", chi2[0], "p = ", chi2[1]
print "PA mean proportion = ", np.mean(PA_prop_infected),
print "jazz mean proportion = ", np.mean(jazz_prop_infected)
mw = stats.mannwhitneyu(PA_prop_infected, jazz_prop_infected)
print "U-statistic = ", mw[0],
print "p = ", mw[1]
コード例 #41
0
ファイル: pvalue.py プロジェクト: genemine/IsoCell
# For every column i (0 .. gnumber-1) compare each pair of classes (j < k,
# classes numbered 1 .. classes) with a Mann-Whitney U test and store the
# p-value in cellpvalue at row i, one column per class pair.
i = 0
# handle the problematic p-value case (identical groups) explicitly
while i < gnumber:
    count = 0  # column index of the current class pair within row i
    j = 1  # j,k are the class numbers
    while j <= classes:
        k = j + 1
        while k <= classes:
            if group_label.get_group(j)[i].equals(
                    other=group_label.get_group(k)[i]):
                # Both groups hold identical values: record a fixed 0.5
                # instead of running the test.
                cellpvalue.loc[i, count] = 0.5
                count = count + 1
                print(count)
            else:
                u12, pvalue = stats.mannwhitneyu(
                    group_label.get_group(j)[i],
                    group_label.get_group(k)[i])
                cellpvalue.loc[i, count] = pvalue
                count = count + 1
            k = k + 1
        j = j + 1
    i = i + 1

#cellpvalue.to_csv('cellpvalue.csv',sep=',',index=False)

# Keep the smallest pairwise p-value per row, sort ascending and write the
# table to pvalue.csv.
# NOTE: the name pvalue is reused here; it held a scalar inside the loop.
pvalue = pd.DataFrame()
pvalue['targetid'] = targetid
pvalue['min_value'] = cellpvalue.min(axis=1)
pvalue = pvalue.sort_values(by="min_value", ascending=True)
pvalue.to_csv('pvalue.csv', sep=',', index=False)
コード例 #42
0
def make_significance_plot_homogeneity(X,
                                       homogeneity,
                                       Category_str,
                                       my_rosetta,
                                       thecmap='viridis',
                                       NMFCOMPS=16,
                                       save=True,
                                       filename_addon='',
                                       verbose=True,
                                       maxcats=20):
    """Plot, per category, how significantly its homogeneity exceeds the rest.

    The ``maxcats`` most frequent values of ``X[Category_str]`` are each
    tested with a one-sided Mann-Whitney U test (``alternative='greater'``):
    homogeneity of the members of the category versus homogeneity of all
    other entries. Each p-value (plus 1e-30, so the logarithm stays finite)
    is multiplied by the number of categories tested and shown as
    ``-log10`` in a one-row heat map.

    Parameters
    ----------
    X : table indexed by column name; ``X[Category_str]`` holds the labels.
    homogeneity : array indexed with the boolean category mask.
    Category_str : name of the category column; also used for axis label
        and output file name.
    my_rosetta : index applied to ``X[Category_str].values`` to align the
        labels with ``homogeneity``.
    thecmap : colormap passed to ``plt.imshow``.
    NMFCOMPS : unused here; kept for interface compatibility.
    save : if true, save the figure as
        ``<filename_addon><Category_str>MWhom_plot.pdf``.
    filename_addon : prefix for the saved file name.
    verbose : print per-category progress and group means.
    maxcats : maximum number of categories considered.

    Returns
    -------
    (CategoryChartMatrix, categories): the float matrix of ``-log10``
    adjusted p-values (one row per category) and the categories tested.
    """
    CategoryType = X[Category_str].value_counts().keys()[0:maxcats]
    CategoryCats = X[Category_str].values[my_rosetta]

    # Number of categories tested. Computed once up front: it does not change
    # inside the loop and is needed after it even when the loop body never runs.
    Ncats = min(maxcats, len(CategoryType[0:maxcats]))

    list_of_sig = []

    for i, cat in enumerate(CategoryType[0:maxcats]):
        if verbose:
            print('*****************')
            print('Category ', i, cat)
        CatCut = (CategoryCats == cat)
        car = mannwhitneyu(homogeneity[CatCut],
                           homogeneity[~CatCut],
                           alternative='greater')
        if verbose:
            print('mean homogeneity of ', cat, np.mean(homogeneity[CatCut]))
            print('mean homogeneity of anti-', cat,
                  np.mean(homogeneity[~CatCut]))
        adjustedp = (car[1] + 1e-30) * Ncats
        # Row layout: category, member count, -log10(adjusted p)
        list_of_sig.append([cat, len(CatCut[CatCut]), -np.log10(adjustedp)])

    colnames = [Category_str.replace(" ", ""), 'Count'] + ['A']

    CategoryChart = pd.DataFrame(list_of_sig, columns=colnames)
    CategoryChartMatrix = CategoryChart.values[:, 2:].astype(float)

    plt.clf()
    plt.figure(figsize=(len(CategoryType[0:maxcats]) * 2, 6))
    plt.imshow(CategoryChartMatrix.T, cmap=thecmap, vmin=-3, vmax=15)

    plt.xlabel(Category_str, fontsize=25)
    plt.xticks(np.arange(Ncats),
               CategoryType[0:maxcats],
               rotation='vertical',
               fontsize=25)

    # NOTE(review): ticklabel_size does not look like a documented
    # plt.colorbar keyword -- confirm it is accepted by the matplotlib
    # version in use. The tick label size is set explicitly below anyway.
    cbar = plt.colorbar(fraction=0.046, pad=0.04, ticklabel_size=24)
    cbar.set_label(r'- $\log_{10} (p*$' + str(Ncats) + r'$)$', fontsize=25)
    cbar_ax = cbar.ax
    cbar_ax.tick_params(labelsize=35)
    for i in cbar_ax.get_yticklabels():
        i.set_fontsize(35)

    if (save):
        plt.savefig(filename_addon + Category_str + 'MWhom_plot.pdf')
    plt.show()

    return (CategoryChartMatrix, CategoryType[0:maxcats])
コード例 #43
0
    result = sm.OLS(y, X).fit()
    rho, pval = stats.spearmanr(df2["ARM"], df2["IMS"])

    print "TCGA spearman", i, rho, pval
    print result.summary()

    c["TCGA Leuk. Frac."] = -math.log(float(result.pvalues[1]), 10)
    p_1[i] = c

    df3 = df2[df2["TMB"].astype(float) > 0]
    if i in "6p":
        gain = df3[df3["ARM"] == 1]
        lost = df3[df3["ARM"] == -1]
        neu = df3[df3["ARM"] == 0]

        t, prob = mannwhitneyu(gain["IMS"].values, neu["IMS"].values)
        print "gain", i, t, prob
        t, prob = mannwhitneyu(lost["IMS"].values, neu["IMS"].values)
        print "loss", i, t, prob

        df4 = df3.sort(['ARM'])
        df4["ARM"] = df4["ARM"].map({-1: "loss", 1: "gain", 0: "none"})
        df4.to_csv("Analysis/TCR/6p_sm_IMS.source.txt", sep="\t")

        sns.boxplot(x="ARM", y="IMS", data=df4, palette="Set2")
        sns.swarmplot(x="ARM", y="IMS", data=df4, color=".25")
        x1, x2 = 1, 2
        y, h, col = 0.7, 0.025, 'k'
        plt.plot([x1, x1, x2, x2], [y, y + h, y + h, y], lw=1.5, c=col)
        plt.text((x1 + x2) * .5,
                 y + h,
コード例 #44
0
def make_significance_plot_WSO(X,
                               Basis,
                               Category_str,
                               my_rosetta,
                               thecmap='binary',
                               NMFCOMPS=16,
                               save=True,
                               filename_addon='',
                               PCAmode=False,
                               write_mode=False,
                               verbose=True,
                               maxcats=20):
    """Plot a category-by-component heat map of Mann-Whitney significance.

    For each of the (at most ``maxcats``) most frequent values of
    ``X[Category_str]`` and each of the first ``NMFCOMPS`` columns of
    ``Basis``, a one-sided (``alternative='greater'``) Mann-Whitney U test
    compares the rows belonging to the category against all other rows.
    The p-value is multiplied by ``NMFCOMPS * Ncats`` (Bonferroni-style)
    and plotted as ``-log10``.

    Parameters
    ----------
    X : table indexable by ``Category_str`` (a pandas DataFrame is assumed).
    Basis : 2-D array; column ``k`` holds the values of component ``k``.
    Category_str : name of the categorical column in ``X``.
    my_rosetta : indexer applied to ``X[Category_str].values``; presumably
        maps rows of ``X`` onto rows of ``Basis`` -- confirm with callers.
    thecmap : matplotlib colormap name for the heat map.
    NMFCOMPS : number of components (columns of ``Basis``) to test.
    save : when true, save the figure as
        ``filename_addon + Category_str + 'MWplot.pdf'``.
    filename_addon : prefix for every file written.
    PCAmode : only switches the x-axis label text.
    write_mode : when true, write the score table as a tab-separated file.
    verbose : when true, print each category as it is processed.
    maxcats : maximum number of categories to include.

    Returns
    -------
    tuple
        ``(matrix, categories)``: the float matrix of ``-log10`` adjusted
        p-values (one row per category, one column per component) and the
        sorted category labels.
    """
    CategoryType = X[Category_str].value_counts().keys()[0:maxcats]
    CategoryCats = X[Category_str].values[my_rosetta]

    CategoryType = np.sort(CategoryType)

    # Number of categories actually tested.  Computed once, up front: it is
    # loop-invariant and is also needed after the loop for the tick
    # positions and the colour-bar label, even if the loop never runs.
    Ncats = min(maxcats, len(CategoryType[0:maxcats]))

    list_of_sig = []

    for cat_idx, cat in enumerate(CategoryType[0:maxcats]):
        growthlist = [cat]
        if verbose:
            print('*****************')
            print('Category ', cat_idx, cat)
        # Boolean mask of the rows that belong to this category.
        CatCut = (CategoryCats == cat)
        growthlist += [len(CatCut[CatCut])]
        for comp in range(NMFCOMPS):
            car = mannwhitneyu(Basis[:, comp][CatCut],
                               Basis[:, comp][~CatCut],
                               alternative='greater')
            # 1e-30 keeps log10 finite when the p-value underflows to 0.
            adjustedp = (car[1] + 1e-30) * NMFCOMPS * Ncats
            growthlist += [-1 * np.log10(adjustedp)]
        list_of_sig.append(growthlist)

    colnames = [Category_str.replace(" ", ""), 'Count'
                ] + ['Comp' + str(k + 1) for k in range(NMFCOMPS)]

    CategoryChart = pd.DataFrame(list_of_sig, columns=colnames)
    if write_mode:
        CategoryChart.to_csv(filename_addon + Category_str + 'MWmatrix.csv',
                             sep='\t')
    # Drop the label and count columns; what remains is the score matrix.
    CategoryChartMatrix = CategoryChart.values[:, 2:].astype(float)

    plt.clf()

    myfs = 45
    plt.figure(figsize=(35, len(CategoryType[0:maxcats]) * 2))
    plt.imshow(CategoryChartMatrix, cmap=thecmap, vmin=-3, vmax=30)
    if (PCAmode):
        plt.xlabel('Principal Component', fontsize=myfs)
    else:
        plt.xlabel('NMF component', fontsize=myfs)
    plt.ylabel(Category_str, fontsize=myfs)
    plt.yticks(np.arange(Ncats),
               CategoryType[0:maxcats],
               rotation='horizontal',
               fontsize=myfs)
    plt.xticks(np.arange(NMFCOMPS), (np.arange(NMFCOMPS) + 1).astype(str),
               rotation='vertical',
               fontsize=myfs)
    cbar = plt.colorbar(fraction=0.046, pad=0.04)
    cbar.set_label(r'- $\log_{10} (p*$' + str(NMFCOMPS) + r'$*$' + str(Ncats) +
                   r'$)$',
                   fontsize=myfs)
    cbar_ax = cbar.ax
    cbar_ax.tick_params(labelsize=myfs)
    for tick_label in cbar_ax.get_yticklabels():
        tick_label.set_fontsize(myfs)
    if (save):
        plt.savefig(filename_addon + Category_str + 'MWplot.pdf',
                    bbox_inches='tight')
    plt.show()

    return (CategoryChartMatrix, CategoryType[0:maxcats])
コード例 #45
0
def test_one_cell():
    """Build a CTCF-insulator feature for GM12878 gene-DHS pairs and score it.

    The function works entirely through hard-coded paths plus the names
    ``path_origin`` and ``path_label``, which are read here but not defined
    in this function.  Steps:

    1. Load gene-DHS pairs and keep those with ``|distance| > 5000``.
    2. For each pair, write the interval between the promoter and the DHS
       to a temporary file and intersect it (``bedtools intersect``) with
       the cRE lines matching ``Insulator``.
    3. Keep, per (gene, DHS) pair, the largest value of the sixth selected
       bedtools column as ``score_ctcf_insulator`` (0 when nothing
       overlaps) and merge it back onto the pairs.
    4. Merge with the label table, predict ``1`` where the score is
       ``<= 0`` and compute precision and recall of that prediction.
    5. Among predicted positives, run a one-sided Mann-Whitney U test of
       every feature column between labelled and unlabelled pairs and
       write all results to ``<term>_result.txt``.

    Returns None; all output goes to files under ``path_out``.
    """
    path_out = '/lustre/tianlab/zhangyu/PEI/mid_data_correct/try_CTCF'

    term = 'GM12878'

    file_pre = '/lustre/tianlab/zhangyu/PEI/mid_data_correct/cell_line/' \
               'model_input/GM12878/correlation.txt'
    file_cre = os.path.join(path_out, 'cRE_GM12878.txt')
    df_pre = pd.read_csv(file_pre, sep='\t')
    # Keep only pairs whose absolute 'distance' exceeds 5000.
    abs_distance = np.abs(df_pre['distance'])
    df_pre = df_pre.loc[abs_distance > 5000, :]

    file_promoter = path_origin + \
                    '/gene/promoters.up2k.protein.gencode.v19.unique.bed'
    df_promoter = pd.read_csv(file_promoter, sep='\t', header=None)

    # Attach promoter columns 0-2 to every pair; the gene name is matched
    # against promoter column 7.
    # NOTE(review): columns 0/1/2 are presumably chrom/start/end (BED) --
    # confirm against the promoter file.
    file_tmp = os.path.join(path_out, "input_file.tmp")
    df_tmp = df_pre.loc[:, ['gene', 'dhs_id']]
    df_tmp = pd.merge(df_tmp, df_promoter, left_on='gene', right_on=7,
                      how='inner')
    df_tmp = df_tmp.loc[:, ['gene', 'dhs_id', 0, 1, 2]]
    # Columns 3 and 4: the two integers parsed from the "...:<a>-<b>" tail
    # of dhs_id.
    df_tmp[3] = df_tmp['dhs_id'].apply(
        lambda x: int(x.split(':')[-1].split('-')[0]))
    df_tmp[4] = df_tmp['dhs_id'].apply(
        lambda x: int(x.split(':')[-1].split('-')[1]))
    # Columns 5 and 6: bounds of the gap between promoter and DHS.  When
    # the DHS start (3) lies beyond promoter column 2 the gap is
    # (col 2, col 3); otherwise it is (col 4, col 1).
    df_tmp[5] = df_tmp.apply(
        lambda x: x[2] if x[3] > x[2] else x[4], axis=1)
    df_tmp[6] = df_tmp.apply(
        lambda x: x[3] if x[3] > x[2] else x[1], axis=1)
    # df_tmp.loc[(df_tmp['gene'] == 'TADA2B') &
    #            (df_tmp['dhs_id'] == 'DHS<-chr4:7045626-7045765'), :]
    df_tmp_out = df_tmp.loc[:, [0, 5, 6, 'gene', 'dhs_id']]
    df_tmp_out.to_csv(file_tmp, sep='\t', header=None, index=None)

    # Select the cRE lines containing the word 'Insulator' and intersect
    # them with the gap intervals; -wao also reports intervals that have
    # no overlap.
    file_cre_ctcf = os.path.join(path_out, "cRE.CTCF.tmp")
    os.system(f"grep -w 'Insulator' {file_cre} > {file_cre_ctcf}")
    file_ctcf_tmp = os.path.join(path_out, "CTCF.tmp")
    os.system(f"bedtools intersect -a {file_tmp} -b {file_cre_ctcf} -wao | "
              f"cut -f 4,5,9,10,11,17 > {file_ctcf_tmp}")
    # '.' entries are read as NaN.
    df_ctcf = pd.read_csv(file_ctcf_tmp, sep='\t', header=None, na_values='.',
                          dtype={2: 'str', 3: 'str', 4: 'float', 5: 'float'})

    def unique_ctcf(df_in):
        """Reduce one (gene, dhs_id) group to columns 0, 1 and 5.

        A single-row group is kept as is, with a NaN in column 5 replaced
        by 0 (the replacement is written into ``df_in`` itself).  A
        multi-row group keeps only the row(s) whose column 5 equals the
        group maximum.
        """
        if df_in.shape[0] == 1:
            if np.isnan(df_in.iloc[0, 5]):
                df_in.iloc[0, 5] = 0
                df_out = df_in.loc[:, [0, 1, 5]]
            else:
                df_out = df_in.loc[:, [0, 1, 5]]
        else:
            max_ctcf = np.max(df_in[5])
            # df_out = df_in.loc[df_in[5] == max_ctcf, [0, 1, 4, 5]]
            df_out = df_in.loc[df_in[5] == max_ctcf, [0, 1, 5]]

        return df_out

    df_uniq = df_ctcf.groupby([0, 1]).apply(unique_ctcf)
    # Replace the index produced by groupby/apply with a plain 0..n-1 one.
    df_uniq.index = list(range(df_uniq.shape[0]))
    # df_uniq.columns = \
    #     ['gene', 'dhs_id', 'score_dhs_insulator', 'score_ctcf_insulator']
    df_uniq.columns = ['gene', 'dhs_id', 'score_ctcf_insulator']
    # Ties on the maximum can leave identical rows; keep one of each.
    df_uniq = df_uniq.drop_duplicates()
    # Left merge keeps every filtered pair; missing values become 0.
    df_genome_ctcf = pd.merge(df_pre, df_uniq, on=['gene', 'dhs_id'],
                              how='left')
    df_genome_ctcf = df_genome_ctcf.fillna(0)

    # Remove the intermediate files.
    os.remove(file_tmp)
    os.remove(file_cre_ctcf)
    os.remove(file_ctcf_tmp)

    file_out = os.path.join(path_out, f"{term}_input_file.txt")
    df_genome_ctcf.to_csv(file_out, sep='\t', index=None, na_rep='NA')

    # Second stage: the table just written is read back as the feature
    # table.
    file_corr = file_out
    file_label = os.path.join(path_label, f"{term}/{term}.txt")
    file_fea_label = os.path.join(path_out, f"{term}_feature_label.txt")
    file_res = os.path.join(path_out, f"{term}_result.txt")
    label = term

    df_corr = pd.read_csv(file_corr, sep='\t')
    df_label = pd.read_csv(file_label, sep='\t')
    # only select distal enhancer
    df_corr = \
        df_corr.loc[df_corr['type_cre'] != 'Protein-Promoter(Enhancer)', ]
    # if df_label.shape[0] == 0:
    #     return
    # Every row of the label table is a positive example.
    df_label['label'] = np.full(df_label.shape[0], 1)

    # Pairs without a matching label row get label 0 through fillna.
    df_combine = pd.merge(
        df_corr, df_label, how='left',
        on=['gene', 'dhs_id', 'type_cre', 'ref_dhs_id'])
    df_combine = df_combine.fillna(0)

    # first step
    # Predict 1 exactly where the insulator score is <= 0.
    array_pred = np.full(df_combine.shape[0], 0)
    array_pred[df_combine['score_ctcf_insulator'] <= 0] = 1
    df_combine['pred'] = array_pred
    df_combine.to_csv(file_fea_label, sep='\t', index=None)
    precision = precision_score(df_combine['label'], array_pred)
    recall = recall_score(df_combine['label'], array_pred)

    # Feature columns: everything from position 5 up to, but excluding,
    # the last two columns ('label' and 'pred').
    cols = df_combine.columns[5:-2]
    # The first result row reuses the 'diff_median' and 'pval' fields to
    # carry precision and recall of the CTCF-based prediction.
    list_res = [{'feature': 'CTCF_pred', 'correlation': '',
                 'diff_median': precision, 'pval': recall,
                 'label': label}]
    # Remaining tests use only the pairs predicted as 1.
    df_combine_filter = df_combine.loc[df_combine['pred'] == 1, :]
    for col in cols:
        df_sub = df_combine_filter.loc[:, [col, 'label']]
        array_pos = df_sub.loc[df_sub['label'] == 1, col]
        array_neg = df_sub.loc[df_sub['label'] == 0, col]
        # One-sided test (alternative='greater'); a ValueError from the
        # test is recorded as NaN.
        try:
            _, pval = mannwhitneyu(array_pos, array_neg, alternative='greater')
        except ValueError:
            pval = np.nan
        diff_median = np.median(array_pos) - np.median(array_neg)
        # feature, corr = col.split('|')
        list_res.append({'feature': col, 'correlation': 'Spearman',
                         'diff_median': diff_median, 'pval': pval,
                         'label': label})
    df_res = pd.DataFrame(list_res)
    df_res.to_csv(file_res, sep='\t', index=False, na_rep='NA')

    return
コード例 #46
0
Question  12
'''

# Question 12: count the genes whose values differ between the ALL and AML
# selections according to a Mann-Whitney U (rank-sum) test at p < 0.05.
print_header("Question 12")

p_count = 0  # number of genes with p < 0.05
# Row numbers of the significant genes, packed at the front; unused slots
# keep the filler value -1.
index_arr_2 = np.full(len(data), -1, dtype=int )
index_i = 0  # next free slot in index_arr_2

for i in range(0, len(genes)):

    # Row i of `data`, split by the ALL and AML selectors.
    datagene = data[i]
    allgene = datagene[ALL]
    amlgene = datagene[AML]

    u_stat, p_value = st.mannwhitneyu(allgene, amlgene)
    if p_value < 0.05:
        index_arr_2[index_i] = i
        index_i += 1
        p_count += 1

print(f"The Number of genes differentially expressed according to rank-sum test: {p_count}")



'''
Question 13
'''

print_header("Question 13")
コード例 #47
0
def entropy_over_time(
    entropy_df,
    tmpts=(0, 1, 2),
    use_ranksum=False,
    use_one_sided=True,
    path_to_save='/Users/jendawk/Dropbox (MIT)/C Diff Recurrence Paper/Analyses/'
):
    """Test whether entropy changes between consecutive time points.

    For each outcome class ('Recurrer', 'Non-recurrer') and each pair of
    consecutive time points ``(t, t + 1)`` taken from ``tmpts``, the column
    ``'Week t'`` of ``entropy_df[t]`` is compared with the column
    ``'Week t+1'`` of ``entropy_df[t + 1]``.  All p-values that could be
    computed are corrected together with Benjamini-Hochberg FDR and the
    table of raw and corrected values is written as CSV.

    Parameters
    ----------
    entropy_df : mapping from time point to a DataFrame that has an
        'Outcome' column and a 'Week <t>' column.
    tmpts : sequence of time points; consecutive entries are assumed to
        differ by one, because the second table is looked up as
        ``entropy_df[tmpt + 1]``.
    use_ranksum : when true, run ``st.wilcoxon`` (a paired test) on the
        subjects present at both time points; otherwise run
        ``st.mannwhitneyu`` on all subjects of each time point.
    use_one_sided : when true, use alternative 'greater' for the pair
        starting at time point 0 and 'less' for every other pair;
        otherwise 'two-sided'.
    path_to_save : directory prefix of the output CSV.

    Returns
    -------
    pandas.DataFrame
        Columns are (``'Uncorrected'`` | ``'Corrected'``, outcome class),
        rows are the ``(t, t + 1)`` pairs.

    Notes
    -----
    A pair whose test raises ``KeyError`` or ``ValueError`` is reported
    with a warning and left out of both tables.  Corrected p-values are
    matched to their pair explicitly, so a skipped pair cannot shift the
    corrected values onto the wrong pair.
    """
    import warnings

    outcomes = ['Recurrer', 'Non-recurrer']
    # The test name depends only on the mode, so it is known up front and
    # is available for the file name even if every test is skipped.
    tname = 'wilcoxnon_ranksum' if use_ranksum else 'mannwhitney'
    alt = 'two-sided'

    pval = {}
    pvals = []
    tested = []  # (class, pair) of every p-value in pvals, same order
    for cl in outcomes:
        pval[cl] = {}
        for tmpt in tmpts[:-1]:
            entropy_1 = entropy_df[tmpt]
            entropy_2 = entropy_df[tmpt + 1]
            entropy_1 = entropy_1.loc[entropy_1['Outcome'] == cl]
            entropy_2 = entropy_2.loc[entropy_2['Outcome'] == cl]
            if use_one_sided:
                alt = 'greater' if tmpt == 0 else 'less'
            else:
                alt = 'two-sided'
            col_1 = 'Week ' + str(tmpt)
            col_2 = 'Week ' + str(tmpt + 1)
            try:
                if use_ranksum:
                    # Subjects present at both time points.  A list is
                    # used because .loc does not accept a set; the same
                    # list indexes both tables so the samples stay paired.
                    ix_sim = list(
                        set(entropy_1.index.values).intersection(
                            set(entropy_2.index.values)))
                    _, p = st.wilcoxon(entropy_1.loc[ix_sim][col_1],
                                       entropy_2.loc[ix_sim][col_2],
                                       alternative=alt)
                else:
                    _, p = st.mannwhitneyu(entropy_1[col_1],
                                           entropy_2[col_2],
                                           alternative=alt)
            except (KeyError, ValueError) as err:
                warnings.warn(
                    'entropy_over_time: skipping {} {} -> {}: {}'.format(
                        cl, tmpt, tmpt + 1, err))
                continue

            pair = (tmpt, tmpt + 1)
            pval[cl][pair] = p
            pvals.append(p)
            tested.append((cl, pair))

    pval_corr = {cl: {} for cl in outcomes}
    if pvals:
        pvals_corr = multipletests(pvals, alpha=0.05, method='fdr_bh')[1]
        for (cl, pair), p_adj in zip(tested, pvals_corr):
            pval_corr[cl][pair] = p_adj
    # p_df = pd.DataFrame(pval, index = [0]).T
    # p_df.to_csv('paper_figs/entropy_ttest.csv')
    dictionary = {'Uncorrected': pval, 'Corrected': pval_corr}
    reform = {(outerKey, innerKey): values
              for outerKey, innerDict in dictionary.items()
              for innerKey, values in innerDict.items()}
    df_over_time = pd.DataFrame(reform)
    # The file name carries the alternative of the last pair processed.
    df_over_time.to_csv(path_to_save +
                        'Fig3_results/entropy/intra_entropy_' + tname +
                        '_' + alt + '.csv')
    return df_over_time
コード例 #48
0
from scipy.stats import mannwhitneyu

# Differential-expression script: for every gene, compare the "Normal"
# samples with a contrast group using a Mann-Whitney U test and write the
# genes, sorted by p-value, to a tab-separated file.
# The whole table is loaded as strings: row 0 holds sample names and row 1
# group labels (both from column 6 on), column 1 holds gene names (from
# row 2 on), and the remaining cells are parsed as floats.
data = np.loadtxt('/Users/KANG/geneoscopy_dev/data/20170113_nanostring_project_18731/POP_48_samples_011817_PosNormData_lit.txt', dtype=str, delimiter='\t')
samples = data[0,6:]
groups = data[1,6:]
genes = data[2:,1]
expr = np.array(data[2:,6:], dtype=float)

# Column positions of each group within `expr`.
normal_indx = np.where(groups == "Normal")[0]
polyp_indx = np.where(groups == "Polyp")[0]
cancer_indx = np.where(groups == "Cancer")[0]

# Only the combined Polyp+Cancer contrast is active.  The output path below
# is fixed, so each contrast in this list overwrites the previous file.
for contrast_group_indx in [np.append(polyp_indx,cancer_indx)]:
# for contrast_group_indx in [polyp_indx, cancer_indx, np.append(polyp_indx,cancer_indx)]:
	utest_results = []
	for i in range(len(genes)):
		control = expr[i,normal_indx]
		contrast = expr[i,contrast_group_indx]
		utest_stats = mannwhitneyu(control, contrast)
		# Fold change as a ratio of means.
		# NOTE(review): a zero control mean or a non-positive ratio gives
		# inf/nan in log2 -- confirm inputs are strictly positive.
		fc = np.mean(contrast)/np.mean(control)
		utest_results.append([np.log2(fc), utest_stats[1]])
	# Columns: log2 fold change, p-value.
	utest_results = np.array(utest_results)

	# Ascending p-value order.
	indx_sorted = np.argsort(utest_results[:,1])
	# indx_sorted = np.argsort(np.abs(utest_results[:,0]))[::-1]
	# Output columns: gene, log2 fold change, p-value.
	out = np.hstack(( genes[indx_sorted][np.newaxis].T, utest_results[indx_sorted,:]))
	np.savetxt('/Users/KANG/geneoscopy_dev/data/20170113_nanostring_project_18731/POP_48_samples_011817_PosNormData_lit_DE_analysis.txt', out, fmt="%s", delimiter="\t")

	# print "gene", "log2FC", "p-val"

コード例 #49
0
import pandas as pd
import sys
from scipy import stats

# Command-line arguments, in order: sample name, data directory, sgRNA
# name, TF name.
SAMPLE_TMP = sys.argv[1]
DD = sys.argv[2]
sgRNA_tmp = sys.argv[3]
TF = sys.argv[4]

# Input table: <DD>/<sgRNA>/<SAMPLE>_<sgRNA>_<TF>_raw_data.txt
FILE_tmp = DD + '/' + sgRNA_tmp + '/' + SAMPLE_TMP + "_" + sgRNA_tmp + '_' + TF + '_raw_data.txt'
data_tmp = pd.read_table(FILE_tmp)
# Mann-Whitney U test on the 'deviation' column: rows labelled 'sgNTC'
# versus rows labelled with the requested sgRNA.
res_WMW = stats.mannwhitneyu(
    data_tmp['deviation'][data_tmp['sgRNA'] == 'sgNTC'],
    data_tmp['deviation'][data_tmp['sgRNA'] == sgRNA_tmp])
# Pvalue_tmp is not used again in the visible code; the p-value is
# printed directly.
Pvalue_tmp = res_WMW.pvalue
print(sgRNA_tmp + '__' + TF, res_WMW.pvalue)
コード例 #50
0
             style='treatment',
             data=long_data)
plt.savefig('mean_line.svg')
plt.figure(figsize=(4, 3))
sns.lineplot(x='Time(s)',
             y='value',
             hue='treatment',
             style='repetition',
             data=long_data)
plt.savefig('all_line.svg')

## statistical analysis
# Mann-Whitney U test and Wilcoxon test comparing the 'value' column of
# the 'Control' and 'IP10' treatments.

man_s, man_p = mannwhitneyu(
    long_data.value[long_data['treatment'] == 'Control'],
    long_data.value[long_data['treatment'] == 'IP10'])
w_s, w_p = wilcoxon(long_data.value[long_data['treatment'] == 'Control'],
                    long_data.value[long_data['treatment'] == 'IP10'])

alpha = 0.05

# NOTE(review): the decision below uses the Wilcoxon p-value (w_p); the
# Mann-Whitney result (man_s, man_p) is computed but not used in the
# visible code.  scipy's wilcoxon is a paired test and needs both samples
# to have the same length -- confirm which test is intended.
if w_p < alpha:
    print(
        f'The differences are statistically significant with a p value of {w_p}, we reject H0'
    )
else:
    print(
        'Both groups come from a population with the same distribution, we accept H0'
    )
コード例 #51
0
# -*- coding: utf-8 -*-

import math
import random
from scipy import stats

# Mood's test for equality of medians:

stats.median_test(dane_1, dane_2)

# Mann-Whitney U test (non-parametric counterpart of Student's t-test for independent samples):

stats.mannwhitneyu(dane_1, dane_2)

# Wilcoxon test (counterpart of Student's t-test for dependent samples):

stats.wilcoxon(dane_1, dane_2)

# Kruskal-Wallis test (non-parametric counterpart of one-way ANOVA for independent samples):

stats.kruskal(dane_1, dane_2, dane_3)

# Friedman test (non-parametric counterpart of one-way ANOVA for dependent samples):

stats.friedmanchisquare(dane_1, dane_2, dane_3)
コード例 #52
0
#ListMajor = ['Mean','STD','Skewness','Kurtosis','Entropy']
#ListMinor = ['Area','MajorAxis','MinorAxis','AxesRatio','mean_R','mean_G','mean_B','Mean Distance','Max Distance','Min Distance'] 
## Check these 🚩

#for x in ListMajor:
#    for y in ListMinor:
#        labelList.append(x +'_'+ y)
labelList = ['Mean_Area', 'Mean_MajorAxis', 'Mean_MinorAxis', 'Mean_AxesRatio', 'Mean_mean_R', 'Mean_mean_G', 'Mean_mean_B', 'Mean_Mean Distance', 'Mean_Max Distance', 'Mean_Min Distance', 'STD_Area', 'STD_MajorAxis', 'STD_MinorAxis', 'STD_AxesRatio', 'STD_mean_R', 'STD_mean_G', 'STD_mean_B', 'STD_Mean Distance', 'STD_Max Distance', 'STD_Min Distance', 'Skewness_Area', 'Skewness_MajorAxis', 'Skewness_MinorAxis', 'Skewness_AxesRatio', 'Skewness_mean_R', 'Skewness_mean_G', 'Skewness_mean_B', 'Skewness_Mean Distance', 'Skewness_Max Distance', 'Skewness_Min Distance', 'Kurtosis_Area', 'Kurtosis_MajorAxis', 'Kurtosis_MinorAxis', 'Kurtosis_AxesRatio', 'Kurtosis_mean_R', 'Kurtosis_mean_G', 'Kurtosis_mean_B', 'Kurtosis_Mean Distance', 'Kurtosis_Max Distance', 'Kurtosis_Min Distance', 'Entropy_Area', 'Entropy_MajorAxis', 'Entropy_MinorAxis', 'Entropy_AxesRatio', 'Entropy_mean_R', 'Entropy_mean_G', 'Entropy_mean_B', 'Entropy_Mean Distance', 'Entropy_Max Distance', 'Entropy_Min Distance', 'Shape.FSD1', 'Shape.FSD2', 'Shape.FSD3', 'Shape.FSD4', 'Shape.FSD5', 'Shape.FSD6', 'Gradient.Mag.Mean', 'Gradient.Mag.Std', 'Gradient.Mag.Skewness', 'Gradient.Mag.Kurtosis', 'Gradient.Mag.HistEntropy', 'Gradient.Mag.HistEnergy', 'Gradient.Canny.Sum', 'Gradient.Canny.Mean', 'Haralick.ASM.Mean', 'Haralick.ASM.Range', 'Haralick.Contrast.Mean', 'Haralick.Contrast.Range', 'Haralick.Correlation.Mean', 'Haralick.Correlation.Range', 'Haralick.SumOfSquares.Mean', 'Haralick.SumOfSquares.Range', 'Haralick.IDM.Mean', 'Haralick.IDM.Range', 'Haralick.SumAverage.Mean', 'Haralick.SumAverage.Range', 'Haralick.SumVariance.Mean', 'Haralick.SumVariance.Range', 'Haralick.SumEntropy.Mean', 'Haralick.SumEntropy.Range', 'Haralick.Entropy.Mean', 'Haralick.Entropy.Range', 'Haralick.DifferenceVariance.Mean', 'Haralick.DifferenceVariance.Range', 'Haralick.DifferenceEntropy.Mean', 'Haralick.DifferenceEntropy.Range', 'Haralick.IMC1.Mean', 'Haralick.IMC1.Range', 'Haralick.IMC2.Mean', 'Haralick.IMC2.Range', 'Size.Area', 
'Size.MajorAxisLength', 'Size.MinorAxisLength', 'Size.Perimeter', 'Shape.Circularity', 'Shape.Eccentricity', 'Shape.EquivalentDiameter', 'Shape.Extent', 'Shape.MinorMajorAxisRatio', 'Shape.Solidity']
failCount = 0
############# Calculate Stats ##############
for i in range(1, data1.shape[1]): ## Iterating over columns
# for i in range(51, 91): ## Iterating over columns
    list1 = data1.iloc[:,i].values
    list2 = data2.iloc[:,i].values
    try:
        stat, p = mannwhitneyu(list1, list2)
    except ValueError:
        # import pdb; pdb.set_trace()
        print('Defect in feature number '+str(i))
        continue
    statsList.append(stat)
    pList.append(p)

    # print('Statistics=%.3f, p=%.3f' % (stat, p))
    alpha = 0.05
    if p > alpha:
        # print('SAME (fail to reject H0)')
        resultList.append('Same')
        failCount +=1
    else:
#        print('Different distribution (reject H0)')
コード例 #53
0
     plt.xlabel('Complexity')
     plt.ylabel('Reward')
     plt.show()
     plt.savefig('cr.png')
 elif command.strip().lower() == 'lean':
     env = SingleCartPoleEnv()
     env.lean()
 elif command.strip().lower() == 'utest':
     '''执行Mann-Whitney U test'''
     algs = ['neat', 'hyperneat', 'dqn', 'ddqn', 'policy']
     for i, alg1 in enumerate(algs):
         algs2 = algs[i + 1:]
         for j, alg2 in enumerate(algs2):
             complex1, reward1, _ = loadcomplex(alg1, 'noreset')
             complex2, reward2, _ = loadcomplex(alg2, 'noreset')
             u_stat, p_val = stats.mannwhitneyu(reward1, reward2)
             lessthan0_05 = bool(p_val < 0.05)
             print(alg1 + '-' + alg2 + '的u_stat,pvalue为' + str(u_stat) +
                   ',' + str(p_val) + ',p值小于0.05为' + str(lessthan0_05))
 elif command.strip().lower() == 'evolvability':
     #complexityupperlimit =params['upper'] if 'upper' in params.keys() else  2000.0
     complexityupperlimit = params[
         'upper'] if 'upper' in params else 2000.0
     '''采用公式8'''
     algs = ['neat', 'hyperneat', 'dqn', 'ddqn', 'policy']
     evolvability1 = {}
     t = 0.
     for i, alg in enumerate(algs):
         complex, reward, _ = loadcomplex(alg, 'noreset')
         complex = [c for c in complex if c <= complexityupperlimit]
         reward = reward[:len(complex)]
コード例 #54
0
    axBW.plot(pos, D1PopStat, 'o', mec=colorD1, mfc='None', alpha=markerAlpha)
    medline(axBW, np.median(D1PopStat), 1, 0.5)
    axBW.set_ylabel('BW10', fontsize=fontSizeLabels)

    # tickLabels = ['nD1:Str', 'D1:Str']
    tickLabels = [
        'nD1:Str\nn={}'.format(len(nD1PopStat)),
        'D1:Str\nn={}'.format(len(D1PopStat))
    ]
    axBW.set_xticks(range(2))
    axBW.set_xlim([-0.5, 1.5])
    extraplots.boxoff(axBW)
    extraplots.set_ticks_fontsize(axBW, fontSizeTicks)
    axBW.set_xticklabels(tickLabels, fontsize=fontSizeLabels, rotation=45)

    zstat, pVal = stats.mannwhitneyu(nD1PopStat,
                                     D1PopStat)  #Nick used stats.ranksum

    messages.append("{} p={}".format(popStatCol, pVal))

    yDataMax = max([max(D1PopStat), max(nD1PopStat)])
    yStars = yDataMax + yDataMax * starYfactor
    yStarHeight = (yDataMax * starYfactor) * starHeightFactor
    plt.sca(axBW)
    starString = None if pVal < 0.05 else 'n.s.'
    extraplots.significance_stars([0, 1],
                                  yStars,
                                  yStarHeight,
                                  starMarker='*',
                                  starSize=fontSizeStars + 2,
                                  starString=starString,
                                  gapFactor=starGapFactor)
コード例 #55
0
# p2
# print(odd_df_number,
# even_df_number)
# print(odd_df_l0_number,
# odd_df_s0_number,
# even_df_l0_number,
# even_df_s0_number)
# print(odd_df_l0_number+
# odd_df_s0_number+
# even_df_l0_number+
# even_df_s0_number)
# print(df.shape[0])

# In[6]:

p3 = stats.mannwhitneyu(odd_df['search_count'], even_df['search_count']).pvalue
p4 = stats.mannwhitneyu(odd_insdf['search_count'],
                        even_insdf['search_count']).pvalue

# In[7]:


def main():
    #     searchdata_file = sys.argv[1]
    #     df = pd.read_json(searchdata_file, orient = 'records', lines = True)
    # ...

    # Output
    print(
        OUTPUT_TEMPLATE.format(
            more_users_p=p1,
コード例 #56
0
                    print('Numero de genes no esenciales que estan en CERES:', str(len(ceres_genes_linea.loc[essential_in_ceres])))
                    
                    
                    #Añades al data frame de la frecuencia de los genes esenciales, si el gen esta como esencial o no esta.
                    print('Añadiendo al df de la frecuencia de esenciales, presencia o ausencia de gen esencial en el modelo')
                    for gene in ceres_genes_linea.index:
                        if gene in essential_in_ceres:
                            df_frecuencia_esenciales.at[gene,th_l+'_'+th_u] = 1
                        else:
                            df_frecuencia_esenciales.at[gene,th_l+'_'+th_u] = 0
                    
                                                    
                    print('Mann Whitney')
                    x = ceres_genes_linea.loc[essential_in_ceres]
                    y = ceres_genes_linea.loc[non_essential_in_ceres]
                    U, p = mannwhitneyu(x, y, use_continuity=True)
                    
                            
                    #Calculas media de la expresion y del Score Ceres de los genes predichos como esenciales y su suma.
                    df_expresion_t = df_expresion.T
                    expr_genes_linea = df_expresion_t[cell_line]
                            
                    ScoreCeres_genes_predict = ceres_genes_linea.loc[essential_in_ceres]
                    Expr_genes_predict = expr_genes_linea.loc[essential_in_ceres]

                    mean_ScoreCeres_predict = ScoreCeres_genes_predict.mean()
                    mean_Expr_predict = Expr_genes_predict.mean()
                    valor_ceres_suma = ScoreCeres_genes_predict.sum()
                    
                    total_rx = len(csm2.reactions)
                    Number_essential = len(essential)
コード例 #57
0
ax.set_xticks([0, 1, 2])
ax.set_xticklabels([
    'Tagged\nN={}'.format(len(dataTagged)),
    'Close\nUntagged\nN={}'.format(len(dataCloseUntagged)),
    'Far\nUntagged\nN={}'.format(len(dataFarUntagged))
])
extraplots.boxoff(ax)

#0-1
yMin = 0
yMax = 2
yStars = [yMax * 1.1, yMax * 1.2]
yStarHeight = (yMax - yMin) * 0.05
starGapFactor = 0.1
fontSizeStars = 9
zVal, pVal = stats.mannwhitneyu(dataTagged, dataCloseUntagged)
print "{} Tagged vs. close untagged, p={}".format(feature, pVal)
if pVal < 0.05:
    extraplots.new_significance_stars([0, 0.9],
                                      yStars[0],
                                      yStarHeight,
                                      starMarker='*',
                                      fontSize=fontSizeStars,
                                      gapFactor=starGapFactor,
                                      ax=ax)
else:
    extraplots.new_significance_stars([0, 0.9],
                                      yStars[0],
                                      yStarHeight,
                                      starMarker='n.s.',
                                      fontSize=fontSizeStars,
コード例 #58
0
#%% plot enhancer architecture length per age
# Bar plot of the median enhancer length ('enh_len') per 'taxon2', split
# by architecture ('arch'), followed by Mann-Whitney U tests of observed
# versus shuffled lengths for each architecture.
e_colors = ["amber", "faded green"]
e_pal = sns.xkcd_palette(e_colors)
# s_pal, hue_order and order are defined here but not used in the visible
# code.
s_colors = ["greyish", "slate grey"]
s_pal = sns.xkcd_palette(s_colors)

hue_order = ["FANTOM", "Shuffle"]
fig, (ax1) = plt.subplots(figsize=(8, 8))
order = ["Simple", "Complexenh"]
# Rows are sorted by 'mrca_2' before plotting; bar height is the median
# (estimator = np.median).
sns.barplot(y = "enh_len", x = "taxon2",\
data = enh_lens.sort_values(by = "mrca_2"), ax = ax1,\
hue = "arch",  palette = e_pal, estimator = np.median)#showfliers=False)

# Observed vs. shuffled lengths, rows whose 'arch' contains "imple".
ms, msp = stats.mannwhitneyu(
    enh_lens.enh_len.loc[enh_lens.arch.str.contains("imple")],
    shuf_len.enh_len.loc[shuf_len.arch.str.contains("imple")])
print("simple", ms, msp)

# Observed vs. shuffled lengths, rows whose 'arch' contains "omplex".
mc, mcp = stats.mannwhitneyu(
    enh_lens.enh_len.loc[enh_lens.arch.str.contains("omplex")],
    shuf_len.enh_len.loc[shuf_len.arch.str.contains("omplex")])
print("complex", mc, mcp)
ax1.set(ylabel="Enhancer Length (bp)", ylim=(190, 400), xlabel="")
ax1.set_xticklabels(ax1.get_xticklabels(),
                    rotation=90,
                    horizontalalignment="left")

ax1.get_legend().remove()
# RE is used as the output path prefix.
plt.savefig("%sfig2c-Fantom_ENH_MRCA_x_LEN_ENH.pdf" % RE, bbox_inches="tight")
""" RESULTS enhancer lengths v. expected shuffle lengths for simple, complex
コード例 #59
0
# Average the kept cycle features per (group, subject_id) so that each
# subject contributes one row.
df_subjects = df_cycles_burst.groupby(['group', 'subject_id']).mean()[features_keep].reset_index()
print(df_subjects)

####################################################################################################

# Column name -> axis label for the features that are plotted and tested.
feature_names = {'volt_amp': 'Amplitude',
                 'period': 'Period (ms)',
                 'time_rdsym': 'Rise-decay symmetry',
                 'time_ptsym': 'Peak-trough symmetry'}
# One categorical plot per feature, subjects split by group.
for feat, feat_name in feature_names.items():
    g = sns.catplot(x='group', y=feat, data=df_subjects)
    plt.xlabel('')
    plt.xticks(size=20)
    plt.ylabel(feat_name, size=20)
    plt.yticks(size=15)
    plt.tight_layout()
    plt.show()

####################################################################################################
#
# Statistical differences in cycle features
# -----------------------------------------

####################################################################################################

# Mann-Whitney U test of each feature between the 'patient' and 'control'
# groups, using the per-subject averages.
for feat, feat_name in feature_names.items():
    x_treatment = df_subjects[df_subjects['group']=='patient'][feat]
    x_control = df_subjects[df_subjects['group']=='control'][feat]
    U, p = stats.mannwhitneyu(x_treatment, x_control)
    print('{:20s} difference between groups, U= {:3.0f}, p={:.5f}'.format(feat_name, U, p))
def generate_significance_dataframe():
    """Compute and plot pairwise Mann-Whitney p-values between classifiers.

    Reads the per-instance predictions of every classifier from
    ``../SoccerPrediction/Results/pred.csv``, runs a two-sided
    Mann-Whitney U test for every ordered pair of classifiers (including
    each classifier against itself), writes the long-format table to
    ``significance.csv`` in the current directory, then loads
    ``../SoccerPrediction/Results/significance.csv``, pivots it into a
    Class1 x Class2 matrix, prints it and shows it as an annotated heat
    map.

    The file written and the file read back are at different paths.  Per
    the comment in the code, the numbers were rounded in Excel between
    the two steps, so the read-back file is presumably a hand-edited copy
    -- confirm before merging the two paths.

    Returns None.
    """
    # classifiers
    names = [
        'RL', 'ADL', 'ADQ', 'KNN', 'NBG', 'NBM', 'SVML', 'SVMR', 'RF', 'ET',
        'ENS'
    ]

    # CSV with the predictions made by the classifiers (instances)
    # on the test data set
    pred = pd.read_csv("../SoccerPrediction/Results/pred.csv", sep=';')
    df_pred = pd.DataFrame(pred,
                           columns=[
                               'RL', 'ADL', 'ADQ', 'ET', 'KNN', 'NBG', 'NBM',
                               'RF', 'SVML', 'SVMR', 'ENS'
                           ])

    # Convert each prediction column to a list once rather than once per
    # pair.
    dists = {name: df_pred[name].tolist() for name in names}

    # One single-row frame per ordered pair of classifiers.  They are
    # concatenated in one step: DataFrame.append was removed in pandas 2.0
    # and copied the whole table on every call.  pd.concat keeps each
    # row's index label 0, as the repeated append did.
    frames = []
    for class1 in names:
        for class2 in names:
            u, prob = stats.mannwhitneyu(dists[class1], dists[class2],
                                         alternative='two-sided')
            frames.append(
                pd.DataFrame({
                    'Class1': [class1],
                    'Class2': [class2],
                    'p': [prob]
                }))
    df_significance = pd.concat(frames)

    df_significance.to_csv('significance.csv', sep=';')
    # numbers were rounded and scientific notation removed using Excel
    df_significance = pd.read_csv(
        "../SoccerPrediction/Results/significance.csv", sep=';')
    df_significance = pd.DataFrame(df_significance,
                                   columns=['Class1', 'Class2', 'p'])
    # Keyword arguments: positional arguments to pivot were removed in
    # pandas 2.0.
    significance = df_significance.pivot(index='Class1',
                                         columns='Class2',
                                         values='p')
    print(significance)

    f, ax = plt.subplots(figsize=(9, 6))
    sns.heatmap(significance, annot=True, linewidths=.5, ax=ax)
    plt.ylabel('Classificador 1', fontsize=16)
    plt.xlabel('Classificador 2', fontsize=16)
    plt.show()