Esempio n. 1
0
    def ANM_predict_causality(self,train_size=0.5,independence_criterion='HSIC',metric='linear'):
        '''
            Prediction of causality based on the bivariate additive noise model

            A pyGPs.GPR model is fitted in each direction (X -> Y and Y -> X)
            on a training split. The held-out residuals are then scored for
            independence from the held-out input, and the direction with the
            larger score is reported.

            Parameters
            ----------
            train_size :
                forwarded to train_test_split as ``train_size``
            independence_criterion :
                kruskal for Kruskal-Wallis H-test,
                HSIC for Hilbert-Schmidt Independence Criterion
            metric :
                not used by this method; kept so existing callers keep working

            Returns
            -------
            dict with the keys
                causal_direction: 1 if X causes Y, or -1 if Y causes X
                pvalscore: score of the selected direction
                difways: absolute difference between the two scores

            Raises
            ------
            KeyError if independence_criterion is neither 'kruskal' nor 'HSIC'
        '''
        # Wrap each criterion in a callable so that only the requested one is
        # evaluated; the previous dict literal computed both on every call.
        independence_score = {
            'kruskal': lambda residuals, inputs: kruskal(residuals, inputs)[1],
            'HSIC': lambda residuals, inputs: self.HilbertSchmidtNormIC(residuals, inputs)[1],
        }[independence_criterion]

        def held_out_residuals(train_in, train_out, test_in, test_out):
            # Fit a fresh GP on the training pair and return prediction errors
            # on the held-out pair.
            gp = pyGPs.GPR()
            gp.getPosterior(train_in, train_out)
            gp.optimize(train_in, train_out)
            ym, _ys2, _fm, _fs2, _lp = gp.predict(test_in)
            return ym - test_out

        Xtrain, Xtest, Ytrain, Ytest = train_test_split(self.X, self.Y, train_size=train_size)

        # Forward case: predict Y from X
        errors_forward = held_out_residuals(Xtrain, Ytrain, Xtest, Ytest)
        # Backward case: predict X from Y
        errors_backward = held_out_residuals(Ytrain, Xtrain, Ytest, Xtest)

        forward_indep_pval = independence_score(errors_forward, Xtest)
        backward_indep_pval = independence_score(errors_backward, Ytest)

        # NOTE(review): an earlier comment here read "it should be <". The code
        # picks the direction with the LARGER score; confirm this matches what
        # HilbertSchmidtNormIC returns at index 1.
        if forward_indep_pval > backward_indep_pval:
            self.causal_direction = 1
            self.pvalscore = forward_indep_pval
        else:
            self.causal_direction = -1
            self.pvalscore = backward_indep_pval

        return {'causal_direction':self.causal_direction,'pvalscore':self.pvalscore,'difways':abs(forward_indep_pval-backward_indep_pval)}
Esempio n. 2
0
def gene_kruskal(dataframe, grouping, gene, just_tumors=False):
    """Summarise one column per group and test the groups against each other.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table holding both the grouping column and the value column.
    grouping : column label
        Column passed to ``dataframe.groupby``.
    gene : column label
        Column whose values are summarised and compared.
    just_tumors : bool
        Not used by this function; kept for backward compatibility.

    Returns
    -------
    statsummary : dict
        ``{group key: describe() of the group's values as a dict}``.
    pvalues : dict
        ``"Kruskal-Wallis"`` maps to the omnibus p-value over all groups.
        Every ``"<a> vs <b>"`` key maps to the Benjamini-Hochberg adjusted
        p-value of the pairwise Kruskal-Wallis test between groups a and b.
    """
    statsummary = {}
    values = {}
    groups = []

    # Summarize statistics by group
    for key, group in dataframe.groupby(grouping):
        groups.append(key)
        values[key] = group[gene].tolist()
        statsummary[key] = group[gene].describe().to_dict()

    pvalues = {}
    _, omnibus_p = stats.kruskal(*values.values())
    pvalues["Kruskal-Wallis"] = omnibus_p

    # Independent Kruskal-Wallis tests between every pair of groups.
    # str.format lets non-string group keys be used in the label as well.
    k_pvalues = {}
    for position, first in enumerate(groups):
        for second in groups[position + 1:]:
            label = "{} vs {}".format(first, second)
            _, pair_p = stats.kruskal(values[first], values[second])
            k_pvalues[label] = pair_p

    # Benjamini-Hochberg step-up adjustment:
    #   adjusted_(i) = min(1, min_{j >= i} p_(j) * n / j)
    # The running minimum keeps adjusted values monotone in the raw p-values
    # (equal raw p-values get equal adjusted values) and never above 1.
    # http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3263024/
    valid = [label for label in k_pvalues if k_pvalues[label] == k_pvalues[label]]
    undefined = [label for label in k_pvalues if k_pvalues[label] != k_pvalues[label]]
    ranked = sorted(valid, key=k_pvalues.__getitem__)
    n_tests = len(ranked)

    adjusted = {}
    running_min = 1.0
    for rank in range(n_tests, 0, -1):
        label = ranked[rank - 1]
        running_min = min(running_min, k_pvalues[label] * n_tests / rank)
        adjusted[label] = running_min

    # Keep the original insertion order (ascending raw p-value); comparisons
    # whose p-value is NaN stay NaN and are excluded from the correction.
    for label in ranked:
        pvalues[label] = adjusted[label]
    for label in undefined:
        pvalues[label] = k_pvalues[label]

    return statsummary, pvalues
Esempio n. 3
0
def kruskal_scipy_stats_tidy_df_wrapper(tidy_df,
                                        indep_var="sample_id",
                                        dep_var="mean_intensity"):
    """
    Task
    ----
    Perform kruskal wallis to determine if significant difference between groups.

    Input
    -----
    Takes tidy DataFrame, independent variable (str) and dependent variable (str).
    Rows are grouped by ``indep_var``; the ``dep_var`` values of each group form
    one sample.

    Returns
    -------
    statistic : float
    The Kruskal-Wallis H statistic, corrected for ties.

    pvalue : float
    The p-value for the test using the assumption that H has a chi square distribution.
    """

    # Iterate over the groups themselves instead of re-selecting rows by index
    # label: label lookup returns the wrong rows when the index is not unique.
    samples = [
        group[dep_var].values
        for _, group in tidy_df.groupby(indep_var)
    ]

    return stats.kruskal(*samples)
Esempio n. 4
0
def test(phenotypes, phen_dict, exp_d, phenotype_datatype, use_parametric):

    assert phenotype_datatype in ['binary', 'categorical', 'continuous'], 'Phenotype with unsupported data type'

    assert not(phenotype_datatype == 'binary' and len(phen_dict) > 2), \
        'Phenotype data type is binary but phenotype has more than two unique values'

    group_data = {}
    phen_arr = np.array(phenotypes)
    for phen in phen_dict:
        group_data[phen] = exp_d[:, phen_arr == phen]

    n_genes = exp_d.shape[0]
    z = np.zeros((n_genes, 2), dtype=np.float)
    for gene_index in range(n_genes):
        args = [np.transpose(group_data[phen][gene_index]) for phen in phen_dict]

        if use_parametric:
            if phenotype_datatype == 'binary':
                z[gene_index] = stats.ttest_ind(*args, equal_var=False)
            elif phenotype_datatype == 'categorical':
                z[gene_index] = stats.f_oneway(*args)
        else:
            if phenotype_datatype == 'binary':
                z[gene_index] = stats.mannwhitneyu(*args, alternative='two-sided')
            elif phenotype_datatype == 'categorical':
                z[gene_index] = stats.kruskal(*args)

    z1 = z[:, 1]
    z1[np.isnan(z1)] = 1
    z1 = [np.nan_to_num(v) for v in z1]

    return z1
Esempio n. 5
0
def get_num_p_value(obs_list) -> tuple:
    """Pick a significance test for the samples and format its p-value.

    ``check_norm_distribute(*obs_list)`` decides between the parametric and the
    rank-based test. Two samples use the t-test or the Wilcoxon rank-sum test;
    any other number of samples uses one-way ANOVA or the Kruskal-Wallis H-test.

    Returns
    -------
    tuple
        ``(result, p_value)``. ``result`` is the p-value with four decimals
        (``<0.0001`` below that threshold), followed by ``*`` for
        0.01 <= p < 0.05 or ``**`` for p < 0.01, and the test name in brackets.
        ``p_value`` is the raw p-value.
    """
    normal = check_norm_distribute(*obs_list)
    if len(obs_list) == 2:
        if normal:
            method = 'T-test'
            _, p_value = ttest_ind(obs_list[0], obs_list[1])
        else:
            method = 'Wilcoxon rank-sum'
            _, p_value = ranksums(obs_list[0], obs_list[1])
    elif normal:
        method = 'One-way ANOVA'
        _, p_value = f_oneway(*obs_list)
    else:
        method = 'Kruskal-Wallis H-test'
        _, p_value = kruskal(*obs_list)

    if p_value < 0.01:
        sig_rank = '**'
    elif p_value < 0.05:
        sig_rank = '*'
    else:
        sig_rank = ''

    if p_value < 0.0001:
        result = '<0.0001{} ({})'.format(sig_rank, method)
    else:
        result = '{:.4f}{} ({})'.format(float(p_value), sig_rank, method)
    return result, p_value
Esempio n. 6
0
def snapshots(data, indices,basepath=None, data_label='data'):
    """Save one comparison histogram per pair of consecutive column indices.

    For every consecutive pair ``(start, stop)`` in ``indices`` the columns
    ``data[:, start]`` and ``data[:, stop]`` are drawn as overlaid histograms.
    The plot is annotated with the Euclidean norm of their difference and the
    Kruskal-Wallis p-value, and written to
    ``<basepath>/<data_label>-compare-<start>-<stop>.png``.

    Parameters
    ----------
    data : 2-D array, indexed as ``data[:, column]``
    indices : sequence of column indices
    basepath : directory for the PNG files; ``None`` means the current
        working directory
    data_label : used for the x-axis label and the file name
    """
    # os.path.join(None, ...) raises TypeError, so the default basepath was
    # unusable; an empty prefix keeps the file name relative to the cwd.
    out_dir = '' if basepath is None else basepath

    for start_idx, stop_idx in zip(indices, indices[1:]):
        initial_distribution = data[:, start_idx]
        final_distribution = data[:, stop_idx]

        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.hist(initial_distribution, color='r', alpha=0.5, bins=20, label='Initial', range=(-1, 1))
        ax.hist(final_distribution, color='k', alpha=0.5, bins=20, label='Final', range=(-1, 1))
        artist.adjust_spines(ax)
        ax.set_xlabel(artist.format(data_label))
        ax.set_ylabel(artist.format('Prevalence'))

        _, p = kruskal(initial_distribution, final_distribution)
        effect_size = np.linalg.norm(final_distribution - initial_distribution)
        # Raw string: same text as before, without invalid "\L" / "\;" escapes.
        ax.annotate(r'\Large $d=%.02f, \; p=%.04f$' % (effect_size, p), xy=(.3, .9),
                    xycoords='axes fraction', horizontalalignment='right', verticalalignment='top')
        plt.tight_layout()
        plt.legend(frameon=False)

        filename = os.path.join(out_dir, '%s-compare-%d-%d.png' % (data_label, start_idx, stop_idx))
        plt.savefig(filename, dpi=300)
        plt.close()
Esempio n. 7
0
    def _is_drifting(self):

        y_pred_range = len(self._new_values)
        n_steps = self._config.n_steps

        new_dist = np.concatenate([self._train.y.values[-n_steps+y_pred_range:], self._new_values.y.values])
        old_dist = self._train.y.values[-self._yearly_freq+y_pred_range:-self._yearly_freq+y_pred_range+n_steps]

        print("Length's: " + str(len(new_dist)) + ", " + str(len(old_dist)))
        _new = np.concatenate([self._train.index.values[-n_steps+y_pred_range:], self._new_values.index.values])
        _old = self._train.index.values[-self._yearly_freq+y_pred_range:-self._yearly_freq+y_pred_range+n_steps]
        print("Ranges: " + str(min(_new)) + " - " + str(max(_new)) + ", " + str(min(_old)) + " - " + str(max(_old)))

        stat, p = kruskal(old_dist, new_dist)

        if self._config.verbose > 1:
            print('Statistics=%.3f, p=%.3f' % (stat, p))

        alpha = 0.05 # TODO: add in class as param
        if p > alpha:
            if self._config.verbose > 1:
                print('Same distributions (fail to reject H0)')
            return False

        else:
            if self._config.verbose > 1:
                print('Different distributions (reject H0)')
            return True
Esempio n. 8
0
def solve(problem, cloning_param, mutation):
    """Run CloneAlg repeatedly, box-plot the recorded data and print rank tests.

    The algorithm is run ``repetitions`` times on ``problem(dim)`` with
    ``mutation(mut_pb, 20)``. ``dim``, ``mut_pb`` and ``repetitions`` are read
    from the enclosing module. Each run records its data through a
    DataObserver. The transposed records are shown as a box plot, and the
    first, second and last transposed rows are compared with a Kruskal-Wallis
    test followed by a Holm-adjusted Dunn post-hoc test; both results are
    printed.

    Parameters
    ----------
    problem : callable, invoked as ``problem(dim)``
    cloning_param : forwarded to ``CloneAlg``
    mutation : callable, invoked as ``mutation(mut_pb, 20)``
    """
    final_problem = problem(dim)
    final_mutation = mutation(mut_pb, 20)

    final_data = []
    for _ in range(repetitions):
        algorithm = CloneAlg(
            problem=final_problem,
            population_size=100,
            offspring_population_size=100,
            mutation=final_mutation,
            cloning_param=cloning_param,
            termination_criterion=StoppingByEvaluations(max_evaluations=5000))
        data = []
        dataobserver = DataObserver(1.0, data)
        algorithm.observable.register(observer=dataobserver)
        algorithm.run()
        final_data.append(data)

    trans_list = np.array(final_data).T.tolist()

    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_axes([0, 0, 1, 1])
    ax.boxplot(trans_list)
    plt.title(
        "Problem: {0} benchmark, dim: {1}, cloning_param: {2}, mutation: {3}".
        format(final_problem.get_name(), dim, algorithm.get_cloning_param(),
               final_mutation.get_name()))
    plt.show()

    # Kruskal-Wallis and Dunn tests
    samples = [trans_list[0], trans_list[1], trans_list[-1]]
    print(stats.kruskal(*samples))
    # The Dunn result used to be computed and dropped; report it as well.
    print(sp.posthoc_dunn(samples, p_adjust='holm'))
def save_mni_kruskall_table_csv(csv_path: str,
                                experiments_list: List[ExperimentLoader],
                                alpha=0.01):
    """Write one Kruskal-Wallis verdict mark per snapshot to ``csv_path``.

    For every snapshot index ``i`` the i-th column of each experiment's
    ``get_mni_matrix()`` is one sample. The marks are concatenated into a
    single string, written to the file and printed:

    * U+2714 - p <= alpha, the null hypothesis is rejected
    * U+2716 - the null hypothesis cannot be rejected
    * U+2592 - ``kruskal`` raised ValueError (e.g. all values identical)

    Parameters
    ----------
    csv_path : output file path
    experiments_list : experiments to compare; the snapshot count is read
        from the first one (``dataset['snapshot_count']``)
    alpha : significance level
    """
    n_snp = experiments_list[0].dataset['snapshot_count']
    mni_exp_list = [exp.get_mni_matrix() for exp in experiments_list]

    marks = []
    for i in range(n_snp):
        data = [nmi[:, i] for nmi in mni_exp_list]
        try:
            _, p = kruskal(*data)
        except ValueError:
            # if all the values are the same then don't reject the null hypothesis
            marks.append("\u2592")
        else:
            # reject the null hypothesis when p <= alpha
            marks.append("\u2714" if p <= alpha else "\u2716")

    table = "".join(marks)

    # The marks are non-ASCII; an explicit encoding keeps the write from
    # failing on platforms whose default encoding cannot represent them.
    with open(csv_path, 'w', encoding='utf-8') as f:
        f.write(table)

    print(table)
def _save_default_kruskall_hypothesis_text(
        text_path: str, experiments_matrix: List[List[ExperimentLoader]],
        labels: List[str], datasets: List[str], alpha: float,
        data_method: Callable[[List[ExperimentLoader]], List[np.array]]):
    """Print and save one Kruskal-Wallis verdict mark per row of the matrix.

    ``data_method`` turns each row of ``experiments_matrix`` into the samples
    passed to ``kruskal``. The marks are:

    * U+2714 - p <= alpha, the null hypothesis is rejected
    * U+2716 - the null hypothesis cannot be rejected
    * U+2592 - ``kruskal`` raised ValueError (e.g. all values identical)

    Parameters
    ----------
    text_path : output file path
    experiments_matrix : one list of experiments per dataset
    labels : one label per column of ``experiments_matrix``
    datasets : one name per row of ``experiments_matrix``
    alpha : significance level
    data_method : extracts the samples to compare from one row

    Raises
    ------
    AssertionError
        When the matrix dimensions do not match ``datasets`` / ``labels``.
    """
    assert len(experiments_matrix) == len(
        datasets
    ), "first dimension of experiment matrix must have the same length as datasets"
    assert len(experiments_matrix[0]) == len(
        labels
    ), "second dimension of experiment matrix must have the same length as labels"

    marks = []
    for exp_list in experiments_matrix:
        data = data_method(exp_list)
        try:
            _, p = kruskal(*data)
        except ValueError:
            # if all the values are the same then don't reject the null hypothesis
            marks.append("\u2592")
        else:
            # reject the null hypothesis when p <= alpha
            marks.append("\u2714" if p <= alpha else "\u2716")

    summary = "".join(marks)

    print(summary)
    # The marks are non-ASCII; an explicit encoding keeps the write from
    # failing on platforms whose default encoding cannot represent them.
    with open(text_path, 'w', encoding='utf-8') as f:
        f.write(summary)
def run_stats(input_df):
    """Run the Kruskal-Wallis H test followed by a Conover post-hoc test.

    Kruskal-Wallis is analogous to one-way ANOVA for non-parametric data. Every
    column of ``input_df`` except ``'huc8'`` is one sample. Columns that are
    entirely NaN are dropped, ``inf`` is treated as NaN and NaNs are omitted
    from both tests. NOTE that the post-hoc result should only be used when
    the omnibus test is significant.

    Returns
    -------
    tuple or None
        ``(H, p, conover)``, where ``conover`` is the Holm-adjusted post-hoc
        table labelled with the tested column names. ``None`` when there is no
        data to test or when the post-hoc test raises (the error is printed).
    """
    #deal with cases where all vals in a col are nan
    input_df = input_df.dropna(axis=1, how='all')
    #set inf to nan
    input_df = input_df.replace(np.inf, np.nan)

    if input_df.isnull().all().all():
        return None

    # 'huc8' is excluded from the samples, so it must also be excluded from
    # the labels of the post-hoc table; otherwise the lengths do not match.
    value_columns = [
        column for column in input_df.columns if not column == 'huc8'
    ]
    data = [input_df[column].to_numpy() for column in value_columns]

    #run the kruskal-wallis
    H, p = stats.kruskal(*data, nan_policy='omit')
    try:
        #run the post-hoc test on the same observations kruskal used (no NaNs)
        posthoc_data = [
            input_df[column].dropna().to_numpy() for column in value_columns
        ]
        conover = sp.posthoc_conover(posthoc_data, p_adjust='holm')
        conover.columns = value_columns
        conover.index = value_columns

        return H, p, conover

    except Exception as e:
        print('Error is: ', e)
        return None
Esempio n. 12
0
def summarize_he( analytical_sets ):
    """Collect ``calculate_he`` results per analytical set and compare them.

    One column per ``analytical_set.label`` is built from
    ``calculate_he(analytical_set.allele_df)``. Exactly two columns are
    compared with the paired Wilcoxon test, more than two with the
    Kruskal-Wallis test; otherwise no test is run.

    Returns a dict with 'data', 'mean' and 'stddev', plus 'test' and 'stats'
    (and an empty 'warning' for Kruskal-Wallis) when a test was run.
    """
    he_df = DataFrame({
        analytical_set.label: calculate_he(analytical_set.allele_df)
        for analytical_set in analytical_sets
    })
    labels = list(he_df.columns)
    n_sets = len(labels)

    results = {}
    if n_sets == 2:
        # paired comparison of the two columns
        first, second = labels
        results['test'] = 'Wilcoxon test (paired)'
        results['stats'] = wilcoxon(he_df[first], he_df[second])
    elif n_sets > 2:
        # rank-based comparison across all columns
        results['test'] = 'Kruskal-Wallis test'
        results['stats'] = kruskal(*(he_df[label] for label in labels))
        results['warning'] = ''

    results['data'] = he_df
    results['mean'] = he_df.mean()
    results['stddev'] = he_df.std()

    return results
Esempio n. 13
0
def test_KW(df, control, stats_table, col='logdwell'):

    kw = []
    pv_ori = []
    pv = []
    pv1 = []
    L = len(np.sort(df.pos.unique()))
    x = np.linspace(0, L, L)

    for pos in range(0, L):
        indx = df['pos'] == pos
        df_indx = df[indx]
        kmer = df_indx['kmer'].iloc[0]
        indx = control['kmer'] == kmer
        df_control_indx = control[indx]

        if len(df_control_indx) > 0:
            df_indx_dwell = df_indx[col]
            df_indx_dwell.reset_index(drop=True, inplace=True)
            df_control_indx_dwell = df_control_indx[col]
            df_control_indx_dwell.reset_index(drop=True, inplace=True)
            kw_results = stats.kruskal(df_indx_dwell, df_control_indx_dwell)
            kw.append(kw_results[0])
            pv_ori.append(kw_results[1])
        else:
            kw.append(0)
            pv_ori.append(0)

    stats_table['KW_' + col] = kw
    print(stats_table.head())

    return stats_table
Esempio n. 14
0
def rankTest(arg):
    """Return rank-test p-values for the three samples stored under ``data[arg]``.

    Samples are read from ``data[arg][1]``, ``data[arg][2]`` and
    ``data[arg][3]``. The result lists the Kruskal-Wallis p-value over all
    three, then the Mann-Whitney U p-values for 1 vs 2, 1 vs 3 and 2 vs 3.
    """
    first, second, third = data[arg][1], data[arg][2], data[arg][3]
    pairs = ((first, second), (first, third), (second, third))

    ou = [stats.kruskal(first, second, third)[1]]
    ou.extend(stats.mannwhitneyu(left, right)[1] for left, right in pairs)
    return ou
Esempio n. 15
0
def kruskal2df(a, b):
    """Kruskal-Wallis test of ``b`` grouped by the labels in ``a``.

    ``a`` and ``b`` must have the same length. One sample is built per unique
    value of ``a`` from the matching positions of ``b``. Returns a one-row
    DataFrame (index 0) with the columns 'stat' and 'pval'.
    """
    assert len(a) == len(b)
    samples = []
    for level in np.unique(a):
        positions = np.nonzero(a == level)[0]
        samples.append(b[positions])
    statistic, pvalue = stats.kruskal(*samples)[:2]
    return pd.DataFrame({'stat': statistic, 'pval': pvalue}, index=[0])
Esempio n. 16
0
def kruskal_test(benchmark_snapshot_df):
    """Return the Kruskal-Wallis p-value of 'edges_covered' across fuzzers.

    Rows are grouped by the 'fuzzer' column; each group's 'edges_covered'
    values form one sample.
    """
    coverage_by_fuzzer = benchmark_snapshot_df.groupby('fuzzer')['edges_covered']
    samples = coverage_by_fuzzer.apply(list).values
    return ss.kruskal(*samples)[1]
def conover_inman_procedure(data, alpha=0.05):
    """Pairwise Conover-Iman comparisons after a Kruskal-Wallis test.

    Two columns i and j differ significantly when

        |mean_rank_i - mean_rank_j| >
            t(1 - alpha/2, N - k) * sqrt(S^2 * (N - 1 - H) / (N - k) * 2 / n)

    where ``n`` is the number of rows, ``k`` the number of columns,
    ``N = n * k``, ``H`` the Kruskal-Wallis statistic and ``S^2`` the variance
    of the pooled ranks.

    Parameters
    ----------
    data : pandas.DataFrame
        One column per algorithm, one row per run (all columns have the same
        length). At least two rows are required.
    alpha : float
        Two-sided significance level.

    Returns
    -------
    pandas.DataFrame
        Square table indexed and labelled by ``data.columns``. Off-diagonal
        cells hold True/False (significant or not); the diagonal is left
        unset (NaN).
    """
    num_runs = len(data)
    num_algos = len(data.columns)
    N = num_runs * num_algos

    # The procedure needs the H statistic itself, not its p-value.
    h_stat, _ = stats.kruskal(*[data[col] for col in data.columns])

    # Pooled ranks, laid out column after column.
    ranked = stats.rankdata(np.concatenate([data[col] for col in data.columns]))
    mean_ranks = ranked.reshape(num_algos, num_runs).mean(axis=1)

    S_sq = (np.sum(ranked ** 2) - N * ((N + 1) ** 2) / 4) / (N - 1)

    dof = N - num_algos
    # t.ppf is the quantile function; t.cdf would return a probability.
    # max() guards against a tiny negative value from rounding.
    critical_difference = stats.t.ppf(1 - (alpha / 2), dof) * math.sqrt(
        S_sq * (max(N - 1 - h_stat, 0.0) / dof) * 2 / num_runs
    )

    res = pd.DataFrame(columns=data.columns, index=data.columns)

    for i, j in itertools.combinations(range(num_algos), 2):
        significant = bool(abs(mean_ranks[i] - mean_ranks[j]) > critical_difference)
        res.iloc[j, i] = significant
        res.iloc[i, j] = significant
    return res
Esempio n. 18
0
def getStats(tData, datasetLabels, param, labels, pNormMin, verbose=False):
    """Build a table of normality, omnibus and multiple-comparison p-values.

    For every entry of ``tData`` (a list of samples) each sample is checked
    with ``stats.normaltest``. When the smallest normality p-value is below
    ``pNormMin`` the Kruskal-Wallis test and ``getKWmultiComp`` are used,
    otherwise one-way ANOVA and ``getOWANOVAmultiComp``. Progress is printed.

    Parameters
    ----------
    tData : list of lists of samples, one inner list per entry of ``labels``
    datasetLabels : the first three entries name the compared datasets
    param : parameter name used in the row labels
    labels : one label per entry of ``tData``
    pNormMin : normality threshold below which the rank-based test is used
    verbose : forwarded to the multiple-comparison helpers

    Returns
    -------
    list of rows (lists): two header rows, then for every entry a label row,
    the normality p-values, the omnibus test with its p-value, the
    multiple-comparison p-values and an empty separator row.
    """
    c, e1, e2 = datasetLabels[0], datasetLabels[1], datasetLabels[2]
    statsData = [
        ['Test and Parameter', 'p-Value', 'p-Value', 'p-Value'],
        ['', c + ' vs. ' + e1, c + ' vs. ' + e2, e1 + ' vs. ' + e2],
    ]
    # range() and single-argument print(...) run on Python 2 and 3 alike and
    # emit the same text the former print statements did.
    for i in range(len(tData)):
        label = '---' + param + '_' + labels[i] + '---'
        print(label)
        normP = [stats.normaltest(sample)[1] for sample in tData[i]]
        if min(normP) < pNormMin:
            testUsed = 'Kruskal-Wallis test'
            _, statsP = stats.kruskal(*tData[i])
            print('%s pValue: %s ---' % (testUsed, statsP))
            multiCompP = getKWmultiComp(tData[i], datasetLabels, verbose)
        else:
            testUsed = 'One Way ANOVA'
            _, statsP = stats.f_oneway(*tData[i])
            print('%s pValue: %s' % (testUsed, statsP))
            multiCompP = list(
                getOWANOVAmultiComp(tData[i], datasetLabels, verbose))
        statsData.append([label])
        statsData.append(['normalityTestStats'] + normP)
        statsData.append([testUsed, statsP])
        statsData.append(['MultipleComparisons p-Value'] + multiCompP)
        statsData.append([])
    return statsData
def rank_sum(x, targets, method='ranksum', cutoff=.05):
    """Test every column of ``x`` for a difference between the two target classes.

    Parameters
    ----------
    x : pandas.DataFrame
        One feature per column, one observation per row.
    targets : sequence
        Class of every row. String targets are converted to 1.0 for
        'Recurrer' and 0.0 otherwise; numeric targets are used as given
        (1 = first group, 0 = second group).
    method : {'ranksum', 'kruskal', 'ttest'}
        Two-sample test applied to each column.
    cutoff : float
        Alpha passed to the Benjamini-Hochberg correction.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column names of ``x`` with the columns 'P_Val',
        'BH corrected' and 't-stat', sorted by ascending 'P_Val'.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method not in ('ranksum', 'kruskal', 'ttest'):
        raise ValueError("unknown method: %r" % (method,))

    if isinstance(targets[0], str):
        targets = (np.array(targets) == 'Recurrer').astype('float')
    else:
        targets = np.array(targets)

    # Convert once instead of once per column.
    values = np.array(x)
    in_first = targets == 1
    in_second = targets == 0

    pval = []
    teststat = []
    for i in range(values.shape[1]):
        column = values[:, i]
        X = column[in_first]
        Y = column[in_second]
        if method == 'ranksum':
            s, p = st.ranksums(X, Y)
        elif method == 'kruskal':
            try:
                s, p = st.kruskal(X, Y)
            except Exception:
                # Degenerate input (e.g. all values identical): report "no
                # difference" and an undefined statistic instead of reusing
                # the statistic of the previous column.
                s, p = np.nan, 1
        else:
            s, p = st.ttest_ind(X, Y)
        pval.append(p)
        teststat.append(s)

    pval = np.array(pval)
    pval[np.isnan(pval)] = 1

    reject, corrected, a1, a2 = multipletests(pval, alpha=cutoff, method='fdr_bh')
    df = pd.DataFrame(np.vstack((pval, corrected, teststat)).T,
                      columns=['P_Val', 'BH corrected', 't-stat'],
                      index=x.columns.values)

    return df.sort_values('P_Val', ascending=True)
 def test_kruskalWallis_hResult(self):
     """The H statistic of kruskal_wallis_test matches scipy's kruskal."""
     samples = (
         [27, 2, 4, 18, 7, 9],
         [20, 8, 14, 36, 21, 22],
         [34, 31, 3, 23, 30, 6],
     )
     own_h, _own_p = kruskal_wallis_test(*samples)
     reference_h, _reference_p = kruskal(*samples)
     assert pytest.approx(own_h) == reference_h
Esempio n. 21
0
def extractAssessmentResultOfCommunities(community, assessment, column):
    """Summarise ``column`` of ``assessment`` for every community partition.

    ``community`` is a list of partitions; each partition is a list of
    communities, each given as the index labels of its members. For every
    partition the result holds the number of communities, the
    ``(mean, std)`` of ``column`` per community and the per-community value
    Series. For partitions with exactly five communities the one-way ANOVA,
    Levene and Kruskal-Wallis results are inserted before the Series list,
    each as a ``(statistic, p-value)`` pair.
    """
    result = []
    for cSize in community:
        groups = [
            assessment.loc[assessment.index.isin(members)][column]
            for members in cSize
        ]
        extractedResult = [(values.mean(), values.std()) for values in groups]

        if len(groups) != 5:
            result.append([len(cSize), extractedResult, groups])
            continue

        # The omnibus tests are only run for partitions of exactly five groups.
        f, p = f_oneway(*groups)
        L, pL = stats.levene(*groups)
        fk, pk = stats.kruskal(*groups)
        result.append([
            len(cSize), extractedResult, (f, p), (L, pL), (fk, pk), groups
        ])
    return result
Esempio n. 22
0
 def non_parametric_tests(self, data1, data2, test_type):
     """Run the named non-parametric test on two samples and print a verdict.

     H0: the distributions of both samples are equal.
     H1: the distributions of both samples are not equal.

     ``test_type`` is one of 'mannwhitneyu', 'wilcoxon', 'kruskal' or
     'friedmanchisquare'; any other value does nothing. The verdict is printed
     using a 0.05 threshold on the p-value; nothing is returned.
     """
     # Resolve the test function first; the verdict logic is shared.
     if test_type == 'mannwhitneyu':
         run_test = mannwhitneyu
     elif test_type == 'wilcoxon':
         run_test = wilcoxon
     elif test_type == 'kruskal':
         run_test = kruskal
     elif test_type == 'friedmanchisquare':
         run_test = friedmanchisquare
     else:
         return

     _stat, p = run_test(data1, data2)
     if p > 0.05:
         verdict = 'Probably the same distribution'
     else:
         verdict = 'Probably different distributions'
     print(verdict)
Esempio n. 23
0
def test_once(df_orig, df_impute, test='wilcoxon'):
    '''
    Input:
        df_orig: The original dataset with missing value
        df_impute: The dataset after imputation
        test: The statistics test used    
    Output:
        A numpy array containing the p-values of the tests on each column in the column order
    '''
    cols = df_orig.columns
    pvals = np.array([])

    if test == 'wilcoxon':
        for c in cols:
            try:
                stat, pval = wilcoxon(df_orig[c], df_impute[c])
                pvals = np.append(pvals, pval)
            except:
                pvals = np.append(pvals, 0)

    if test == 'kruskal':
        for c in cols:
            stat, pval = kruskal(df_orig[c], df_impute[c], nan_policy='omit')
            pvals = np.append(pvals, pval)

    return pvals
Esempio n. 24
0
def statistic_tests(name_of_file):
    """Read two columns from a file and print the results of several tests.

    The rows returned by ``read_file(name_of_file)`` are split into two
    tables by ``fill_tables``. Each test is announced by a title line followed
    by its result: either one result for the pair of tables, or one result
    per table.
    """
    tab1 = []
    tab2 = []
    fill_tables(tab1, tab2, read_file(name_of_file))

    def show_pairwise(title, run_test):
        # One test applied to both tables together.
        print(title)
        print(run_test(tab1, tab2))

    def show_per_column(title, tag, run_test):
        # One test applied to each table on its own.
        print(title)
        print(tag + ' column 1:', run_test(tab1), 'column 2:', run_test(tab2))

    show_per_column('Rank-Sum', 'ranksum', rank_sum)
    show_pairwise('Kruskal', kruskal)
    show_pairwise('ANOVA', f_oneway)
    show_pairwise('Brunner', brunnermunzel)
    show_pairwise('Whitney', mannwhitneyu)
    show_pairwise('Barlet', barlet_test)
    show_pairwise('Levene', levene_test)
    show_per_column('Shapiro', 'shapiro', shapiro)
    show_pairwise('T-Student', ttest_ind)
    show_per_column('Lilliefors', 'liliefors', lilliefors)
Esempio n. 25
0
def _kruskal_wallis_test(table, response_cols, factor_col, nan_policy='propagate'):
    """Run a Kruskal-Wallis test per response column, grouped by ``factor_col``.

    Parameters
    ----------
    table : pandas.DataFrame
    response_cols : list of column names to test
    factor_col : column whose values define the groups
    nan_policy : forwarded to ``kruskal`` ('propagate', 'raise' or 'omit')

    Returns
    -------
    dict
        ``{'result': result}``. ``result`` maps
        ``'<response_col>_<factor_col>'`` to a dict with 'Statistics' and
        'P value', and ``'_repr_brtc_'`` to the rendered markdown report.
    """
    result = dict()
    rb = BrtcReprBuilder()
    rb.addMD("""## Kruskal Wallis test Result""")

    groups = {name: group for name, group in table.groupby(factor_col)}

    group_name = []
    df = [len(groups) - 1] * len(response_cols)
    statistics = []
    pvals = []
    for response_col in response_cols:
        samples = [x[response_col] for x in groups.values()]
        # nan_policy used to be accepted but silently ignored.
        stat, pval = kruskal(*samples, nan_policy=nan_policy)
        group_name.append(response_col + ' by ' + factor_col)
        statistics.append(stat)
        pvals.append(pval)

        name = response_col + '_' + factor_col
        result[name] = dict()
        result[name]['Statistics'] = stat
        result[name]['P value'] = pval

    rb.addMD(strip_margin("""
    | {table}
    """.format(table=pandasDF2MD(pd.DataFrame({'': group_name,
                                                'Degree of Freedom': df,
                                                'Test Statistics': statistics,
                                                'P value': pvals})))))
    result['_repr_brtc_'] = rb.get()

    return {'result': result}
Esempio n. 26
0
def print_KruskalWallisH(div_calc):
    """
    Compute the Kruskal-Wallis H-test for independent samples and print the
    number of groups and the p-value.

    div_calc is a sequence of samples, one per group.
    """
    _, p = stats.kruskal(*div_calc)
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Kruskal-Wallis H-test for {} groups:".format(str(len(div_calc))))
    print("p-value: {}".format(p))
Esempio n. 27
0
def corr_categorical(Pred, Data_col, alpha, _Gaussian=False):
    """Test whether ``Pred`` differs between the categories of ``Data_col``.

    Parameters
    ----------
    Pred : pandas.Series
        Numeric values, indexed like ``Data_col``.
    Data_col : pandas.Series
        Categorical column; missing entries form their own category 'None'.
        The caller's Series is not modified.
    alpha : float
        Significance level for the Kruskal-Wallis test.
    _Gaussian : bool
        True selects the ANOVA path, which is not implemented: a notice is
        printed and 0 is returned.

    Returns
    -------
    int
        1 when the Kruskal-Wallis p-value is below ``alpha``, else 0.
    """
    # Work on a filled copy instead of mutating the caller's column in place.
    categories = Data_col.fillna(value='None')

    dic_tags = {}
    for tag in categories.unique():
        tag_index = categories.index[categories == tag].tolist()
        # .loc is the label-based replacement for the removed .ix accessor.
        dic_tags[tag] = Pred.loc[tag_index].values

    if _Gaussian:  # Normally distributed <--- ANOVA
        print('WIP : Anova test not implemented')
        return 0

    # Otherwise <--- Kruskal Wallis test
    kruskal_res = stats.kruskal(*dic_tags.values())
    return 1 if kruskal_res.pvalue < alpha else 0
    def select_features(self, data, labels):
        """
        Pick the five features (column indices) with the smallest Kruskal-Wallis p-values.
        The samples are split into three groups by group_by_classifier and every
        column is tested across those groups; a column whose test raises
        ValueError is given a p-value of 1.
        :param data: MxN matrix containing features as columns, and samples as rows
        :param labels: Mx1 matrix containing corresponding data labels
        :return: array of indices of the selected features, smallest p-value first
        """
        num_features = 5

        her2_samples, hr_samples, trip_neg_samples = group_by_classifier(data, labels)

        n_columns = data.shape[1]
        p_values = np.zeros((n_columns))
        for index in range(n_columns):
            try:
                per_group = (her2_samples[:, index],
                             hr_samples[:, index],
                             trip_neg_samples[:, index])
                p_values[index] = stats.kruskal(*per_group)[1]
            except ValueError:
                p_values[index] = 1

        # No multiple-testing correction: the features are ranked by raw
        # p-value and the first num_features are kept.
        ranked = np.argsort(p_values)
        return np.asarray(ranked[0:num_features])
Esempio n. 29
0
def summarize_moi(analytical_sets):
    """Compute ``calculate_moi`` per analytical set and compare the MOI samples.

    Returns ``(moi_sets, stats)``. ``moi_sets`` maps each set's label to its
    ``calculate_moi`` result. ``stats`` holds 'test' and 'stats' when at least
    two sets are present (rank-sum test for two, Kruskal-Wallis for more) and
    is empty otherwise.
    """
    moi_sets = {
        analytical_set.label: calculate_moi(analytical_set.allele_df)
        for analytical_set in analytical_sets
    }
    n_sets = len(moi_sets)

    # The data are not normally distributed, so only rank-based tests are used.
    stats = {}
    if n_sets >= 2:
        values = [moi.sample_dist['MOI'] for moi in moi_sets.values()]
        if n_sets == 2:
            # unpaired two-sample comparison
            stats['test'] = 'Wilcoxon ranksum / Mann-Whitney U-test'
            stats['stats'] = ranksums(*values)
        else:
            stats['test'] = 'Kruskal-Wallis H-test'
            stats['stats'] = kruskal(*values)

    return (moi_sets, stats)
Esempio n. 30
0
def kruskal_wallis(df, cat_col, num_col, notebook=True):
    """
    Perform kruskal wallis test between the selected columns of the given dataframe.
    Columns need to be continuous. Missing values in num_col are ignored.
    When the p-value is <= 0.05, Dunn's post-hoc test with Bonferroni
    correction is run on the rows where both columns are present.
    :param df: dataframe holding both columns
    :param cat_col: column whose values define the groups
    :param num_col: column holding the compared values
    :param notebook: when True, the test result (and the post-hoc table) is also printed
    :return: text report of the test and, if run, the post-hoc result
    """
    variables = []
    for _, cat_group in df.groupby(cat_col):
        # NAN values not included in computation
        group_values = cat_group[num_col]
        variables.append(group_values[group_values.notnull()])
    kruskal_h, kruskal_p = ss.kruskal(*variables)

    # The old check was `if print:`, which is always true; printing is meant
    # to be controlled by `notebook`.
    if notebook:
        print(f"H-Value: {kruskal_h}, p-Value: {kruskal_p}")
    output = "\tTest: Kruskal-Wallis\n"
    output += f"\tH-Value: {kruskal_h}, p-Value: {kruskal_p}\n"
    if kruskal_p <= 0.05:
        output += "\tSignificance found \n"
        output += "\tPost-Hoc Tests: Dunns with Bonferonni Correction\n"
        # Remove nan values
        selector = df[cat_col].notnull() & df[num_col].notnull()
        posthoc_data = df[selector]
        posthoc_result = sp.posthoc_dunn(posthoc_data,
                                         num_col,
                                         cat_col,
                                         p_adjust="bonferroni")
        if notebook:
            print(posthoc_result)
        output += str(posthoc_result)
        output += "\n"

    return output
Esempio n. 31
0
 def fit(self, X, y=None):
     """Compute the feature correlation matrix and the feature ranking.

     Parameters
     ----------
     X : pandas.DataFrame or array-like, shape (n_samples, n_features)
         Sample vectors. Non-DataFrame input is wrapped in a DataFrame.
     y : array-like, shape (n_samples,)
         Class of every sample. For ``score_func == 'h-score'`` it is
         interpreted as a two-class indicator (boolean or 0/1).

     Sets
     ----
     correlation_matrix_ : Pearson correlation matrix of the columns of X
     order : column labels sorted by decreasing score, when ``score_func`` is
         'f-score' (ANOVA F from f_classif) or 'h-score' (Kruskal-Wallis H
         between the two classes); any other ``score_func`` leaves it unset

     Returns
     -------
     self
     """
     # calculate correlation matrix
     if not isinstance(X, pd.DataFrame):
         X = pd.DataFrame(X)
     self.correlation_matrix_ = X.corr('pearson')

     # calculate the order of feature removal
     if self.score_func == 'f-score':
         F, pval = f_classif(X, y)
         index_arr = np.argsort(F)[::-1]
         self.order = X.columns[index_arr]
     elif self.score_func == 'h-score':
         # A positional boolean mask is required here: `~y` on integer 0/1
         # labels is a bitwise NOT (-1/-2), not the logical complement, and a
         # plain integer array would be read by .loc as row labels.
         in_class = np.asarray(y).astype(bool)
         h_stat = list()
         for col in X.columns:
             statistic, pvalue = kruskal(X.loc[in_class, col], X.loc[~in_class, col])
             h_stat.append(statistic)
         h_stat = np.asarray(h_stat)
         index_arr = np.argsort(h_stat)[::-1]
         self.order = X.columns[index_arr]
     return self
Esempio n. 32
0
def ttestForTwoChoiceQuestions(xValues, yValues):
	"""Compare two samples with a t-test if its assumptions hold, else Kruskal-Wallis.

	The independent-samples t-test is used only when both samples pass
	``isNormal`` and Levene's test does not reject equal variances
	(p >= 0.05). Otherwise the Kruskal-Wallis H-test is used.

	Returns a tuple ``(parametric, statistic, pvalue)`` where ``parametric``
	tells which test produced the other two values.
	"""
	npArrayX = np.array(xValues)
	npArrayY = np.array(yValues)

	# NOTE(review): isNormal is defined elsewhere; presumably based on
	# http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.normaltest.html#scipy.stats.normaltest
	xIsNormal = isNormal(npArrayX)
	yIsNormal = isNormal(npArrayY)

	parametric = False
	if xIsNormal and yIsNormal:
		# Levene test for equal variances
		# http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.levene.html#scipy.stats.levene
		l, lp = stats.levene(npArrayX, npArrayY)
		# The original `lp >- 0.05` parsed as `lp > -0.05`, which is always
		# true for a p-value; the intended check is "variances not
		# significantly different".
		parametric = bool(lp >= 0.05)

	if parametric:
		# if levene test comes out well and samples are normal, can use standard t-test for independent samples
		# http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html#scipy.stats.ttest_ind
		t, tp = stats.ttest_ind(xValues, yValues, axis=0)
	else:
		# if not, use Kruskal-Wallis H-test instead
		# http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.kruskal.html#scipy.stats.kruskal
		t, tp = stats.kruskal(npArrayX, npArrayY)
		# Ad-hoc rescaling kept from the original so H statistics are
		# displayed on a scale closer to the t statistics.
		t = t / 5.0 # these come out bigger than the t-test stats

	return parametric, t, tp
def generate_violion_plots(plot_col, group_col, group_order, ax):
    """Draw one violin per sufficiently large group and test for a difference.

    Groups from ``group_order`` with more than two non-null observations in
    ``plot_col`` are plotted on ``ax``. With exactly two such groups an
    independent t-test is run and the sample sizes required for the
    module-level ``alpha`` / ``power`` are added to the labels; otherwise a
    Kruskal-Wallis test is run and the size part of the label is empty.

    Returns ``(pval, ax)``.
    """
    kept_groups = []
    samples = []
    means = []
    spreads = []
    for label in group_order:
        selected = plot_col[group_col == label]
        observed = selected.dropna()
        if len(observed) <= 2:
            continue
        kept_groups.append(label)
        samples.append(observed.copy().values)
        means.append(selected.mean())
        spreads.append(selected.std())

    if len(samples) != 2:
        sizes = [''] * len(samples)
        _, pval = kruskal(*samples)
    else:
        # Effect size: absolute mean difference over the summed std devs.
        effect = abs(np.diff(means)) / (np.sum(spreads))
        ratio = len(samples[1]) / len(samples[0])
        required = tt_ind_solve_power(effect_size=effect, alpha=alpha,
                                      power=power, ratio=ratio)
        sizes = [str(int(required)), str(int(required * ratio))]
        _, pval = ttest_ind(*samples)

    labels = []
    for name, values, needed in zip(kept_groups, samples, sizes):
        labels.append('%s n=%i/%s' % (name, len(values), needed))
    violinplot(samples, ax=ax, labels=labels)
    return pval, ax
Esempio n. 34
0
def get_relevance(feat, y_class, relevance_func='mutual_info'):
    """Score every feature column of ``feat`` against the labels ``y_class``.

    Parameters
    ----------
    feat : array-like, shape (n_samples, n_features)
        Feature matrix; converted with ``np.array``.
    y_class : array-like, shape (n_samples,)
        Class label of every sample.
    relevance_func : str or callable
        ``'f_classif'``, ``'chi2'`` or ``'mutual_info'`` (scikit-learn
        scorers), ``'kruskal'`` (Kruskal-Wallis H statistic across the
        classes, NaN where the test cannot be computed), or a callable
        ``f(column, y_class) -> float`` applied to each column.

    Returns
    -------
    numpy.ndarray
        One relevance score per feature.

    Raises
    ------
    ValueError
        If ``relevance_func`` is a string that names no known scorer.
    """
    feat = np.array(feat)

    if not isinstance(relevance_func, str):
        relevance = np.zeros(feat.shape[1])
        for i in range(feat.shape[1]):
            relevance[i] = relevance_func(feat[:, i], y_class)
        return relevance

    # Imports are local to the branch that needs them so the SciPy and
    # callable paths do not require scikit-learn.
    if relevance_func == 'f_classif':
        from sklearn.feature_selection import f_classif
        relevance, _ = f_classif(feat, y_class)
    elif relevance_func == 'chi2':
        from sklearn.feature_selection import chi2
        relevance, _ = chi2(feat, y_class)
    elif relevance_func == 'mutual_info':
        from sklearn.feature_selection import mutual_info_classif
        relevance = mutual_info_classif(feat, y_class)
    elif relevance_func == 'kruskal':
        from scipy.stats import kruskal
        # An array is needed so that `labels == cls` is elementwise even
        # when the caller passes a plain list.
        labels = np.asarray(y_class)
        classes = np.unique(labels)
        relevance = np.zeros(feat.shape[1])
        for i, ft in enumerate(feat.T):
            try:
                relevance[i], _ = kruskal(
                    *[ft[labels == cls] for cls in classes])
            except ValueError:
                # e.g. all values identical, or fewer than two groups
                relevance[i] = np.nan
    else:
        raise ValueError(
            "unknown relevance_func %r; expected 'f_classif', 'chi2', "
            "'mutual_info', 'kruskal' or a callable" % (relevance_func,))

    return relevance
Esempio n. 35
0
def compare_conc_kruskal(odor):
    '''Kruskal-Wallis test across the three concentrations of one odor.

    Reads the module-level ``comp_sorted`` table and, separately for the
    'Control', 'Mint' and 'Hexanal' groups, compares the columns
    ``<odor>01``, ``<odor>05`` and ``<odor>10`` (NaNs omitted). The three
    test results are printed; nothing is returned.
    '''
    conc_cols = ['%s01' % odor, '%s05' % odor, '%s10' % odor]
    xdf = comp_sorted[['Group'] + conc_cols]

    results = []
    for title, group in (('Control group', 'Control'),
                         ('Mint group', 'Mint'),
                         ('Hexanal group', 'Hexanal')):
        rows = xdf[xdf['Group'] == group]
        res = kruskal(*[rows[col] for col in conc_cols], nan_policy='omit')
        results.append((title, res))

    # Single-argument print() calls behave identically on Python 2 and 3.
    for title, res in results:
        print(title)
        print(res)
Esempio n. 36
0
def KW_test_diversity(array1, array2, array3=None):
    """Run the Kruskal-Wallis test on two or three samples and print it.

    Input:
    - array1: The first numpy array
    - array2: The second numpy array
    - array3: Optional, the third numpy array
    Output:
    - Prints the test statistic with its corresponding P-value;
      returns nothing."""
    groups = [array1, array2]
    # Identity check: `array3 != None` is elementwise for numpy arrays and
    # cannot be used as a condition.
    if array3 is not None:
        groups.append(array3)
    print(stats.kruskal(*groups))
def study_stability(datas, stable_threshold):
    """Keep only the features that are stable across the three tables.

    For every column of ``datas["abs64"]`` a Kruskal-Wallis test compares
    that column in ``datas["abs64"]``, ``datas["abs128"]`` and
    ``datas["abs256"]``. A feature is kept when its p-value is strictly
    greater than ``stable_threshold``. Features for which the test cannot
    be computed are reported and discarded.

    ``datas`` is modified in place: the three tables are reduced to the
    stable columns.

    Returns ``(datas, stable, p_values)`` where ``stable`` is the list of
    kept column names and ``p_values`` maps every tested column to its
    p-value.
    """
    print(
        f"[INFO] 0. Analysis of stable features using Kruskal-Wallis test:\n Each features that shows a p-value below {stable_threshold} for stability Kruskal test will be discarded"
    )

    columns = datas["abs64"].columns
    stable = []
    p_values = {}
    for c in columns:
        print(f"[RUN] Running analysis for {c}")
        try:
            _, p = stats.kruskal(datas["abs64"][c], datas["abs128"][c],
                                 datas["abs256"][c])
        except (ValueError, KeyError) as err:
            # Test not computable (e.g. constant data) or column missing
            # from one of the tables: treat the feature as not stable.
            print(f"[SKIP] {c}: {err}")
            continue
        p_values[c] = p
        if p > stable_threshold:
            stable.append(c)

    print(
        f"[STABILITY] {len(stable)}/{len(columns)} features passed the stability test."
    )

    for key in ("abs64", "abs128", "abs256"):
        datas[key] = datas[key][stable]
    return datas, stable, p_values
Esempio n. 38
0
def calc_p_values(data, gt1_name, gt2_name,
                  stat_colname=None,
                  num_bins=50, bin_how='mean',
                  ):
    """Kruskal-Wallis p-value per time bin between two groups of flies.

    The index of ``data`` is cast to int64 and copied into a ``synced_ns``
    column (``data`` is modified in place). The index range of the
    ``gt1_name`` group is split into ``num_bins`` equal-width bins; within
    each bin ``stat_colname`` is reduced per ``FlyID`` with the mean or
    median, and the per-fly values of the two groups are compared.

    Parameters
    ----------
    data : DataFrame with columns ``group``, ``FlyID`` and ``stat_colname``.
    gt1_name, gt2_name : values of ``data.group`` selecting the two groups.
    stat_colname : name of the column to test (required).
    num_bins : number of bins.
    bin_how : ``'mean'`` or ``'median'``.

    Returns
    -------
    DataFrame with one row per bin (indexed by the bin label) holding the
    p-value, the first/last ``synced_ns`` of the first group in the bin,
    the group names and the number of flies per group. The p-value is 1.0
    when the test raises ``ValueError``.

    Raises
    ------
    ValueError
        If ``stat_colname`` is missing or ``bin_how`` is not supported.
    """
    if stat_colname is None:
        raise ValueError("you must explicitly set stat_colname (try 'maxWingAngle')")

    bin_funcs = {'mean': np.mean, 'median': np.median}
    if bin_how not in bin_funcs:
        raise ValueError("bin_how must be 'mean' or 'median', got %r" % (bin_how,))
    bin_func = bin_funcs[bin_how]

    data.index = data.index.astype(np.int64)  #LAZY DANNO. DROP TIMESTAMPS FOR BINNING.
    data['synced_ns'] = data.index

    wanted = ['FlyID', stat_colname, 'synced_ns']
    df_ctrl = data[data.group == gt1_name][wanted]
    df_exp = data[data.group == gt2_name][wanted]

    align_start = df_ctrl.index.min()
    dalign = df_ctrl.index.max() - align_start

    # Bins are right-closed, so a sample exactly at align_start is in no bin.
    bins = np.linspace(0,dalign,num_bins+1) + align_start
    binned_ctrl = pd.cut(df_ctrl.index, bins, labels= bins[:-1])
    binned_exp = pd.cut(df_exp.index, bins, labels= bins[:-1])

    def per_fly(frame):
        # One aggregated value per fly for the rows of a single bin.
        return np.array([bin_func(fly_group[stat_colname].values)
                         for _, fly_group in frame.groupby('FlyID')])

    rows = []
    # `.categories` replaces the removed `Categorical.levels` attribute.
    for x in binned_ctrl.categories:
        test1_full_dataset = df_ctrl[binned_ctrl == x]
        test2_full_dataset = df_exp[binned_exp == x]
        bin_start_time = test1_full_dataset['synced_ns'].min()
        bin_stop_time = test1_full_dataset['synced_ns'].max()

        test1 = per_fly(test1_full_dataset)
        test2 = per_fly(test2_full_dataset)

        try:
            hval, pval = kruskal(test1, test2)
        except ValueError:
            pval = 1.0

        rows.append(DataFrame({'Bin_number': x,
                               'P': pval,
                               'bin_start_time':bin_start_time,
                               'bin_stop_time':bin_stop_time,
                               'name1':gt1_name,
                               'name2':gt2_name,
                               'test1_n':len(test1),
                               'test2_n':len(test2),
                               }, index=[x]))

    if not rows:
        return DataFrame()
    return pd.concat(rows)
Esempio n. 39
0
def kruskal_wallis(data):
    """
    Kruskal-Wallis H-test: non-parametric comparison of several
    independent samples.

    ``data`` is an iterable of samples, one per group. Returns the tuple
    ``(H, pval)``.
    """
    statistic, p_value = st.kruskal(*data)
    return statistic, p_value
Esempio n. 40
0
File: anova.py Progetto: gmat/emzed2
def kruskalWallisOnTables(tableSet1, tableSet2, idColumn, valueColumn):
    """
       Runs ``_runStatistcsOnTables`` on the two table sets with the
       p-value of the non parametric kruskal wallis test as statistic,
       and titles the result "KRUSKAL WALLIS ANALYSIS".
    """
    def _kruskal_p_value(s1, s2):
        # Only the p-value (second element of the test result) is kept.
        return kruskal(s1, s2)[1]

    result = _runStatistcsOnTables(tableSet1, tableSet2, idColumn,
                                   valueColumn, _kruskal_p_value)
    result.title = "KRUSKAL WALLIS ANALYSIS"
    return result
Esempio n. 41
0
def _evalstat(x, bsl, meth, n_perm, metric, maxstat, tail):
    """Statistical evaluation of features against a baseline.

    [x] = [xn] = (nFce, npts, nTrials)
    [bsl] = (nFce, nTrials)

    ``meth`` selects the test: 'permutation', 'wilcoxon' or 'kruskal'.
    Any other value leaves the p-values at their initial value of 1.

    Returns an array of p-values of shape (nFce, npts).

    NOTE(review): ``metric`` is not used in this function. The
    'permutation' branch refers to ``a``, ``b``, ``xn``, ``norm``,
    ``normalize`` and ``perm_swaparray``, none of which are defined in this
    function; unless they exist at module level that branch raises
    NameError — confirm before relying on it.
    """
    # Get shape of x :
    nf, npts, nt = x.shape
    # Default: nothing significant
    pvalues = np.ones((nf, npts))

    # Permutations :
    if meth == 'permutation':
        # NOTE(review): `a` and `b` are not parameters of this function and
        # this result is overwritten a few lines below; looks like leftover
        # code — confirm.
        perm = perm_swaparray(a, b, n_perm=200, axis=-1, rndstate=0)
        from brainpipe.xPOO.stats import permutation
        # Pre-define permutations :
        pObj = permutation(n_perm)
        perm = np.zeros((n_perm, nf, npts))
        # For each permutation :
        for p in range(n_perm):
            # Get 1D iterations :
            ite = product(range(nf), range(npts))
            # One shuffling order shared by every (feature, point) pair of
            # this permutation :
            permT = np.random.permutation(2*nt)
            for f, pts in ite:
                # bs and xs are not used afterwards
                bs, xs = bsl[f, :], x[f, pts, :]
                # Reshape data : baseline and signal trials in one vector
                subX = np.vstack((bsl[f, :], x[f, pts, :])).reshape(2*nt,)
                # Shuffle data :
                subX = subX[permT].reshape(nt, 2)
                # Normalize data :
                # NOTE(review): `norm` is undefined here; possibly meant to
                # be the `metric` argument — confirm.
                subX = normalize(subX[:, 0], subX[:, 1], norm=norm)
                # Get mean of data :
                perm[p, f, pts] = np.mean(subX)
        # Get final pvalues :
        # NOTE(review): `xn` is undefined here; the docstring suggests it
        # has the shape of `x` — confirm.
        pvalues = pObj.perm2p(np.mean(xn, 2), perm, tail=tail,
                              maxstat=maxstat)

    # Wilcoxon test :
    elif meth == 'wilcoxon':
        from scipy.stats import wilcoxon
        # Get iterations :
        ite = product(range(nf), range(npts))
        # Compute wilcoxon : trials of each point against the baseline
        # trials of the same feature
        for k, i in ite:
            _, pvalues[k, i] = wilcoxon(x[k, i, :], bsl[k, :])

    # Kruskal-Wallis :
    elif meth == 'kruskal':
        from scipy.stats import kruskal
        # Get iterations :
        ite = product(range(nf), range(npts))
        # Compute Kruskal-Wallis : same pairing as the wilcoxon branch
        for k, i in ite:
            _, pvalues[k, i] = kruskal(x[k, i, :], bsl[k, :])

    return pvalues
Esempio n. 42
0
def calc_p_values(_data,
                  stat_colname=None,
                  num_bins=50, bin_how='mean',
                  ):
    """Kruskal-Wallis p-value of every time bin against the baseline.

    The baseline is ``stat_colname`` for all rows with ``Time < 10.0``.
    The ``Time`` range is split into ``num_bins`` equal-width bins and the
    values of each bin are compared with the baseline.

    ``_data`` gets its index replaced by its ``Time`` column (in place).

    Parameters
    ----------
    _data : DataFrame with a ``Time`` column and the column ``stat_colname``.
    stat_colname : name of the column to test (required).
    num_bins : number of bins.
    bin_how : accepted for interface compatibility; not used by this
        function.

    Returns
    -------
    DataFrame with one row per bin (indexed by the bin label, which is the
    left bin edge) holding the bin number, p-value, bin start, last
    ``Time`` in the bin and sample sizes. The p-value is 1.0 when the test
    raises ``ValueError``.
    """
    if stat_colname is None:
        raise ValueError("you must explicitly set stat_colname (try 'maxWingAngle')")

    _data.index = _data.Time  #LAZY DANNO. DROP TIMESTAMPS FOR BINNING.
    # The index now equals Time, so sorting by index orders by Time. This
    # replaces the removed DataFrame.sort and avoids the ambiguity of a
    # name that is both an index level and a column.
    _data = _data.sort_index()
    df_baseline = _data[_data['Time'] < 10.0]
    align_start = _data.Time.min()
    dalign = _data.Time.max() - align_start

    # Bins are right-closed, so a sample exactly at align_start is in no bin.
    bins = np.linspace(0,dalign,num_bins+1) + align_start
    binned_data = pd.cut(_data.index, bins, labels= bins[:-1])

    baseline = df_baseline[stat_colname].values
    rows = []
    # `.categories` replaces the removed `Categorical.levels` attribute.
    for bin_number, x in enumerate(binned_data.categories):
        in_bin = binned_data == x
        test = np.array(_data.loc[in_bin, stat_colname])
        bin_start_time = x
        bin_stop_time = _data.loc[in_bin, 'Time'].max()

        try:
            hval, pval = kruskal(baseline, test)
        except ValueError:
            pval = 1.0

        rows.append(DataFrame({'Bin_number': bin_number,
                               'P': pval,
                               'bin_start_time':bin_start_time,
                               'bin_stop_time':bin_stop_time,
                               'name1':'baseline',
                               'name2':stat_colname,
                               'test1_n':len(baseline),
                               'test2_n':len(test),
                               }, index=[x]))

    if not rows:
        return DataFrame()
    return pd.concat(rows)
def feature_kw(feature, data):
    """Kruskal-Wallis test of one feature across all groups in ``data``.

    ``data`` maps a group name to a table; ``group[feature]`` is the sample
    of that group. All groups are compared (at least two are required by
    the test). The median and standard deviation of every group (both
    multiplied by 1e3) and the test result are printed.

    Returns the list of ``(group name, sample)`` pairs.
    """
    feature_list = [(key, group[feature]) for key, group in data.items()]
    # Unpack every group instead of indexing a fixed number of them.
    h, p = stats.kruskal(*[values for _, values in feature_list])

    print ('Kruskal-Wallace: %s' % feature)
    print ('=============')
    for name, values in feature_list:
        print ('%s: %.3f +- %.3f' % (name, np.median(values) * 1e3, np.std(values) * 1e3))
    print ('H value: %.3f' % h)
    print ('P value: %.5f \n' % p)
    return feature_list
Esempio n. 44
0
def testRelationCorrectIncorrect():
    """Kruskal-Wallis test of ``correct[i]`` against ``incorrect[i]`` for every i.

    Reads the module-level sequences ``correct`` and ``incorrect``.

    Returns ``(P, P_discret)``: the p-values and their significance class
    (3 for p < 0.01, 2 for p < 0.05, 1 for p < 0.1, otherwise 0).
    """
    n = len(correct)
    P = np.zeros(n)
    for i in range(n):
        #KS, p = stats.ks_2samp(correct[i], incorrect[i])
        _, P[i] = stats.kruskal(correct[i], incorrect[i])

    # Successively stricter thresholds overwrite the weaker class, so a
    # p-value lying exactly on a boundary gets the class just below it
    # instead of matching no interval at all.
    P_discret = np.zeros(n)
    P_discret[P < 0.1] = 1
    P_discret[P < 0.05] = 2
    P_discret[P < 0.01] = 3
    return P, P_discret
def anova(x, y):
    """One-way ANOVA and Kruskal-Wallis test of ``y`` grouped by ``x``.

    Returns ``[F, p_anova, H, p_kruskal]``. With fewer than two distinct
    values in ``x`` the tuple ``(0, 0, 0, 0)`` is returned; when the
    Kruskal-Wallis test raises ``ValueError`` its two entries are 0.
    """
    by_level = defaultdict(list)
    for level, observation in zip(x, y):
        by_level[level].append(observation)

    samples = by_level.values()
    if len(samples) < 2:
        return (0, 0, 0, 0)

    parametric = list(f_oneway(*samples))
    try:
        rank_based = list(kruskal(*samples))
    except ValueError:  # when all numbers are identical
        rank_based = [0, 0]
    return parametric + rank_based
Esempio n. 46
0
def snr(M, list1, list2, threshold = None, significance = False):
    """

    Performs a signal-to-noise ratio test on M, assuming samples are in rows and genes are in columns

        list1       - List of row indices for first group
        list2       - List of row indices for second group
        threshold   - Minimum SNR ratio to report
        significance - Run Kruskal-Wallis test (requires scipy)

    The ratio of a gene is |mean1 - mean2| / (std1 + std2); it is 0 when both
    standard deviations are 0.

    Returns a reverse-ordered list of (ratio, index, mean1, mean2, pvalue) tuples, where index is the column index of the gene,
    and mean1 and mean2 correspond to the mean for that particular gene in list1 and list2, respectively.  pvalue is an empty
    string if significance is False.

    If significance is true (and the module-level PVAL flag is set) a pvalue will be assigned. Beware this increases processing
    time significantly.

    """

    ratios = []

    N1 = M.take(tuple(list1), 0)
    N2 = M.take(tuple(list2), 0)

    N1mean, N2mean = N1.mean(0), N2.mean(0)
    means = numpy.abs(N1mean - N2mean)
    stds  = N1.std(0) + N2.std(0)

    if stds.all():
        rats = means / stds
    else:
        # Guard against division by zero: those genes keep a ratio of 0.
        rats = numpy.zeros((len(means),), dtype=numpy.float32)
        for i in range(len(stds)):
            if stds[i]:
                rats[i] = means[i] / stds[i]

    for i in range(M.shape[1]):

        rat = rats[i]
        mean1, mean2 = N1mean[i], N2mean[i]

        if threshold is None or rat >= threshold:

            # `significance` is checked first so PVAL is only consulted
            # when a p-value was actually requested.
            if significance and PVAL:
                pval = st.kruskal(N1[:,i], N2[:,i])[1]
            else:
                pval = ''

            ratios.append( (rat, i, mean1, mean2, pval) )

    ratios.sort(reverse=True)

    return ratios
Esempio n. 47
0
def kruskal_p(hit_vec, response_vec, min_size=5):
    '''
    Kruskal-Wallis p-value of a measurement grouped by a label Series
    ------------------------------------------------
    hit_vec: Series of labels
    response_vec: Series of measurements
    min_size: accepted for interface compatibility; not used here

    The two Series are first aligned with match_series. Returns the
    p-value, or nan when the test cannot be computed for any reason.
    '''
    try:
        hit_vec, response_vec = match_series(hit_vec, response_vec)
        groups = [response_vec[hit_vec == num] for num in hit_vec.unique()]
        return kruskal(*groups)[1]
    except Exception:
        # Best effort by design, but KeyboardInterrupt/SystemExit must
        # still propagate, hence no bare except.
        return nan
Esempio n. 48
0
    def ANM_causation_score(self,train_size=0.5,independence_criterion='HSIC',metric='linear',regression_method='GP'):
        '''
            Measure how likely a given causal direction is true

            A regression of Y on X is fitted on a training split; the
            score is the p-value of an independence test between the
            test-split residuals and the test-split X.

            Parameters
            ----------
            train_size :
                Fraction of given data used to training phase

            independence_criterion :
                kruskal for Kruskal-Wallis H-test,
                HSIC for Hilbert-Schmidt Independence Criterion

            metric :
                linear, sigmoid, rbf, poly
                kernel function to compute gramm matrix for HSIC
                (only used when independence_criterion is HSIC)
                gaussian kernel is used in :
                Nonlinear causal discovery with additive noise models
                Patrik O. Hoyer et. al

            regression_method :
                GP for a pyGPs Gaussian-process regression, anything
                else for a KernelRidge regression with sigmoid kernel

            Returns
            -------
            dict with the single key causal_strength holding the p-value
            of the selected independence test

            Raises
            ------
            KeyError
                If independence_criterion is neither kruskal nor HSIC
        '''
        # Fail before the expensive fit; same exception type the former
        # dictionary lookup produced.
        if independence_criterion not in ('kruskal', 'HSIC'):
            raise KeyError(independence_criterion)

        Xtrain, Xtest , Ytrain, Ytest = train_test_split(self.X, self.Y, train_size = train_size)
        if regression_method == 'GP':
            _gp = pyGPs.GPR()      # specify model (GP regression)
            _gp.getPosterior(Xtrain, Ytrain) # fit default model (mean zero & rbf kernel) with data
            _gp.optimize(Xtrain, Ytrain)     # optimize hyperparamters (default optimizer: single run minimize)

            # Forward case: only the predictive mean is needed
            ym, ys2, fm, fs2, lp = _gp.predict(Xtest)
            errors_forward = ym - Ytest
        else:
            _gp = KernelRidge(kernel='sigmoid')
            _gp.fit(Xtrain, Ytrain)
            errors_forward = _gp.predict(Xtest) - Ytest

        # Independence score: evaluate only the requested test instead of
        # computing both and discarding one.
        if independence_criterion == 'kruskal':
            forward_indep_pval = kruskal(errors_forward,Xtest)[1]
        else:
            forward_indep_pval = self.HilbertSchmidtNormIC(errors_forward,Xtest,metric=metric)[1]

        return {'causal_strength':forward_indep_pval}
Esempio n. 49
0
def kruskal_pandas(hit_vec, response_vec, min_size=5):
    '''
    Kruskal-Wallis test of a measurement grouped by a label Series
    ------------------------------------------------
    hit_vec: Series of labels
    response_vec: Series of measurements
    min_size: accepted for interface compatibility; not used here

    The two Series are first aligned with _match_series. Returns a float
    Series indexed by 'H' and 'p'; both entries are NaN when the test
    cannot be computed for any reason.
    '''
    try:
        hit_vec, response_vec = _match_series(hit_vec, response_vec)
        groups = [response_vec[hit_vec == num] for num in hit_vec.unique()]
        h, p = stats.kruskal(*groups)
        return pd.Series([h, p], index=['H','p'])
    except Exception:
        # Explicit NaNs and dtype: a Series built from an index alone has
        # a version-dependent dtype.
        return pd.Series([float('nan')] * 2, index=['H','p'], dtype=float)
Esempio n. 50
0
def plot_stats(groupedData, fig_prefix, cutoff='baseline', **kwargs):
    """ Scatter-plot -log10(p) per time bin for every group and save the figure.

        groupedData = output from flymad_jaaba_v6.py (rawdata_**s.pickle), with synced_time column, grouped by 'group'.
        fig_prefix = full path and filename (without extension) of plot name.
        cutoff = 'baseline': Kruskal-Wallis test of each time bin (0 < synced_time <= 360)
                             against all samples with synced_time <= 0;
                 'zero': one-sample t-test of each time bin against 0.
        **kwargs = accepted but not used.

        Relies on the module-level names parameter, args.binsize and colourlist.
        NaN p-values are dropped. A dashed line marks 0.05 divided by the number
        of p-values of the last group. The figure is written as
        <fig_prefix>_<parameter>.png and .svg.

        Returns the {time: p-value} dict of the last group (empty if there
        are no groups).
    """

    fig = plt.figure(figsize=(4,3))
    ax = fig.add_subplot(111)

    # list() so that .index works on Python 3 dict views as well.
    group_names = list(groupedData.groups.keys())
    pvalue_results = {}

    for GROUP, data in groupedData:
        colour = colourlist[group_names.index(GROUP)]
        pvalue_results = {}
        tested = data[(data.synced_time > 0) & (data.synced_time <= 360)]
        if cutoff == 'baseline':
            ax.set_title('Kruskal Wallis: '+parameter+' vs baseline:', fontsize=12)
            baseline = data[data.synced_time <=0][parameter].values
            for time, _data in tested.groupby('synced_time'):
                pvalue_results[time*args.binsize] = st.kruskal(baseline, _data[parameter])[1]
        elif cutoff == 'zero':
            ax.set_title('Kruskal Wallis: '+parameter+' vs zero:', fontsize=12)
            for time, _data in tested.groupby('synced_time'):
                pvalue_results[time*args.binsize] = st.ttest_1samp(_data[parameter], 0)[1]

        pvalue_results = {k: v for k, v in pvalue_results.items() if not isnan(v)}

        ax.scatter(list(pvalue_results.keys()),
                   -np.log10(list(pvalue_results.values())),
                   label=GROUP, color=colour, linewidth=0)

    if len(pvalue_results)>=1:
        # Bonferroni-style threshold line
        n_comparisons = len(pvalue_results)
        ax.axhline( -np.log10(0.05/n_comparisons), color='k', lw=0.5, linestyle='--' )

    ax.set_xlim(0,360)
    ax.set_ylim(0,8)#1.1*max(-np.log10(pvalue_results.values())))
    ax.set_xlabel('Time (s)', fontsize=12)
    ax.set_ylabel('-Log10(P)', fontsize=12)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')
    l = plt.legend()
    l.set_zorder(1000)
    plt.tick_params(axis='both', which='major', labelsize=12)
    plt.tight_layout()
    for ext in ['.png','.svg']:
        fig_fname = fig_prefix + '_'+parameter + ext
        # savefig's keyword for tight cropping is bbox_inches, not bbox.
        fig.savefig(fig_fname, bbox_inches='tight')
        print('saved %s' % fig_fname)

    return pvalue_results
Esempio n. 51
0
def print_KruskalWallisH(div_calc):
    """
    Compute the Kruskal-Wallis H-test for independent samples. A typical rule is that
    each group must have at least 5 measurements.

    ``div_calc`` maps a group name to a mapping whose values are the
    measurements of that group. The H statistic and p-value are printed.

    Returns None on success, or an error message string when ``div_calc``
    does not have the expected mapping-of-mappings structure.
    """
    calc = defaultdict(list)
    try:
        for k1, v1 in div_calc.items():
            for v2 in v1.values():
                calc[k1].append(v2)
    except (AttributeError, TypeError):
        return "Error setting up input arrays for Kruskal-Wallis H-Test. Skipping "\
               "significance testing."
    h, p = stats.kruskal(*calc.values())
    print("\nKruskal-Wallis H-test statistic for {} groups: {}".format(str(len(div_calc)), h))
    print("p-value: {}".format(p))
Esempio n. 52
0
def show_drinking_behavior(basepath=None,compare_distributions=True,
	visualize_one_random_actor=False, visualize_all_actors=True):
	"""Plot the contents of ``<basepath>/drinking-behavior.txt``.

	The file is loaded as a 2-D array (delimiter ``TAB``). Depending on the
	flags, up to three PNG figures are written into ``basepath``:

	- compare_distributions: overlaid histograms of the ``INITIAL`` and
	  ``END`` columns (a Kruskal-Wallis test of the two columns is computed
	  but its result is not reported);
	- visualize_one_random_actor: the row of one randomly chosen actor;
	- visualize_all_actors: the whole array as an image.

	``basepath`` must be given; the default of None is not usable.
	"""
	# NOTE(review): 'responders' is loaded but never used below.
	agents = np.loadtxt(os.path.join(basepath,'responders'),delimiter=TAB)
	filename = os.path.join(basepath,'drinking-behavior.txt')
	drinking_behavior = np.loadtxt(filename,delimiter=TAB)

	if compare_distributions:
		fig = plt.figure()
		ax = fig.add_subplot(111)
		H,p = kruskal(drinking_behavior[:,INITIAL],drinking_behavior[:,END])

		initial_distribution = drinking_behavior[:,INITIAL]
		final_distribution = drinking_behavior[:,END]

		# Common range so both histograms share the same bins
		low = min(initial_distribution.min(),final_distribution.min())
		high = max(initial_distribution.max(),final_distribution.max())

		ax.hist(initial_distribution,color='r',alpha=0.5,bins=20,label='Initial',range=(low,high))
		ax.hist(final_distribution,color='k',alpha=0.5,bins=20,label='Final', range=(low,high))
		artist.adjust_spines(ax)
		ax.set_xlabel(artist.format('Intent to drink'))
		ax.set_ylabel(artist.format('Prevalence'))
		plt.legend(frameon=False)
		filename = os.path.join(os.getcwd(),basepath,'drinking-behavior-compare-distributions.png')
		plt.savefig(filename,dpi=300)

	if visualize_one_random_actor:
		fig = plt.figure()
		ax = fig.add_subplot(111)
		# range works on Python 2 and 3; xrange exists only on Python 2
		random_actor = random.choice(range(drinking_behavior.shape[0]))
		ax.plot(drinking_behavior[random_actor,:],'k--',linewidth=2)
		artist.adjust_spines(ax)
		ax.set_ylabel(artist.format('Past drinking behavior'))
		ax.set_xlabel(artist.format('Time'))
		filename = os.path.join(os.getcwd(),basepath,'drinking-behavior-visualize-actor.png')
		plt.savefig(filename,dpi=300)

	if visualize_all_actors:
		fig = plt.figure()
		ax = fig.add_subplot(111)
		cax = ax.imshow(drinking_behavior,interpolation='nearest',aspect='auto')
		artist.adjust_spines(ax)
		ax.set_ylabel(artist.format('Actor'))
		ax.set_xlabel(artist.format('Time'))
		plt.colorbar(cax)
		filename = os.path.join(os.getcwd(),basepath,'drinking-behavior-visualize-all-actors.png')
		plt.savefig(filename,dpi=300)
Esempio n. 53
0
def stats_pairwise(_dataset, _column, _within, _between):
    """Pairwise Kruskal-Wallis tests between groups, per level of ``_within``.

    For every distinct value of ``_dataset[_within]`` the rows are grouped
    by ``_between`` and ``_column`` is compared for every pair of groups
    produced by ``get_pairs``.

    Returns a DataFrame with the columns 'within', 'between' (the pair of
    group names), 'measure' (``_column``) and 'p'.
    """
    collected = pd.DataFrame()
    for level in list(set(_dataset[_within])):
        subset = _dataset[_dataset[_within] == level]
        grouped = subset.groupby(_between)

        names = []
        samples = []
        for name, values in grouped[_column]:
            names.append(name)
            samples.append(values)

        compared = []
        p_values = []
        for pair in get_pairs(range(len(samples))):
            first, second = pair[0], pair[1]
            _, p_value = ss.kruskal(samples[first], samples[second])
            p_values.append(p_value)
            compared.append((names[first], names[second]))

        level_frame = pd.DataFrame({'within':level, 'between':compared,
                                    'measure':_column, 'p':p_values})
        collected = pd.concat([collected, level_frame], axis=0)
    return collected
def calc_kruskal(x, sample_num_l, alpha):
	"""Kruskal-Wallis test on one row, with pairwise post-hoc tests.

	``x`` is a list whose first element is an id and whose remaining
	elements are split into groups by ``split_list(x[1:], sample_num_l)``.

	Returns ``x`` extended by two strings: the p-value and a flag. The flag
	is '1' only when the Kruskal-Wallis p-value is below ``alpha`` and
	every pairwise Mann-Whitney comparison is rejected after
	Benjamini-Hochberg correction. When the test cannot be computed the
	two strings are '1.00' and '0'.
	"""
	tmp_input_l = split_list(x[1:],sample_num_l) #ignore id column

	try:
		h,p = stats.kruskal(*tmp_input_l) #run kruskal-wallis test
	except ValueError:
		return x+['1.00','0']

	if math.isnan(p) :
		return x+['1.00','0']

	# float() keeps the text form independent of how numpy scalars repr.
	p_text = repr(float(p))

	if not p < alpha :
		return x+[p_text,'0']

	num = len(sample_num_l)
	pval_l = []
	for i in range(num-1):
		for j in range(i+1, num):
			try:
				# Ask for the two-sided p-value explicitly instead of
				# doubling the result: current SciPy versions return a
				# two-sided value by default.
				tmp_u, tmp_p = stats.mannwhitneyu(tmp_input_l[i],tmp_input_l[j],
					alternative='two-sided')
			except ValueError :
				tmp_p = 1.0
			pval_l.append(tmp_p)

	rej = smm.multipletests(pval_l, alpha=alpha, method='fdr_bh')[0] # fdr correction

	# Flag only when every pairwise comparison is significant.
	flag = 1 if all(rej) else 0

	return x+[p_text,str(flag)]
def KruskalWallis(data):
    '''Kruskal-Wallis test of 'weight' between the three values of 'group'.

    data must offer the columns 'weight' and 'group', with the group
    values 'TreatmentA', 'TreatmentB' and 'Control'. The p-value is
    printed; nothing is returned.
    '''

    print('\n Kruskal-Wallis test ----------------------------------------------------')

    weights = data['weight']
    labels = data['group']

    # One sample per group, selected with a boolean mask
    treatment_a = weights[labels == 'TreatmentA']
    treatment_b = weights[labels == 'TreatmentB']
    control = weights[labels == 'Control']

    result = stats.kruskal(control, treatment_a, treatment_b)
    p = result[1]
    print('Result from Kruskal-Wallis test: p = {0}'.format(p))
def KruskalWallis(data):
    """Print the Kruskal-Wallis p-value for "weight" across the groups.

    The samples are the "weight" values of the rows whose "group" is
    "Control", "TreatmentA" and "TreatmentB". Nothing is returned.
    """

    def weights_of(label):
        # "weight" values of the rows belonging to one group
        return data["weight"][data["group"] == label]

    print("\n Kruskal-Wallis test ----------------------------------------------------")

    g_a = weights_of("TreatmentA")
    g_b = weights_of("TreatmentB")
    g_c = weights_of("Control")

    _, p = stats.kruskal(g_c, g_a, g_b)
    print("Result from Kruskal-Wallis test: p = {0}".format(p))
Esempio n. 57
0
def compare_feature_groups(fg1,fg2,variance=False,name='Comparison'):
	"""Compare two samples with three tests and print their p-values.

	Runs an independent t-test (``equal_var=variance``), a Kruskal-Wallis
	test and a Wilcoxon rank-sum test on ``fg1`` and ``fg2`` and prints a
	small table titled with ``name``.

	Returns True when the t-test p-value is greater than 0.05.
	"""
	ttest = stats.ttest_ind(fg1,fg2,equal_var = variance)
	ktest = stats.kruskal(fg1,fg2)
	rktest = stats.ranksums(fg1,fg2)

	temp = '''

Stats Comparsion [{1}]
----------------------------------------------
Tests 	  |	P-Value
----------------------------------------------
Student-T   |    {0}
Kruskal     |    {2}
RankSum     |    {3}


	'''
	# print() with a single argument behaves the same on Python 2 and 3.
	print(temp.format(ttest[1],name,ktest[1],rktest[1]))
	return ttest[1] > 0.05
Esempio n. 58
0
def non_para(data,var,cat,method='Wilcoxon'):
    """Pairwise non-parametric comparison of ``var`` between the categories of ``cat``.

    ``data[var]`` and ``data[cat]`` must support boolean-mask selection and
    ``.median()`` (e.g. a pandas DataFrame).

    There are two method options: 'Wilcoxon' (rank-sum test) and 'Kruskal'.

    Returns a dict with two square DataFrames indexed by category:
    'p' holds the p-value of the test rounded to 3 decimals, 'med_diff'
    holds the difference between the medians of the two categories (row
    minus column). For an unknown ``method`` a message is printed and None
    is returned.
    """
    tests = {'Wilcoxon': stats.ranksums, 'Kruskal': stats.kruskal}
    if method not in tests:
        print('No such method')
        return
    test = tests[method]

    cats = list(set(data[cat]))
    # Slice every category once instead of once per matrix cell.
    samples = [data[var][data[cat]==c] for c in cats]

    p_value = np.zeros((len(cats),len(cats)))
    diff = np.zeros((len(cats),len(cats)))
    for i1, s1 in enumerate(samples):
        for i2, s2 in enumerate(samples):
            p_value[i1,i2] = round(test(s1,s2)[1],3)
            diff[i1,i2] = s1.median()-s2.median()

    p_value = pd.DataFrame(p_value,index=cats,columns=cats)
    diff = pd.DataFrame(diff,index=cats,columns=cats)
    result = {'p':p_value,'med_diff':diff}
    return result