import numpy as np
from scipy.stats import mstats

def anova(dataname, nparray1, nparray2):
    if nparray1.ndim > 1:
        # collapse each row to its mean, then compare the two groups
        H, pval = mstats.kruskalwallis(np.nanmean(nparray1, axis=1),
                                       np.nanmean(nparray2, axis=1))
        print("anova:", dataname, ": Mean of array wt, mut, H-stat, P-value:",
              np.nanmean(np.nanmean(nparray1, axis=1)), np.nanmean(np.nanmean(nparray2, axis=1)), H, pval)
    else:
        H, pval = mstats.kruskalwallis(nparray1, nparray2)
        print("anova:", dataname, ": Mean of array wt, mut, H-stat, P-value:",
              np.nanmean(nparray1), np.nanmean(nparray2), H, pval)
Example #2
def kruskal_wallis(field, coerce=None):
    h1, p1 = kruskalwallis(*group_by_sentiment(field, coerce).values())
    h2, p2 = kruskalwallis(*group_by_nps(field, coerce).values())
    print('\nKruskal-Wallis H-Test on %s:' % field)
    print('  - When grouped by ordinal value:')
    print('    - %s' % significant(p1))
    print('    - H = %s' % h1)
    print('    - p = %s' % p1)
    print('  - When grouped by Net Promoter Score:')
    print('    - %s' % significant(p2))
    print('    - H = %s' % h2)
    print('    - p = %s' % p2)
Example #3
    def anova(self, min_mean_expr=None):
        """
        carry out non-parametric ANOVA across the groups of self.

        :param min_mean_expr: minimum average gene expression value that must be reached
          in at least one cluster for the gene to be considered
        :return: FDR-corrected p-values (a pd.Series if self.index is set, else an ndarray)
        """
        if self._anova is not None:
            return self._anova

        # run anova
        f = lambda v: kruskalwallis(*np.split(v, self.split_indices))[1]
        pvals = np.apply_along_axis(
            f, 0, self.data)  # todo could shunt to a multiprocessing pool

        # correct the pvals
        _, pval_corrected, _, _ = multipletests(pvals,
                                                self.alpha,
                                                method='fdr_tsbh')

        # store data & return
        if self.index is not None:
            self._anova = pd.Series(pval_corrected, index=self.index)
        else:
            self._anova = pval_corrected
        return self._anova
    def run(self):
        matrix = self.dataset.matrix.transpose()  # we want to compare the genes
        p_values = []
        h_statistics = []
        classes = np.unique(self.dataset.labels)
        if len(classes) != 3:
            raise Exception("This implementation is for 3 classes.")

        for line in matrix.tolist():
            # divide the gene's values into 3 samples, one per class
            sample1 = [
                line[i] for i in range(len(line))
                if self.dataset.labels[i] == classes[0]
            ]
            sample2 = [
                line[i] for i in range(len(line))
                if self.dataset.labels[i] == classes[1]
            ]
            sample3 = [
                line[i] for i in range(len(line))
                if self.dataset.labels[i] == classes[2]
            ]

            h, p_value = mstats.kruskalwallis(np.array(sample1),
                                              np.array(sample2),
                                              np.array(sample3))
            p_values.append(p_value)
            h_statistics.append(h)
        return h_statistics, p_values
def plot_var_dist(plotargs, kkey, kw_xy=(20,20), color="muted"):

    f, ax = plt.subplots(1,1, figsize=(12, 4), sharex=True)
    cpalette = sb.color_palette(color)

    for arg in plotargs:
        df, label, color_num = arg
        color = cpalette[color_num]
        # Summary stats:
        mean = df[kkey].mean()
        med  = df[kkey].median()
        std  = df[kkey].std()
        skew = df[kkey].skew()
        stat = u"\nμ={:0.2f}   med={:0.2f}\nσ={:0.2f}   N={}".format(
                mean, med, std, len(df))
        label += stat

        yvals, xvals, patches = plt.hist(df[kkey].tolist(), bins=100, label=label,
                                color=color, alpha=0.6, histtype='stepfilled')

    H, prob = kruskalwallis(*[x[0][kkey] for x in plotargs])
    # U, prob = mannwhitneyu(*[x[0][kkey] for x in plotargs])
    ax.annotate("Kruskal-Wallis:\nH={H:.2f}\nprob={p:.3f}".format(H=H, p=prob),
                xy=(kw_xy[0], kw_xy[1]))
    plt.ylabel("Frequency")
    plt.legend()
def sb_distplots(plotargs, return_key='close_return', update_type='Revisions'):
    "Plots conditional underpricing distributions. Run set_data(df) first."

    f, ax = plt.subplots(1,1,figsize=(16, 5), sharex=True)
    for arg in plotargs:
        df, c, l, h = arg

        sb.distplot(df[return_key], ax=ax,
            kde_kws={"label": l + "    Obs={N}".format(N=len(df)), "color": c},
            hist_kws={"histtype": "stepfilled", "color": c})

        r = df[return_key]
        m,s,y,med = r.mean(), r.std(), r.skew(), r.median()
        ax.annotate(
            u'μ={:.2f}%,   σ={:.2f},   γ={:.2f}'.format(m,s,y),
            xy=(med+2, h), xytext=(med+6, h+0.01),
            arrowprops=dict(facecolor=cl.rgb2hex(c), width=1.5, headwidth=5, shrink=0.1))


    H, prob = kruskalwallis(*[x[0][return_key] for x in plotargs])
    ax.annotate("Kruskal-Wallis: (H={H:.2f}, prob={p:.3f})".format(H=H, p=prob),
                xy=(66,0.01))

    plt.title("Conditional Underpricing Distributions %s" % update_type)
    plt.ylabel("Density")
    plt.xlim(-40, 100)
    plt.xlabel("1st Day Returns (%)")
    plt.ylim((0, 0.12))
Example #7
def kwallis(df, x_col=None, y_col=None, *args, **kwargs):
    if len(df) < 1:
        return 0
    if x_col is not None:
        x = df[x_col].values
    else:
        x = df.x
    if y_col is not None:
        y = df[y_col]
    else:
        y = df.y
    try:
        return float(kruskalwallis(x, y)[0])
    except Exception:
        print(kruskalwallis(x, y))
        raise
Example #8
def check_kw(resid_4d):
    """
    Kruskal-Wallis tests the null hypothesis that the population
    medians of all of the groups are equal. In particular, this
    function performs a Kruskal-Wallis test for each voxel's
    residuals against a sample from the normal distribution.

    Parameters
    ----------
    resid_4d: residual data of 4D numpy array

    Returns
    -------
    kw_3d: 3D array of p-values, one per voxel.
    
    """
    kw_3d = np.zeros(resid_4d.shape[:-1])
    for i in range(resid_4d.shape[0]):
        for j in range(resid_4d.shape[1]):
            for k in range(resid_4d.shape[2]):
                norm_samp = np.random.normal(np.mean(resid_4d[i, j, k, :]),
                                             np.std(resid_4d[i, j, k, :]),
                                             resid_4d.shape[-1])
                junk, kw_3d[i, j, k] = kruskalwallis(resid_4d[i, j, k, :],
                                                     norm_samp)
    return kw_3d
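
A minimal sketch of calling check_kw on synthetic residuals (the shapes are illustrative; it assumes numpy and kruskalwallis are imported as the function requires):

import numpy as np
from scipy.stats.mstats import kruskalwallis

resid = np.random.standard_normal((4, 4, 4, 50))  # synthetic 4D residuals
pvals = check_kw(resid)
print(pvals.shape)            # (4, 4, 4)
print((pvals < 0.05).mean())  # fraction of voxels flagged at alpha = 0.05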
import math

import numpy as np
from scipy.stats import mstats

def anova(dataname, nparray1, nparray2):
    if nparray1.ndim > 1:
        nanmean1 = np.nanmean(np.nanmean(nparray1, axis=1))
        nanvar1 = np.nanvar(np.nanmean(nparray1, axis=1))
        nanmean2 = np.nanmean(np.nanmean(nparray2, axis=1))
        nanvar2 = np.nanvar(np.nanmean(nparray2, axis=1))
        H, pval = mstats.kruskalwallis(np.nanmean(nparray1, axis=1),
                                       np.nanmean(nparray2, axis=1))
    else:
        nanmean1 = np.nanmean(nparray1)
        nanvar1 = np.nanvar(nparray1)
        nanmean2 = np.nanmean(nparray2)
        nanvar2 = np.nanvar(nparray2)
        H, pval = mstats.kruskalwallis(nparray1, nparray2)
    ssmd = (nanmean1 - nanmean2) / math.sqrt(nanvar1 + nanvar2)
    print("anova:", dataname,
          ": N of control, test, Mean of array control, test,"
          " Variance of array control, test, SSMD, H-stat, P-value:",
          len(nparray1), len(nparray2), nanmean1, nanmean2,
          nanvar1, nanvar2, ssmd, H, pval)
Example #10
def kruskal(df, alpha=0.05):
    num_df = df.select_dtypes(include=np.number)
    kruskal_pvalues = np.empty(len(num_df.columns))
    for ind, col in enumerate(num_df.columns):
        test = kruskalwallis(
            *[group[col].values for name, group in df.groupby("ACTIVITY")])
        kruskal_pvalues[ind] = test.pvalue
    return num_df.columns[kruskal_pvalues > alpha].values
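
A usage sketch with a toy DataFrame (the "ACTIVITY" grouping column is required by the function; the other column names are illustrative). The function returns the numeric columns whose Kruskal-Wallis p-value exceeds alpha, i.e. those that do not differ significantly across activities:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "ACTIVITY": ["walk"] * 20 + ["run"] * 20,
    "heart_rate": np.r_[np.random.normal(80, 5, 20), np.random.normal(140, 5, 20)],
    "temperature": np.random.normal(36.6, 0.1, 40),
})
print(kruskal(df))  # most likely ['temperature'], the column with no group difference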
Example #11
    def sig_relationship_kruskalwallis(self, frame):
        factors = frame.dimension.unique().tolist()
        data_sets = list()
        for factor in factors:
            data_sets.append(frame.loc[frame.dimension == factor, 'value'])

        if len(data_sets) < 2:
            return 1
        else:
            return kruskalwallis(*data_sets)[1]
Example #12
def test(matrix, columns=None):

    if columns is None:
        columns = range(matrix.shape[1])
    # Collect the selected columns as separate samples for the test
    aux = []
    for i in columns:
        aux.append(matrix[:, i])

    return kruskalwallis(*aux)
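
For example (a sketch; matrix rows are observations and columns are the groups being compared, with numpy and kruskalwallis assumed imported):

import numpy as np
from scipy.stats.mstats import kruskalwallis

matrix = np.random.rand(30, 4)           # 30 observations in 4 groups (columns)
h, p = test(matrix)                      # compare all four columns
h2, p2 = test(matrix, columns=[0, 2])    # compare only columns 0 and 2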
def compare_groups(df,
                   group0,
                   group1,
                   group0_name='group0',
                   group1_name='group1'):
    """
    Calculate log2 fold change and tests for statistical difference 
    (Kruskal-Wallis + FDR correction)
    :param <pd.DataFrame>: Table with normalized read counts for each GO_ID and sample
    :param group0 <list>: Samples of group 0
    :param group1 <list>: Samples of group 1
    :return <pd.DataFrame>: Table with added statistics
    """

    dict_results = {
        GOID: {
            'pval': 0.0,
            'log2fc': 0.0,
            'mean_{}_tpm'.format(group0_name): 0.0,
            'mean_{}_tpm'.format(group1_name): 0.0,
            'padj': 0.0
        }
        for GOID in df.index
    }

    for GOID in df.index:
        GO_group0 = np.array(df.loc[GOID, group0])
        GO_group1 = np.array(df.loc[GOID, group1])

        try:
            H, pval = mstats.kruskalwallis(GO_group0, GO_group1)
        except Exception:
            pval = np.nan
        dict_results[GOID]['pval'] = pval

        mean_group0 = np.mean(GO_group0)
        mean_group1 = np.mean(GO_group1)
        dict_results[GOID]['mean_{}_tpm'.format(group0_name)] = mean_group0
        dict_results[GOID]['mean_{}_tpm'.format(group1_name)] = mean_group1

        try:
            log2fc = np.log2(mean_group0 / mean_group1)
        except Exception:
            log2fc = np.nan
        dict_results[GOID]['log2fc'] = log2fc

    df_results = pd.DataFrame.from_dict(dict_results, orient='index')
    df_results = df_results.replace([np.inf, -np.inf], np.nan)
    df_results = df_results.fillna(0.0)

    df_results = do_FDR_correction(df_results)

    return df_results
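
A usage sketch. do_FDR_correction is not shown in this example, so a hypothetical Benjamini-Hochberg stand-in is stubbed in here; the GO IDs and sample names are illustrative, and numpy, pandas, and scipy.stats.mstats are assumed imported as above:

from statsmodels.stats.multitest import multipletests

def do_FDR_correction(df_results):
    # Hypothetical stand-in for the helper used above:
    # Benjamini-Hochberg correction on the 'pval' column.
    _, padj, _, _ = multipletests(df_results['pval'], method='fdr_bh')
    df_results['padj'] = padj
    return df_results

counts = pd.DataFrame(np.random.rand(5, 6) * 100,
                      index=['GO:%07d' % i for i in range(5)],
                      columns=['s1', 's2', 's3', 's4', 's5', 's6'])
res = compare_groups(counts, ['s1', 's2', 's3'], ['s4', 's5', 's6'])
print(res[['pval', 'padj', 'log2fc']])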
def test_significance_of_relationship(monkeypatch):
    monkeypatch.setattr(Processor,
                        'dimension_value_frame',
                        mock_dim_value_frame)

    mock_data = mock_dim_value_frame(None, None, None)
    exp_arr_1 = mock_data.loc[mock_data.dimension == 'D1', 'value']
    exp_arr_2 = mock_data.loc[mock_data.dimension == 'D2', 'value']
    exp = kruskalwallis(exp_arr_1, exp_arr_2)[1]

    p = Processor()
    a = Analyzer(processor=p)
    p_val = a.significance_of_relationship('D1', 'Q1', 'kruskalwallis')
    assert p_val == exp
Example #15
def kw_test(data):
    """
    data = [data2d_1,..,data2d_n]
    """
    n_pos = data[0].shape[1]
    p_values = np.zeros((n_pos,))

    for pos in range(n_pos):
        samples = [
            data2d[:, pos] for data2d in data
        ]
        h, p_values[pos] = kruskalwallis(*samples)

    return p_values
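
A small sketch of the expected input shape (each 2-D array is one group and columns are positions; numpy and kruskalwallis are assumed imported):

import numpy as np
from scipy.stats.mstats import kruskalwallis

data = [np.random.rand(30, 5) for _ in range(3)]  # three groups, 30 obs at 5 positions
print(kw_test(data))  # array of 5 p-values, one per position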
def scores(mmax, mbin, mtypes, mtype, m):
    for n in range(0, mmax):
        a = array(mtypes[0][n:n + mbin])
        b = array(mtypes[1][n:n + mbin])
        c = array(mtypes[2][n:n + mbin])
        d = array(mtypes[3][n:n + mbin])
        hstat, pval = s.kruskalwallis(a, b, c, d)
        if n < 100:
            print("{0}\t{1}".format(sorted(m[mtype])[n][0], pval))
            print("{0}\t{1}".format(sorted(m[mtype])[n][0] + 10, pval))
        elif n > 199:
            print("{0}\t{1}".format(sorted(m[mtype])[n][0] - 100, pval))
            print("{0}\t{1}".format(sorted(m[mtype])[n][0] - 90, pval))
        else:
            print("{0}\t{1}".format(sorted(m[mtype])[n][0], pval))
def non_par_test(data, clust_members):
    for i in range(data.shape[0] - 1):
        [test, p] = sci.kruskalwallis(
            list(data[clust_members['Cluster 0']].loc[i + 1]) +
            list(data[clust_members['Cluster 1']].loc[i + 1]),
            list(data[clust_members['Cluster 2']].loc[i + 1]),
            list(data[clust_members['Cluster 3']].loc[i + 1]))
        # list(data[clust_members['Cluster 4']].loc[i + 1]),
        # list(data[clust_members['Cluster 5']].loc[i + 1]),
        # list(data[clust_members['Cluster 6']].loc[i + 1]))

        if p < 0.05:
            print('Lag: ' + str(i) + ' Significant')
        else:
            print('Lag: ' + str(i) + ' ---***---')
Example #18
def kruskalwallis_analysis(mvalues, fnames, fvalues):
    stats = []
    for fname, frow in zip(fnames, fvalues):
        try:
            lists = shatter(mvalues, frow)
            summary = {k: "%.4g" % (np.mean(v)) for k, v in lists.items()}
            summary = [":".join([k, v]) for k, v in summary.items()]
            summary = "|".join(summary)
            hstat, p = kruskalwallis(*lists.values())
            stats.append([fname, summary, p])
        except Exception:
            sys.stderr.write(
                "NOTE: Unable to compute Kruskal-Wallis with feature: " +
                fname + "\n")
    return adjust_stats(stats)
Example #19
def kruskalTest(november):
    Col_1 = np.concatenate(november.select('rain_intensity').collect(), axis=0)
    print(Col_1)
    Col_2 = np.concatenate(november.select('internet_level').collect(), axis=0)
    print("Kruskal Wallis H-test test:")
    H, pval = mstats.kruskalwallis(Col_1, Col_2)
    print("H-statistic:", H)
    print("P-Value:", pval)
    if pval < 0.05:
        print(
            "Reject NULL hypothesis - Significant differences exist between groups."
        )
    else:
        print(
            "Fail to reject NULL hypothesis - No significant difference between groups."
        )
Example #20
def kruskalWallis(df, alpha):

    print("Kruskal-Wallis H-test:")
    h = list(df.columns.values)

    for column in h[:-1]:
        # get the H statistic and p-value
        H, pval = mstats.kruskalwallis(df[column].tolist(),
                                       df["quality"].tolist())

        print(" H-statistic:", H)
        print(" P-Value:", pval)
        # check the p-value
        if pval < alpha:
            print("Reject NULL hypothesis - Significant differences exist between", column, "and quality\n")
        else:
            print("Fail to reject NULL hypothesis - No significant difference between", column, "and quality\n")
Example #21
from numpy import array
from scipy.stats.mstats import kruskalwallis

def main():
    # Get the data
    city1 = array([68, 93, 123, 83, 108, 122])
    city2 = array([119, 116, 101, 103, 113, 84])
    city3 = array([70, 68, 54, 73, 81, 68])
    city4 = array([61, 54, 59, 67, 59, 70])
    
    # Perform the Kruskal-Wallis test
    h, p = kruskalwallis(city1, city2, city3, city4)
    
    # Print the results
    if p < 0.05:
        print('There is a significant difference between the cities.')
    else:
        print('No significant difference between the cities.')
        
    return h
def get_test_kw_inner_text_length_y(data):

    # Non-parametric Kruskal-Wallis test between inner_text_length and y
    title = data[data['y'] == "CEML__TITLE"].inner_text_length
    price = data[data['y'] == "CEML__PRICE"].inner_text_length
    desc = data[data['y'] == "CEML__DESCRIPTION"].inner_text_length
    list_items = data[data['y'] ==
                      "CEML__PAGE__DESCRIPTION__LIST__ITEMS"].inner_text_length
    noisy = data[data['y'] == "NOISY"].inner_text_length

    # Downsample every class to half the size of the noisy class
    sample_size = round(len(noisy) / 2)
    title = np.random.choice(title, sample_size)
    desc = np.random.choice(desc, sample_size)
    list_items = np.random.choice(list_items, sample_size)
    price = np.random.choice(price, sample_size)
    noisy = np.random.choice(noisy, sample_size)

    M = np.transpose(np.array([title, price, desc, list_items, noisy]))
    M = pd.DataFrame(M,
                     columns=[
                         'CEML__TITLE', 'CEML__PRICE', 'CEML__DESCRIPTION',
                         'CEML__PAGE__DESCRIPTION__LIST__ITEMS', 'NOISY'
                     ])
    H, pval = mstats.kruskalwallis(
        M['CEML__TITLE'].tolist(), M['CEML__PRICE'].tolist(),
        M['CEML__DESCRIPTION'].tolist(),
        M['CEML__PAGE__DESCRIPTION__LIST__ITEMS'].tolist(),
        M['NOISY'].tolist())
    print("Test Kruskal-Wallis for inner_text_length grouped by y")
    print("H-statistic:", H)
    print("P-Value:", pval)
    if pval < 0.05:
        print(
            "Reject NULL hypothesis - Significant differences exist between groups."
        )
    else:
        print(
            "Fail to reject NULL hypothesis - No significant difference between groups."
        )

    return data
def run_correlation(df, feature, outcome):
    # print("FEATURE",feature,"OUTCOME",outcome)
    # print(len(df.index))
    P_SIGNIFICANT = .05

    outcomes = set(df[outcome].tolist())
    n_outcomes = len(outcomes)
    # print("N OUTCOMES",n_outcomes)

    groups = []
    for oc in outcomes:
        groups += [df[df[outcome] == oc][feature].tolist()]

    are_norm = True
    for g in groups:
        # print(g,len(g))
        (s, p) = mstats.normaltest(g)
        are_norm = are_norm and (p > P_SIGNIFICANT)

    result = {}
    if are_norm:
        if n_outcomes <= 2:
            (s, p) = stats.ttest_ind(groups[0], groups[1])
            result['test'] = 't-test'
        else:
            (s, p) = stats.f_oneway(*groups)
            result['test'] = 'One-way ANOVA'
        result['statistic'] = s
        result['p'] = p
        for (n, g) in zip(range(len(groups)), groups):
            result['mean_%d' % n] = np.mean(g)
    else:
        if n_outcomes <= 2:
            (s, p) = stats.mannwhitneyu(groups[0], groups[1])
            result['test'] = 'Mann-Whitney'
        else:
            # print(len(groups),len(groups[0]))
            (s, p) = mstats.kruskalwallis(*groups)
            result['test'] = 'Kruskal-Wallis'
        result['statistic'] = s
        result['p'] = p
        for (n, g) in zip(range(len(groups)), groups):
            result['mean_%d' % n] = np.mean(g)

    return result
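
A usage sketch with a toy DataFrame (column names are illustrative; numpy, scipy.stats, and scipy.stats.mstats are assumed imported as the function requires):

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "outcome": np.repeat(["a", "b", "c"], 30),
    "score": np.r_[np.random.normal(0, 1, 30),
                   np.random.normal(1, 1, 30),
                   np.random.normal(2, 1, 30)],
})
print(run_correlation(df, "score", "outcome"))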
Example #25
def KWtest(Matrixs, Words, WordLists, option="CustomP", Low=0.0, High=1.0):
    # begin handle options
    MergeList = merge_list(WordLists)
    TotalWordCount = sum(MergeList.values())
    NumWord = len(MergeList)

    High, Low = wordfilter(option, Low, High, NumWord, TotalWordCount, MergeList)
    # end handle options

    Len = max(len(matrix) for matrix in Matrixs)
    # the length of the longest sample set (shorter sample sets are padded and masked to this length)

    word_pvalue_dict = {}  # the result list

    for i in range(1, len(Matrixs[0][0])):  # focusing on a specific word
        word = Words[i - 1]
        if Low < MergeList[word] < High:
            samples = []
            for k in range(len(Matrixs)):  # focusing on a group
                sample = []
                for j in range(len(Matrixs[k])):  # focusing on all the segment of that group
                    # add the sample into the sample list
                    sample.append(Matrixs[k][j][i])

                # combine all the samples of each sample list
                # turn the short ones masked so that all the sample set has the same length
                samples.append(
                    ma.masked_array(
                        sample + [0] * (Len - len(sample)), mask=[0] * len(sample) + [1] * (Len - len(sample))
                    )
                )

            # do the KW test
            try:
                pvalue = kruskalwallis(samples)[1]
            except ValueError as error:
                if error.args[0] == "All numbers are identical in kruskal":  # check the error message
                    pvalue = "Invalid"
                else:
                    raise

            # put the result in the dict
            word_pvalue_dict.update({word: pvalue})
    return sorted(word_pvalue_dict.items(), key=itemgetter(1))
Example #26
def kruskal_wallis(norm_df, metadata, groups):
    """ performes the kruskal wallis test and corrects the obtained p-values
    using Benjamini Hochberg FDR correction
    --------
    norm_df
        dataframe, normalized GCs
    metadata
        dict, {sample id: metadata}
    groups
        list, names of the inputted groups
    returns
    --------
    fdr_df = dataframe, contains the adjusted P-values for the GCs
    """
    gc_groups = {}
    p_values = []

    group1 = groups[0]
    group2 = groups[1]
    row = pd.Series(metadata, name="Metadata")
    df = pd.concat([norm_df, row.to_frame().T]).sort_values(by=["Metadata"], axis=1)
    df = df.replace(0, float(0.0)).T
    df = df.loc[df["Metadata"].isin(groups)]

    for gc_name in df.columns:
        if "GC_DNA--" in gc_name:  # filter out the housekeeping genes
            gc_groups[gc_name] = {}
            for grp in df['Metadata'].unique():
                # make arrays of the groups per GC {GC: {group1: array, group2: array}}
                gc_groups[gc_name][grp] = df[gc_name][df['Metadata'] ==
                                                      grp].values
    # perform Kruskal Wallis test
    for gc in gc_groups.keys():
        no, pval = mstats.kruskalwallis(gc_groups[gc][group1],
                                        gc_groups[gc][group2])
        p_values.append(pval)
    fdr = fdrcorrection(p_values, alpha=0.05, method="i")
    fdr_df = pd.DataFrame(data=fdr,
                          columns=gc_groups.keys(),
                          index=["T/F", "pval"]).T
    return fdr_df
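
A toy invocation sketch (the "GC_DNA--" column prefix is required by the filter above; the sample names and group labels are illustrative):

import numpy as np
import pandas as pd

samples = ['s%d' % i for i in range(6)]
norm_df = pd.DataFrame(np.random.rand(2, 6),
                       index=['GC_DNA--A', 'GC_DNA--B'],
                       columns=samples)
metadata = {'s0': 'grp1', 's1': 'grp1', 's2': 'grp1',
            's3': 'grp2', 's4': 'grp2', 's5': 'grp2'}
print(kruskal_wallis(norm_df, metadata, ['grp1', 'grp2']))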
Example #27
import numpy as np
from scipy.stats.mstats import kruskalwallis

def main():
    '''These data could be a comparison of the smog levels in four different cities. '''
    
    # Get the data
    city1 = np.array([68, 93, 123, 83, 108, 122])
    city2 = np.array([119, 116, 101, 103, 113, 84])
    city3 = np.array([70, 68, 54, 73, 81, 68])
    city4 = np.array([61, 54, 59, 67, 59, 70])
    
    # --- >>> START stats <<< ---
    # Perform the Kruskal-Wallis test
    h, p = kruskalwallis(city1, city2, city3, city4)
    # --- >>> STOP stats <<< ---
    
    # Print the results
    if p < 0.05:
        print('There is a significant difference between the cities.')
    else:
        print('No significant difference between the cities.')
        
    return h
Example #29
    def do_kruskal(self, region, depth, year, path, prefix):
        """
        apply a Kruskal-Wallis test for a given year, region and depth
        """

        # get file name for the clustered file
        name_clst = path + prefix + '_gaModel_' + region + '_' + str(depth) + '_' + str(year) + '.txt' 
        # get file name for the non-declustered file
        name_no_clst = path + 'gaModel_' + region + '_' + str(depth) + '_' + str(year) + '.txt' 
        
        # get file pointer for the clustered_file
        fp_clst = open(name_clst, 'r')
        # get file pointer for the non-declustered file
        fp_no_clst = open(name_no_clst, 'r')
        
        # get list of fitness of individuals for clustering 
        fit_clst = []
        for line in fp_clst: 
            fit_clst.append(float(line))

        # get list of fitness of individuals without clustering
        fit_no_clst = []
        for line in fp_no_clst:
            fit_no_clst.append(float(line))

        # do kruskal wallis test 
        try:
            result = kruskalwallis(fit_no_clst, fit_clst)
            p_value = result[1]
        except ValueError:
            print("Todos os numeros do teste de Kruskal sao iguais, a funcao retorna um erro")
            p_value = None

        print("Mean of the undeclustered: " + str(statistics.mean(fit_no_clst)))
        print("Mean of the declustered: " + str(statistics.mean(fit_clst)))
        print("p-value for the kruskal-wallis: " + str(p_value))

        # close files
        fp_clst.close()
        fp_no_clst.close()
Example #30
def KruskalTest(Type='NbComments'):
    if Type == 'NbComments':
        Groups, NbComments = Luxury_vs_NonLuxury(False)

        df = pd.DataFrame({'Groups': Groups, 'NbComments': NbComments})
        df['Groups'].replace({'Luxary': 1, 'NonLuxuary': 2}, inplace=True)

        Col_1 = df['NbComments'].tolist()
        Col_2 = df['Groups'].tolist()
    else:
        Groups, NbComments, Sentiments = Luxury_vs_NonLuxury(True)
        SGroups = []
        for i in range(0, len(Groups)):
            SGroups.extend(repeat(Groups[i], len(Sentiments[i])))
        GSentiments = [
            float(item) for sublist in Sentiments for item in sublist
        ]
        df = pd.DataFrame({'Groups': SGroups, 'Sentiments': GSentiments})
        df['Groups'].replace({'Luxary': 1, 'NonLuxuary': 2}, inplace=True)

        Col_1 = df['Sentiments'].tolist()
        Col_2 = df['Groups'].tolist()
    print("Kruskal Wallis H-test " + Type + " test:")

    H, pval = mstats.kruskalwallis(Col_1, Col_2)

    print("H-statistic:", H)
    print("P-Value:", pval)

    if pval < 0.05:
        print(
            "Reject NULL hypothesis - Significant differences exist between groups."
        )
    else:
        print(
            "Fail to reject NULL hypothesis - No significant difference between groups."
        )

    return df
Example #31
def kw_test_for_means(current_climate = True, data_folder = 'data/streamflows/hydrosheds_euler9', months = list(range(1,13))):
    """
    returns p-values resulting from kruskal - wallis test on annual means
    """

    the_ids = members.all_current if current_climate else members.all_future

    file_paths = []
    for the_file in os.listdir(data_folder):
        if the_file.split("_")[0] in the_ids:
            file_paths.append(os.path.join(data_folder, the_file))

    real_means = []
    for the_path in file_paths:
        streamflow, times, i_indices, j_indices = data_select.get_data_from_file(the_path)

        #for each year and for each gridcell get mean value for the period
        means_dict = data_select.get_means_over_months_for_each_year(times, streamflow, months = months)

        means_sorted_in_time = [x[1] for x in sorted(list(means_dict.items()), key=lambda x: x[0])]
        data_matrix = np.array(means_sorted_in_time)
        real_means.append(data_matrix) #save modelled means
        #print "data_matrix.shape = ", data_matrix.shape

    n_positions = real_means[0].shape[1]
    p_values = np.zeros((n_positions,))
    for pos in range(n_positions):
        samples = [
            data2d[:, pos] for data2d in real_means
        ]

        #x = list(samples)
        #print len(x), x[0].shape


        h, p_values[pos] = kruskalwallis(*samples)
    return p_values
def plot_var_dist(plotargs, kkey='IPO_duration', kw_xy=(20,20)):

    f, ax = plt.subplots(1,1, figsize=(12, 4), sharex=True)

    for arg in plotargs:
        df, label, color, xshift, yshift = arg
        color = sb.color_palette("muted")[color]
        label += " Obs={}".format(len(df))

        # Summary stats:
        mean = df[kkey].mean()
        mode = df[kkey].mode()
        med  = df[kkey].median()
        std  = df[kkey].std()
        skew = df[kkey].skew()
        stat = u"\nμ={:0.2f}  med={:0.2f}\nσ={:0.2f}  skew={:0.2f}".format(
                mean, med, std, skew)

        yvals, xvals, patches = plt.hist(df[kkey].tolist(), bins=36, label=label,
                                color=color, alpha=0.6, histtype='stepfilled')

        coords = list(zip(yvals,xvals))
        coords.sort()
        y,x = coords[-3]

        ax.annotate(stat,
                    xy=(x, y),
                    xytext=(x*xshift, y*yshift),
                    arrowprops=dict(facecolor=color,
                                    width=1.6,
                                    headwidth=1.6))

    H, prob = kruskalwallis(*[x[0][kkey] for x in plotargs])
    # U, prob = mannwhitneyu(*[x[0][kkey] for x in plotargs])
    ax.annotate("Kruskal-Wallis: (H={H:.2f}, prob={p:.3f})".format(H=H, p=prob),
                xy=(kw_xy[0], kw_xy[1]))
    plt.ylabel("Frequency")
    plt.legend()
Example #33
def check_kw(resid_4d): 
    """
    Kruskal-Wallis tests the null hypothesis that the population 
    median of all of the groups are equal. In particular, this 
    function performs a Kruskal-Wallis test for each voxel's 
    residuals against a sample from the normal distribution. 

    Parameters
    ---------
    resid_4d: residual data of 4D numpy array
    
    Returns
    -------
    kw_normality: p-value from Kruskal-Wallis normality test
    
    """
    kw_3d = np.zeros(resid_4d.shape[:-1])
    for i in range(resid_4d.shape[0]):
        for j in range(resid_4d.shape[1]):
            for k in range(resid_4d.shape[2]):
                norm_samp = np.random.normal(np.mean(resid_4d[i,j,k,:]), np.std(resid_4d[i,j,k,:]), resid_4d.shape[-1])
                junk, kw_3d[i,j,k] = kruskalwallis(resid_4d[i,j,k,:], norm_samp)
    return kw_3d
def run_correlation(df):
    global FEATURES
    global P_SIGNIFICANT

    results = []

    for intention in INTENTION_COLUMNS:
        for feature in FEATURES:
            res = {'feature': feature, 'intention': intention}
            group1 = df[df["intent_current_" +
                           intention] == 0][feature].tolist()
            group2 = df[df["intent_current_" +
                           intention] == 1][feature].tolist()

            are_norm = True
            (s, p) = mstats.normaltest(group1)
            are_norm = are_norm and (p > P_SIGNIFICANT)
            (s, p) = mstats.normaltest(group2)
            are_norm = are_norm and (p > P_SIGNIFICANT)
            if are_norm:
                (s, p) = stats.f_oneway(group1, group2)
                res['test'] = 'One-way ANOVA'
                res['statistic'] = s
                res['p'] = p
                res['mean_0'] = np.mean(group1)
                res['mean_1'] = np.mean(group2)
            else:
                (s, p) = mstats.kruskalwallis(group1, group2)
                res['test'] = 'Kruskal-Wallis'
                res['statistic'] = s
                res['p'] = p
                res['mean_0'] = np.mean(group1)
                res['mean_1'] = np.mean(group2)

            results += [res]
    return results
Example #35
    def do_kruskal_old(self, region, year, depth, logbooks, subdir):
        """
        do a kruskal-wallis test between two logbook files - the year, region and depth must be the same   
        """
        # constants defined to improve legibility
        NRUNS = 10  # number of times we ran a gamodel simulation
        BEST = 4  # index of the column that contains the best individual in the catalog
        NGEN = 99  # number of the last generation for the gamodel
        NO_DECLUSTER = 0  # logbook index for the undeclustered catalog
        DECLUSTER = 1  # logbook index for the declustered catalog

        # open both files read only 
        NAME_NO_DECLUSTER = '../catalogs/' + logbooks[NO_DECLUSTER] + subdir + '/' + region + \
                     '_' + year + '_' + depth + '_logbook.txt' 
        log_no_decluster = open(NAME_NO_DECLUSTER, 'r') 

        NAME_DECLUSTER = '../catalogs/' + logbooks[DECLUSTER] + subdir + '/' + region + \
                     '_' + year + '_' + depth + '_logbook.txt' 
        log_decluster = open(NAME_DECLUSTER, 'r') 

        # create list for the best value of individuals 
        best_no_decluster = []
        best_decluster = []

        # iterate through the first catalog, registering the solution for the best individual in each gamodel run
        cur_gen = 0 
        for line in log_no_decluster:
            if cur_gen != NGEN:
                cur_gen += 1
                continue 
            else:
                cur_gen = 0 # start all over 
                contents = line.split()
                best_no_decluster.append(float(contents[BEST]))

        # do the same for the second catalog
        cur_gen = 0 
        for line in log_decluster:
            if cur_gen != NGEN:
                cur_gen += 1
                continue 
            else:
                cur_gen = 0 
                contents = line.split()
                best_decluster.append(float(contents[BEST]))

        # header
        print("####region: " + str(region) + "  year: " + str(year) + "  depth: " + str(depth))
        
        # perform a kruskal-wallis test
        try:
            result = kruskalwallis(best_no_decluster, best_decluster)
            p_value = result[1]
        except ValueError:
            print("Todos os numeros do teste de Kruskal sao iguais, a funcao retorna um erro")
            p_value = None

        # print the result - Redirect this to a file if you want
        print("Mean of the undeclustered: " + str(statistics.mean(best_no_decluster)))
        print("Mean of the declustered: " + str(statistics.mean(best_decluster)))
        print("p-value for the kruskal-wallis: " + str(p_value))
        print(best_decluster)
        print(best_no_decluster)
        print("\n\n\n")
        input() 
        # close open file pointer
        log_decluster.close()
        log_no_decluster.close()
Example #36
def kwtest(s, groupby, df):
    return kruskalwallis(*[group[s] for group in stratified(groupby, df)])
    def performancesForThreadingTask(self):
        if not self.perfsCalculated:
            self.calculatePerformances()

        durations = []
        angularDists = []
        speeds = []
        averageAccels = []
        smoothnesses = []
        handednesses = []
        speedVariances = []
        ambidexterities = []
        significances = []
        perfs = []
        for perf in self.novicePerfs + self.intermediatePerfs + self.expertPerfs:
            durations.append(perf["duration"])
            angularDists.append(perf["angularDist"])
            speeds.append(perf["averageSpeed"])
            averageAccels.append(perf["averageAccel"])
            smoothnesses.append(perf["motionSmoothness"])
            handednesses.append(perf["handedness"])
            speedVariances.append(perf["speedVariance"])
            ambidexterities.append(perf["ambidextricity"][0])
            significances.append(perf["ambidextricity"][1])
            perfs.append(perf["perf"])

        ticks = (
            ["N" + str(i) for i in range(1, 11)]
            + ["I" + str(i) for i in range(1, 11)]
            + ["E" + str(i) for i in range(1, 11)]
        )
        colours = ["r"] * 10 + ["g"] * 10 + ["b"] * 10

        # Kruskal-Wallis tests (non-parametric ANOVAs)

        # Between novices, intermediates and experts
        anova_durations = kruskalwallis(durations[0:10], durations[10:20], durations[20:30])
        anova_distances = kruskalwallis(angularDists[0:10], angularDists[10:20], angularDists[20:30])
        anova_speeds = kruskalwallis(speeds[0:10], speeds[10:20], speeds[20:30])
        anova_accels = kruskalwallis(averageAccels[0:10], averageAccels[10:20], averageAccels[20:30])
        anova_smoothness = kruskalwallis(smoothnesses[0:10], smoothnesses[10:20], smoothnesses[20:30])
        anova_handedness = kruskalwallis(handednesses[0:10], handednesses[10:20], handednesses[20:30])
        anova_variances = kruskalwallis(speedVariances[0:10], speedVariances[10:20], speedVariances[20:30])
        anova_ambidexterities = kruskalwallis(ambidexterities[0:10], ambidexterities[10:20], ambidexterities[20:30])
        anova_perfs = kruskalwallis(perfs[0:10], perfs[10:20], perfs[20:30])

        # Between experts and non-experts
        anova_two_durations = mannwhitneyu(durations[0:20], durations[20:30])
        anova_two_distances = mannwhitneyu(angularDists[0:20], angularDists[20:30])
        anova_two_speeds = mannwhitneyu(speeds[0:20], speeds[20:30])
        anova_two_accels = mannwhitneyu(averageAccels[0:20], averageAccels[20:30])
        # anova_two_smoothness = mannwhitneyu(smoothnesses[0:10]+smoothnesses[10:20],smoothnesses[20:30])
        anova_two_smoothness = mannwhitneyu(smoothnesses[0:20], smoothnesses[20:30])
        anova_two_handedness = mannwhitneyu(handednesses[0:20], handednesses[20:30])
        anova_two_variances = mannwhitneyu(speedVariances[0:20], speedVariances[20:30])
        anova_two_ambidexterities = mannwhitneyu(ambidexterities[0:20], ambidexterities[20:30])
        anova_two_perfs = mannwhitneyu(perfs[0:20], perfs[20:30])

        # SCATTER PLOTS
        def save_scatter(data, colours, title, ylabel, savePath, yLimTuple=None):
            fig = plt.figure()
            ax = fig.add_subplot(111)
            plt.title(title)
            plt.xlabel("Trials")
            plt.ylabel(ylabel)

            # Sort by value
            data, colours = zip(*sorted(zip(data, colours)))

            nov_data = [(i, d) for (i, d, c) in zip(range(len(data)), data, colours) if c == "r"]
            inter_data = [(i, d) for (i, d, c) in zip(range(len(data)), data, colours) if c == "g"]
            exp_data = [(i, d) for (i, d, c) in zip(range(len(data)), data, colours) if c == "b"]

            nov_x, nov_y = zip(*nov_data)
            inter_x, inter_y = zip(*inter_data)
            exp_x, exp_y = zip(*exp_data)
            nov = ax.scatter(nov_x, nov_y, color="r", marker="o", s=60)
            inter = ax.scatter(inter_x, inter_y, color="g", marker="^", s=60)
            exp = ax.scatter(exp_x, exp_y, color="b", marker="*", s=60)

            plt.legend((nov, inter, exp), ["Novice", "Intermediate", "Expert"], loc=2)
            plt.xticks([])
            plt.gca().set_xlim(-1, len(data))
            if yLimTuple:
                plt.gca().set_ylim(yLimTuple)
            plt.tight_layout()
            with open(savePath, "w") as figOut:
                plt.savefig(figOut)

        save_scatter(
            durations,
            colours,
            "Task Duration",
            "Time (seconds)",
            "/Users/robertevans/repos/minf/keyhole_graphs/durations.png",
        )
        save_scatter(
            angularDists,
            colours,
            "Total Angular Distance",
            "Rotation (radians)",
            "/Users/robertevans/repos/minf/keyhole_graphs/distances.png",
        )
        save_scatter(
            speeds,
            colours,
            "Average Speed",
            "Speed (radians/second)",
            "/Users/robertevans/repos/minf/keyhole_graphs/speeds.png",
        )
        save_scatter(
            averageAccels,
            colours,
            "Average Acceleration",
            "Acceleration (radians/second$^2$)",
            "/Users/robertevans/repos/minf/keyhole_graphs/accels.png",
        )
        save_scatter(
            smoothnesses,
            colours,
            "Motion Smoothness",
            "Smoothness (radians/second$^3$)",
            "/Users/robertevans/repos/minf/keyhole_graphs/smoothnesses.png",
        )
        save_scatter(
            handednesses,
            colours,
            "Handedness",
            "Right distance minus left distance per frame (radians)",
            "/Users/robertevans/repos/minf/keyhole_graphs/handednesses.png",
        )
        save_scatter(
            speedVariances,
            colours,
            "Variance of Angular Speed",
            "Variance (radians/second)",
            "/Users/robertevans/repos/minf/keyhole_graphs/variances.png",
        )
        save_scatter(
            ambidexterities,
            colours,
            "Ambidexterity",
            "Spearman correlation for left/right speeds (per frame)",
            "/Users/robertevans/repos/minf/keyhole_graphs/ambidexterities.png",
        )
        save_scatter(
            perfs,
            colours,
            "Total Task Performance",
            "Score (radians$^{-1}$seconds$^{-1}$)",
            "/Users/robertevans/repos/minf/keyhole_graphs/scores.png",
        )

        # BOX PLOTS
        def save_box_plot(data, key, p_value, savePath, title, ylabel):
            plt.figure()
            plt.title("{0} - p-value: {1:.3g}".format(title, p_value))
            plt.ylabel(ylabel)
            plt.boxplot(data)
            plt.xticks(range(1, len(data) + 1), key)
            plt.tight_layout()
            with open(savePath, "w") as figOut:
                plt.savefig(figOut)

        save_box_plot(
            [durations[:10], durations[10:20], durations[20:]],
            ("Novices", "Intermediates", "Experts"),
            anova_durations[1],
            "/Users/robertevans/repos/minf/keyhole_graphs/durations_box_three.png",
            "Duration",
            "Time (seconds)",
        )
        save_box_plot(
            [angularDists[:10], angularDists[10:20], angularDists[20:]],
            ("Novices", "Intermediates", "Experts"),
            anova_distances[1],
            "/Users/robertevans/repos/minf/keyhole_graphs/distances_box_three.png",
            "Distance",
            "Rotation (radians)",
        )
        save_box_plot(
            [speeds[:10], speeds[10:20], speeds[20:]],
            ("Novices", "Intermediates", "Experts"),
            anova_speeds[1],
            "/Users/robertevans/repos/minf/keyhole_graphs/speeds_box_three.png",
            "Speed",
            "Speed (radians/second)",
        )
        save_box_plot(
            [averageAccels[:10], averageAccels[10:20], averageAccels[20:]],
            ("Novices", "Intermediates", "Experts"),
            anova_accels[1],
            "/Users/robertevans/repos/minf/keyhole_graphs/accels_box_three.png",
            "Acceleration",
            "Acceleration (radians/second$^2$)",
        )
        save_box_plot(
            [smoothnesses[0:10], smoothnesses[10:20], smoothnesses[20:]],
            ("Novices", "Intermediates", "Experts"),
            anova_smoothness[1],
            "/Users/robertevans/repos/minf/keyhole_graphs/smoothnesses_box_three.png",
            "Smoothness",
            "Smoothness (radians/second$^3$)",
        )
        save_box_plot(
            [handednesses[:10], handednesses[10:20], handednesses[20:]],
            ("Novices", "Intermediates", "Experts"),
            anova_handedness[1],
            "/Users/robertevans/repos/minf/keyhole_graphs/handednesses_box_three.png",
            "Handedness",
            "Right distance minus left distance per frame (radians)",
        )
        save_box_plot(
            [speedVariances[:10], speedVariances[10:20], speedVariances[20:]],
            ("Novices", "Intermediates", "Experts"),
            anova_variances[1],
            "/Users/robertevans/repos/minf/keyhole_graphs/variances_box_three.png",
            "Speed Variance",
            "Variance (radians/second)",
        )
        save_box_plot(
            [ambidexterities[:10], ambidexterities[10:20], ambidexterities[20:]],
            ("Novices", "Intermediates", "Experts"),
            anova_ambidexterities[1],
            "/Users/robertevans/repos/minf/keyhole_graphs/ambidexterities_box_three.png",
            "Ambidexterity",
            "Spearman correlation for left/right speeds (per frame)",
        )
        save_box_plot(
            [perfs[:10], perfs[10:20], perfs[20:]],
            ("Novices", "Intermediates", "Experts"),
            anova_perfs[1],
            "/Users/robertevans/repos/minf/keyhole_graphs/scores_box_three.png",
            "Performance",
            "Score (radians$^{-1}$seconds$^{-1}$)",
        )

        save_box_plot(
            [durations[:20], durations[20:]],
            ("Non-Experts", "Experts"),
            anova_two_durations[1] * 2,
            "/Users/robertevans/repos/minf/keyhole_graphs/durations_box_two.png",
            "Duration",
            "Time (seconds)",
        )
        save_box_plot(
            [angularDists[:20], angularDists[20:]],
            ("Non-Experts", "Experts"),
            anova_two_distances[1] * 2,
            "/Users/robertevans/repos/minf/keyhole_graphs/distances_box_two.png",
            "Distance",
            "Rotation (radians)",
        )
        save_box_plot(
            [speeds[:20], speeds[20:]],
            ("Non-Experts", "Experts"),
            anova_two_speeds[1] * 2,
            "/Users/robertevans/repos/minf/keyhole_graphs/speeds_box_two.png",
            "Speed",
            "Speed (radians/second)",
        )
        save_box_plot(
            [averageAccels[:20], averageAccels[20:]],
            ("Non-Experts", "Experts"),
            anova_two_accels[1] * 2,
            "/Users/robertevans/repos/minf/keyhole_graphs/accels_box_two.png",
            "Acceleration",
            "Acceleration (radians/second$^2$)",
        )
        save_box_plot(
            [smoothnesses[0:10] + smoothnesses[10:20], smoothnesses[20:]],
            ("Non-Experts", "Experts"),
            anova_two_smoothness[1] * 2,
            "/Users/robertevans/repos/minf/keyhole_graphs/smoothnesses_box_two.png",
            "Smoothness",
            "Smoothness (radians/second$^3$)",
        )
        save_box_plot(
            [handednesses[:20], handednesses[20:]],
            ("Non-Experts", "Experts"),
            anova_two_handedness[1] * 2,
            "/Users/robertevans/repos/minf/keyhole_graphs/handednesses_box_two.png",
            "Handedness",
            "Right distance minus left distance per frame (radians)",
        )
        save_box_plot(
            [speedVariances[:20], speedVariances[20:]],
            ("Non-Experts", "Experts"),
            anova_two_variances[1] * 2,
            "/Users/robertevans/repos/minf/keyhole_graphs/variances_box_two.png",
            "Speed Variance",
            "Variance (radians/second)",
        )
        save_box_plot(
            [ambidexterities[:20], ambidexterities[20:]],
            ("Non-Experts", "Experts"),
            anova_two_ambidexterities[1] * 2,
            "/Users/robertevans/repos/minf/keyhole_graphs/ambidexterities_box_two.png",
            "Ambidexterity",
            "Spearman correlation for left/right speeds (per frame)",
        )
        save_box_plot(
            [perfs[:20], perfs[20:]],
            ("Non-Experts", "Experts"),
            anova_two_perfs[1] * 2,
            "/Users/robertevans/repos/minf/keyhole_graphs/scores_box_two.png",
            "Performance",
            "Score (radians$^{-1}$seconds$^{-1}$)",
        )

        plt.show()
Example #38
import sys
import json
from os import path
from collections import defaultdict

from scipy import stats
from scipy.stats.mstats import kruskalwallis

from util import pretty_name, median_deviation

if __name__ == '__main__':
    # Run through all of the files gathering different seeds into lists
    statify = defaultdict(list)
    active = defaultdict(list)
    filecount = 0
    for filename in sys.argv[1:]:
        base = path.basename(filename)
        try:
            problem, nodes, version, seed = base.split('_')
            with open(filename, 'r') as f:
                data = json.load(f)
            statify[version].append(data[1]['evals'])
            active[version].append(data[1]['phenotype'])
            filecount += 1
        except ValueError:
            print(filename, "FAILED")

    print('Files Successfully Loaded', filecount)
    print('Kruskal Wallis', kruskalwallis(*statify.values()))
    for version, data in statify.items():
        print('--------- %s ---------' % pretty_name[version])
        print("MES, MAD", median_deviation(data))
        print('Active', median_deviation(active[version]))
        print('Mann Whitney U against Normal',
              stats.mannwhitneyu(statify['normal'], data))
Example #39
                se.dzs_mean + se.dzs_std,
                facecolor='r',
                alpha=alphafill,
                lw=0.01)

intarr = N.concatenate(tuple([dz for dz in interior.normdz]),
                       axis=2).reshape(len(interior.normdz), 100)
scarr = N.concatenate(tuple([dz for dz in sc.normdz]),
                      axis=2).reshape(len(sc.normdz), 100)
searr = N.concatenate(tuple([dz for dz in se.normdz]),
                      axis=2).reshape(len(se.normdz), 100)
sarr = N.vstack((scarr, searr))

kw = []
for i in range(100):
    kw.append(kruskalwallis(intarr[:, i], sarr[:, i])[1])

#ax.plot(interior.norme,inarr[0,:],'--k')

plt.legend([l1, l2, l3], [
    'Interior N=%s' % str(len(interior.name)),
    'South Central N=%s' % str(len(sc.name)),
    'Southeast N=%s' % str(len(se.name))
],
           loc='lower left',
           fontsize=9)

font = matplotlib.font_manager.FontProperties(family='Arial',
                                              weight='bold',
                                              size=15)
#ax.annotate('A',[0.07,0.5],horizontalalignment='center',verticalalignment='center',fontproperties=font)
Example #40
# -------------------------------------------------------------------
# 1. Calculate the squared difference between the two vectors
sq_diff_ab = np.square(mean_vector_amp - mean_vector_bp)
sse_ab = np.sum(sq_diff_ab)
norm_ab = np.sqrt(sse_ab)
print('the L2-norm is %.2f' % norm_ab)

# 2. Threshold and ratio
counter = 0
threshold = 0.01
print('the threshold is %.2f%%' % (threshold * 100))
for i in range(num_elements):
    diff = np.abs(mean_vector_amp[0][i] - mean_vector_bp[0][i])
    if diff <= threshold:
        counter += 1
ratio = float(counter) / num_elements
print('the ratio is %.2f%%' % (ratio * 100))

# 3. Calculate the correlation between the two vectors
cocoef_matrix = np.corrcoef(mean_array_amp, mean_array_bp)
cocoef = cocoef_matrix[0, 1]
print('the correlation coefficient is %0.3f' % cocoef)

# 4. Kruskal-Wallis test for the median difference between the two distributions
H, pvalue = kruskalwallis(mean_vector_amp[0], mean_vector_bp[0])
print('the p-value is %.2f' % pvalue)

if pvalue > 0.05:
    print("fail to reject null hypothesis: no significant difference between the two groups")
# -------------------------------------------------------------------
def FindDwellTimes(concat_table):
    vertical_stats = { 'i' : { 'on_dwell': [], 'off_dwell':[] ,\
                  'on_count':0.0, 'off_count':0.0, \
                  'pos_dwell' : {0:[],1:[], 2:[], 3:[], 4:[]},\
                  'all_clicks':[], \
                  'clicks':{0:0.0,1:0.0, 2: 0.0, 3:0.0, 4:0.0}}, \
            'w' : { 'on_dwell': [], 'off_dwell':[],'on_count':0.0,\
                    'off_count':0.0 ,'all_clicks':[],\
                    'pos_dwell' : {0:[],1:[], 2:[], 3:[], 4:[]} ,\
                    'clicks':{0:0.0,1:0.0, 2: 0.0, 3:0.0, 4:0.0}}, \
            'o' : { 'on_dwell': [], 'off_dwell':[],'on_count':0.0,\
                    'off_count':0.0,'pos_dwell' : {0:[],1:[], 2:[], 3:[], 4:[]},\
                    'all_clicks':[],\
                    'clicks':{0:0.0,1:0.0, 2: 0.0, 3:0.0, 4:0.0}  }, \
            'v' : { 'on_dwell': [], 'off_dwell':[],'on_count':0.0,\
                    'off_count':0.0 ,'all_clicks':[], \
                    'pos_dwell' : {0:[],1:[], 2:[], 3:[], 4:[]},\
                    'clicks':{0:0.0,1:0.0, 2: 0.0, 3:0.0, 4:0.0},  \
                  } \
    }

    # Group by user_id and task_id and sort by time within each group.
    grouped_table = concat_table.groupby(['user_id','task_id'])
    for name, group in grouped_table:
        group = group.sort_values('time')
        rows = []
        results = {}
        serp_clicks = 0
        vert_type = None
        recorded_clicks = {}

        for index, row in group.iterrows():
            rows.append(row)
        for i in range(len(rows)):
            row = rows[i]
            # Store results.
            if row['type'] == 'results':
                results[row['doc_pos']] = row

            if row['type'] == 'results' and row['doc_pos'] == 0:
                # For each page find time it was tapped or clicked. 
                # Take the min for dwell time.
                vtype = None
                for curl, stats in recorded_clicks.items():
                    vtype = stats['type']
                    if stats['rank'] < 5:
                        vertical_stats[vtype]['pos_dwell'][stats['rank']].append(min(stats['time']))
                        vertical_stats[vtype]['clicks'][stats['rank']]+=1
                    if stats['rank'] == 0:
                        vertical_stats[vtype]['on_dwell'].append(min(stats['time']))
                        vertical_stats[vtype]['on_count']+=1.0
                    else:
                        vertical_stats[vtype]['off_dwell'].append(min(stats['time']))
                        vertical_stats[vtype]['off_count']+=1.0
                    if vtype and serp_clicks > 0:
                      vertical_stats[vtype]['all_clicks'].append(serp_clicks)
                recorded_clicks = {}
                serp_clicks = 0
                vert_type = str(row['doc_type']).strip()

            # Found a tap or a click
            start_time  = row['time']
            end_time = None
            found = False
            click_rank = None
            click_url = None
            if (row['type'] == 'event' and 'tap' in row['event_type'] and\
                row['element'] > -1) or (row['type'] == 'click') :
                if row['type'] == 'event':
                    click_url = results[row['element']]['doc_url']
                    click_rank = int(row['element'])

                if (row['type'] == 'click'):
                    click_rank = int(row['doc_id'][row['doc_id'].find('_')+1:])
                    click_url = row['doc_url']
              
                # Check if page response for this url has been submitted.
                j = i+1
                while (j < len(rows)) and (not (rows[j]['type'] == 'results')):
                    if rows[j]['type'] == 'page_response':
                        if (rows[j]['doc_url'] in click_url) or\
                        editdistance.eval(click_url, rows[j]['doc_url']) < 20:
                            found = True
                            end_time = rows[j]['time']
                            break
                    if rows[j]['type'] == 'event' and 'tap' not in rows[j]['event_type']:
                        found = True
                        end_time = rows[j]['time']
                        break
                        
                    if rows[j]['type'] == 'task_response':
                        # user did not provide page or serp feedback :(
                        found = True
                        end_time = rows[j]['time']
                        break
                    if found :
                        break
                    j+=1
                    
                if found and end_time:
                    if click_url not in recorded_clicks:
                        recorded_clicks[click_url] ={'rank':None, 'type':None,'time':[]}
                    recorded_clicks[click_url]['time'].append((end_time-start_time).total_seconds())
                    recorded_clicks[click_url]['rank']= click_rank
                    recorded_clicks[click_url]['type']= vert_type
                    serp_clicks = len(recorded_clicks)
                else:
                    print 'Cannot find in responses', click_url,\
                        row['user_id'], row['task_id'], row['type']
        ''' 
        vtype = None
        for curl, stats in recorded_clicks.items():
            vtype = stats['type']
            if stats['rank'] < 5:
                vertical_stats[vtype]['pos_dwell'][stats['rank']].append(min(stats['time']))
                vertical_stats[vtype]['clicks'][stats['rank']]+=1
            if stats['rank'] == 0:
                vertical_stats[vtype]['on_dwell'].append(min(stats['time']))
                vertical_stats[vtype]['on_count']+=1.0
            else:
                vertical_stats[vtype]['off_dwell'].append(min(stats['time']))
                vertical_stats[vtype]['off_count']+=1.0
            if vtype and serp_clicks > 0:
              vertical_stats[vtype]['all_clicks'].append(serp_clicks)
        '''

    for vertical, val_dict in vertical_stats.items():
        print vertical, 'on-dwell',np.mean(val_dict['on_dwell']),\
        np.std(val_dict['on_dwell']),'off-dwell', np.mean(val_dict['off_dwell']), \
        np.std(val_dict['off_dwell']), val_dict['on_count'],\
        val_dict['off_count']
 
    verticals = ['i','o','w','v']
    for i in range(len(verticals)):
      v1 = verticals[i]
      for j in range(i+1, len(verticals)):
        v2 = verticals[j]
        for attribute in ['on_dwell','off_dwell', 'all_clicks']:
          print 'Kruskal-Wallis',v1, v2,attribute, \
          kruskalwallis(vertical_stats[v1][attribute], \
          vertical_stats[v2][attribute])


    for vert_type, stats in vertical_stats.items():
        for pos , array in stats['pos_dwell'].items():
            print 'Kruskal-Wallis pos dwell ',vert_type, pos, np.median(array), \
            kruskalwallis(array,vertical_stats['o']['pos_dwell'][pos])
        for pos in stats['clicks'].keys():
            vertical_stats[vert_type]['clicks'][pos]/= (stats['on_count']+\
                stats['off_count']) 

    PlotClickDist(vertical_stats)
def FindPageMetricsPerVertical(result_table, page_table):

    # Concat result and page tables
    result_table['type'] = 'results'
    page_table['type'] = 'page_response'
    concat_table = pd.concat([result_table, page_table], ignore_index = True)
    concat_table.to_csv('concat_result_page',encoding='utf-8', index = False)

    # Group by user_id and task_id and sort by time within each group
    grouped_table = concat_table.sort(['time']).groupby(['user_id','task_id'])
    # Set vertical type for each serp. 
    vert_type = None
    concat_table['first_result_type'] = ''

    # Iterate over all users and tasks
    for name, group in grouped_table:

        # Iterate over all page response results
        # for a specific user and a task
        for pindex, prow in group.iterrows():

            if prow['type'] == 'results' and prow['doc_pos'] == 0:
                vert_type = prow['doc_type']
            # Skip if the row is not a page_response or is an invalid
            # page; SERP pages are invalid because they carry no doc_pos.
            if prow['type'] != 'page_response' or IsSerpPage(prow['doc_url']):
                continue

            ptime = prow['time']
            purl = prow['doc_url']
            ppos = -1

            # Find the doc_pos from result entry
            for rindex, rrow in group.iterrows():

                # Get the doc_pos from the result entry
                # whose url matches with the page response url
                if rrow['type'] == 'results' and (purl in rrow['doc_url']):
                    ppos = rrow['doc_pos']

                # Search only those result entries whose
                # timestamp is earlier than the page response time
                if rrow['time'] > ptime:
                    break

            # ppos == -1 means we did not find a match
            # TODO: handle ppos=-1 case
            #prow['doc_pos'] = ppos
            concat_table.set_value(pindex,'doc_pos',ppos)
            concat_table.set_value(pindex,'first_result_type',vert_type)
    
    # Filter rows with page responses. 
    page_responses = concat_table[concat_table['type'] == 'page_response']
    page_responses = page_responses[page_responses['first_result_type'].str.len() == 1]
    first_rel_group = page_responses[page_responses['doc_pos'] ==0].groupby(\
            ['first_result_type', 'response_type'])
    last_rel_group = page_responses[page_responses['doc_pos']>0].groupby(\
            ['first_result_type','response_type'])

    print page_responses[page_responses['doc_pos'] ==0].groupby(\
            ['first_result_type', 'response_type']).agg({
                # Find the mean and std rel and satisfaction.
                'response_value' : {
                    'mean': 'mean',
                    'std-dev' : 'std',
                    'count' : 'count'
                }
            })
    print page_responses[page_responses['doc_pos']>0].groupby(\
            ['first_result_type','response_type']).agg({
                # Find the mean and std rel and satisfaction.
                'response_value' : {
                    'mean': 'mean',
                    'std-dev' : 'std',
                    'count' : 'count'
                }
            })

    verticals = ['i','o','w','v']
    for i in range(len(verticals)):
      v1 = verticals[i]
      for j in range(i, len(verticals)):
        v2 = verticals[j]
        for attribute in ['relevance','satisfaction']:
          print 'Kruskal-Wallis',v1, v2,attribute,'first_rank', \
          kruskalwallis(first_rel_group.get_group((v1,attribute))['response_value'],\
                  first_rel_group.get_group((v2, attribute))['response_value'])
          print 'Kruskal-Wallis', v1, v2, attribute, 'rel_off_rank',\
          kruskalwallis(last_rel_group.get_group((v1,attribute))['response_value'],\
                  last_rel_group.get_group((v2,attribute))['response_value'])

    print 'Pearson sat vs rel, first rank, v',\
    pearsonr(first_rel_group.get_group(('v','satisfaction'))['response_value'],\
                  first_rel_group.get_group(('v','relevance'))['response_value'])
    print 'Pearson sat vs rel, first rank, w',\
    pearsonr(first_rel_group.get_group(('w','satisfaction'))['response_value'],\
                  first_rel_group.get_group(('w','relevance'))['response_value'])

    print 'Kendall tau sat vs rel, first rank, o',\
    kendalltau(first_rel_group.get_group(('o','satisfaction'))['response_value'],\
                  first_rel_group.get_group(('o','relevance'))['response_value'])

    # Find the variation in page satisfaction and relevance for 
    # each position per vertical. 
    rank_level_rel_and_sat = page_responses[page_responses['doc_pos']< 3].groupby(\
            ['first_result_type', 'response_type', 'doc_pos']).agg({
                # Find the mean and std rel and satisfaction.
                'response_value' : {
                    'mean': 'mean',
                    'std-dev' : 'std',
                    'count' : 'count'
                }
            })
    rank_level_rel_and_sat.reset_index().to_csv('vert_level_pos_level_rel_and_sat.csv', index=False)

    PlotSatAndRelBoxPlotPerVertical(first_rel_group, 'Page Response',\
        'rel_sat_first_pos.png')
def FindVisiblityMetricsPerVertical(result_table,vis_event_table):
    concat_table = pd.concat([result_table, vis_event_table], ignore_index = True)

    # Initialize visibility metric:
    # stores the number of sessions in which
    # each card was visible
    visibility = {}
    visibility['i'] = np.zeros(10)
    visibility['v'] = np.zeros(10)
    visibility['w'] = np.zeros(10)
    visibility['o'] = np.zeros(10)

    # Initialize time metric
    # Stores the total time 
    # for which each card was visible
    visible_time = {}
    for vert in ['i', 'v', 'w', 'o']:
        visible_time[vert] = {pos: [] for pos in range(10)}

    grouped_table = concat_table.groupby(['user_id'])
    
    for name, group in grouped_table:
        group = group.sort('time')

        # Top vertical in the session
        top_vert = None

        # time of the previous event in the session
        prev_time = None

        # card visibility for the session
        # 1: visible 0: invisible
        card_vis = np.zeros(10)
    
        # card status and time for the session
        # card_status stores the time a card became visible, or None if it is invisible
        # card_time stores time in seconds
        card_status = 10*[None]
        card_time = np.zeros(10)

        # Process sessions for this user
        for index, row in group.iterrows():

            # Row type 'result' indicates the start of a new session
            if row['type'] == 'results':
                
                # Save results of a previous session
                if top_vert is not None:
                    visibility[top_vert] = visibility[top_vert] + card_vis
                    card_vis = np.zeros(10)

                    # Compute the time for cards that were visible 
                    # at the end of the previous session
                    for cid in range(0,10):
                        if card_status[cid] is not None:
                            time_diff = (row['time']-card_status[cid]).total_seconds()

                            if time_diff < MAX_CARD_DWELL_TIME:
                                card_time[cid] = card_time[cid] + time_diff
                            else:
                                time_diff = (prev_time-card_status[cid]).total_seconds()
                                if time_diff < MAX_CARD_DWELL_TIME:
                                    card_time[cid] = card_time[cid] + time_diff
                                else:
                                    # setting the dwell time to default
                                    card_time[cid] = card_time[cid] + DEFAULT_CARD_DWELL_TIME

                        visible_time[top_vert][cid].append(card_time[cid])

                    card_status = 10*[None]
                    card_time = np.zeros(10)
                    
                top_vert = row['doc_type']

            # Otherwise it is the event row
            # Update stats of the current session
            else:
                card_vis = UpdateCardVisibility(row['event_value'],card_vis)
                card_status, card_time = UpdateCardTime(row['event_value'],card_status,card_time,row['time'])

            prev_time = row['time']

        # Save results of the last session of this user
        if top_vert is not None:
            visibility[top_vert] = visibility[top_vert] + card_vis

            # This is the last session of this user
            # so we do not have enough information
            # to compute dwell time for the cards
            # that were visible at the end of the session
            # We simply add default card dwell time
            for cid in range(0,10):
                if card_status[cid] is not None:
                    card_time[cid] = card_time[cid] + DEFAULT_CARD_DWELL_TIME
                visible_time[top_vert][cid].append(card_time[cid])


    verticals = visible_time.keys()
    for vertical in verticals:
      print vertical ,visibility[vertical]

    for vertical in verticals:
      print vertical,' '.join([str(round(sum(card_times)/visibility[vertical][0],3))\
                      for card_times in visible_time[vertical].values()])

    for vertical in verticals:
      print vertical,' '.join([str(round(np.median(card_times),3))\
                     for card_times in visible_time[vertical].values()])

    for vertical in verticals:
      print 'Median time',vertical, np.median(visible_time[vertical][0])

    for i in range(len(verticals)):
      v1 = verticals[i]
      for j in range(i, len(verticals)):
        v2 = verticals[j]
        for pos in range(4):
            print 'Kruskal-Wallis visibility time', v1, v2, pos,\
            kruskalwallis(visible_time[v1][pos],visible_time[v2][pos])

    PlotMultipleBoxPlotsPerVertical(visible_time, [1,2,3,4,5],'Document Positions',\
                                  'Viewport Time (sec)','','view_port_time.png')
Exemple #44
                             100.0, 98.507462686567166, 98.507462686567166, 97.761194029850742, 97.761194029850742],\
                            [90.370370370370367, 97.037037037037038, 94.074074074074076, 95.555555555555557, 91.111111111111114,\
                             95.555555555555557, 97.037037037037038, 97.777777777777771, 95.555555555555557, 94.81481481481481],\
                            [93.814432989690715, 91.75257731958763, 93.814432989690715, 93.814432989690715, 93.814432989690715,\
                             89.69072164948453, 96.907216494845358, 92.783505154639172, 95.876288659793815, 98.969072164948457]\
                        ])


############### Tool's MAX.ACC PER CORPUS #################
#max_accs_array = np.array([ [ 95.26, 95.56, 95.70, 95.18, 95.85, 98.13, 94.88, 94.12 ],\
#                            [ 91.53, 90.84, 89.16, 90.23, 91.03, 91.99, 87.09, 90.81 ],\
#                            [ 91.47, 92.00, 91.73, 92.27, 90.67, 87.43, 88.27, 91.23 ],\
#                            [ 92.00, 93.60, 92.27, 93.33, 92.27, 95.27, 89.33, 92.57 ],\
#                         ])
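# 'spm' is presumably scipy.stats.mstats (its import is not shown in this
# excerpt); mstats provides both kruskalwallis and mannwhitneyu.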

print spm.kruskalwallis(max_accs_array[0],max_accs_array[1],max_accs_array[2],max_accs_array[3],\
                        max_accs_array[4],max_accs_array[6],max_accs_array[7])
print spm.kruskalwallis(max_accs_array[5],max_accs_array[2])
#print sps.wilcoxon(max_accs_array[1],max_accs_array[4])
print spm.mannwhitneyu(max_accs_array[5],max_accs_array[2]) #, use_continuity=True)
0/0  # intentional divide-by-zero to halt the script here

#print max_accs_array

#import matplotlib.pyplot as plt
#plt.hist(max_accs_array[1], 10) #bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, hold))
#plt.figure()
#plt.plot([1,2,3,4,5,6,7,8,9,10],max_accs_array[7] ) #, histtype='bar', rwidth=0.8)
#plt.show()

print max_accs_array, "\n"
Exemple #45
print 'AUC: ' + str(auc)


from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy.stats.mstats import kruskalwallis, friedmanchisquare

Array = ['NL','BPH','HGPIN','G3','G4','G5']
multiarea = magi_area['NL'].append(magi_area['BPH'])
multiarea = multiarea.append(magi_area['HGPIN'])
multiarea = multiarea.append(magi_area['G3'])
multiarea = multiarea.append(magi_area['G4'])
multiarea = multiarea.append(magi_area['G5'])
multiarea = multiarea.dropna()
multilesion = list()
for a, column in enumerate(Array):
    coldata = magi_area[column].dropna()
    multilesion.extend([a] * len(coldata))

print pairwise_tukeyhsd(multiarea, multilesion) 
print kruskalwallis(magi_area['NL'].dropna(), magi_area['BPH'].dropna(), magi_area['HGPIN'].dropna(),
    magi_area['G3'].dropna(), magi_area['G4'].dropna(), magi_area['G5'].dropna())

print kruskalwallis(magi_stain['NL'].dropna(), magi_stain['BPH'].dropna(), magi_stain['HGPIN'].dropna(),
    magi_stain['G3'].dropna(), magi_stain['G4'].dropna(), magi_stain['G5'].dropna())

def ClusterAssociations(Raw, Symbols, Types, Labels, Tau=0.05):
    """
    Examines associations between cluster assignments of samples and
    copy-number and mutation events.

    Parameters
    ----------
    Raw : array_like
    Numpy array containing raw, unnormalized feature values. These are used to
    examine associations between feature values and cluster assignments.
    Features are in columns and samples are in rows.

    Symbols : array_like
    List containing strings describing features. See Notes below for
    restrictions on symbol names.

    Types: array_like
    List containing strings describing feature types (e.g. CNV, Mut, Clinical).
    See notes on allowed values of Types below.

    Labels : array_like
    Cluster labels for the samples in 'Raw'.

    Tau : scalar
    Threshold for statistical significance when examining cluster associations.

    Returns
    -------
    Significant : array_like
    List of copy number and mutation features from 'Raw' that are significantly
    associated with the clustering 'Labels'.

    SigTypes : array_like
    List of types for significant features.

    Notes
    -----
    Types like 'Mut' and 'CNV' that are generated as suffixes to feature names
    by the package tcgaintegrator are required for analysis.

    See Also
    --------
    RiskCohort, RiskCluster
    """

    # initialize list of symbols with significant associations and their types
    Significant = []
    SigTypes = []

    # identify mutations and CNVs
    Mutations = [index for index, tpe in enumerate(Types) if tpe == "Mut"]
    CNVs = [index for index, tpe in enumerate(Types) if tpe == "CNV"]

    # test mutation associations
    for i in np.arange(len(Mutations)):

        # build contingency table - expected and observed
        Observed = np.zeros((2, np.max(Labels)))
        for j in np.arange(1, np.max(Labels) + 1):
            Observed[0, j - 1] = np.sum(Raw[Labels == j, Mutations[i]] == 0)
            Observed[1, j - 1] = np.sum(Raw[Labels == j, Mutations[i]] == 1)
        RowSum = np.sum(Observed, axis=0)
        ColSum = np.sum(Observed, axis=1)
        Expected = np.outer(ColSum, RowSum) / np.sum(Observed.flatten())

        # perform test
        stat, p = chisquare(Observed, Expected, ddof=1, axis=None)
        if p < Tau:
            Significant.append(Symbols[Mutations[i]])
            SigTypes.append(Types[Mutations[i]])

    # copy number associations
    for i in np.arange(len(CNVs)):

        # separate out CNV values by cluster and perform the test;
        # kruskalwallis accepts each group as a separate positional
        # argument, so unpacking a per-cluster list handles any number
        # of clusters
        Groups = [Raw[Labels == j, CNVs[i]]
                  for j in np.arange(1, np.max(Labels) + 1)]
        stat, p = kruskalwallis(*Groups)
        if p < Tau:
            Significant.append(Symbols[CNVs[i]])
            SigTypes.append(Types[CNVs[i]])

    # return names of features with significant associations
    return Significant, SigTypes
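
# A minimal usage sketch for ClusterAssociations with synthetic data; the
# feature names and values below are invented for illustration, and numpy,
# scipy's chisquare and kruskalwallis are assumed to be in scope as in the
# original module.
import numpy as np

np.random.seed(0)
Labels = np.random.randint(1, 3, size=40)         # two clusters, labeled 1 and 2
Mut = (np.random.rand(40) < 0.3).astype(float)    # binary mutation calls
CNV = np.random.randn(40) + 0.5 * (Labels == 2)   # copy number, shifted in cluster 2
Raw = np.column_stack([Mut, CNV])
Significant, SigTypes = ClusterAssociations(Raw, ['TP53_Mut', 'EGFR_CNV'],
                                            ['Mut', 'CNV'], Labels, Tau=0.05)
print(Significant)
print(SigTypes)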
Exemple #47
# output = open(args["index"], "w")

# imports implied by this excerpt ('cd' and 'dataset' are defined
# earlier in the original script):
import glob

import cv2
from scipy.stats.mstats import kruskalwallis

kw = []

# loop over images
# for imagePath in glob.glob(args["dataset"] + "/*.jpg"):
for imagePath in glob.glob(dataset + "/*.jpg"):

	imageID = imagePath[imagePath.rfind("/") + 1:]
	image = cv2.imread(imagePath)

	# describe image
	features = cd.describe(image)
	

	temp = kruskalwallis(features)
	kw.append((imageID, temp[0], temp[1]))

	# kw = [str(f) for f in kw]
	# output.write("%s, %s\n" % (imageID, ",".join(kw)))	
	# features = [str(f) for f in features]
	# output.write("%s, %s\n" % (imageID, ",".join(features)))

# output = open(args["index"], "w")

# # order by 
# kw = sorted(kw, key=lambda l:l[1])

# with open(args["index"], 'wb') as f:
# 	wr = csv.writer(f)
# 	wr.writerows(kw)
def ComputePreClickDistributions(merged_table):
  # For each session, record the time to the first click and the
  # maximum result rank visible before it.
  first_click_time_and_pos = {'i':[], 'v':[], 'w':[], 'o':[]}

  grouped_table = merged_table.groupby(['user_id','task_id'])
  for name, group in grouped_table:
      group = group.sort('time')
      first_click_pos = None
      first_click_time = None
      first_result_type = None
      first_event_time = None
      last_result_before_click = None
      for index, row in group.iterrows():
        # Get the time and max visible result pos for each
        # vertical.
        if row['type'] == 'results':
          if first_click_time and last_result_before_click > -1 \
              and first_click_pos > -1 and first_result_type:
            first_click_time_and_pos[first_result_type].append(\
                (first_click_time, first_click_pos, last_result_before_click))
          
          first_click_pos = None
          first_click_time = None
          first_event_time = None
          first_result_type = None
          last_result_before_click = None
          
          first_result_type = row['doc_type']

        if row['type'] == 'event' and (first_click_pos is None):
          # Record time of first event.
          if not first_event_time:
            first_event_time = row['time']
          # Find the maximum result visible. 
          if len(row['visible_elements']) > 0:
            for entry in row['visible_elements'].split():
              last_result_before_click = max(last_result_before_click,\
                                        int(entry[entry.rfind('_')+1:]))
        
          event_type = row['event_type']
          if 'tap' in event_type and (first_click_pos is None):
            # set the time. 
            first_click_time = (row['time'] - first_event_time).total_seconds()
            first_click_pos = int(row['element'])
            
        if (row['type'] == 'click') and (first_click_pos is None):
          first_click_time = (row['time'] - first_event_time).total_seconds()
          first_click_pos = int(row['doc_id'][row['doc_id'].rfind('_')+1:])

      if first_click_time and last_result_before_click > -1 \
          and first_click_pos > -1 and first_result_type:
        first_click_time_and_pos[first_result_type].append(\
            (first_click_time, first_click_pos, last_result_before_click))

  # Scatter plots did not reveal any significant differences
  # between verticals.

  # Plot the following scatter plots:
  # 1. Time to rank viewed. 
  scatter1 = {}
  last_viewed_result = {}
  for result_type, time_and_pos_array in first_click_time_and_pos.items():
    # format vert_type : {pos : [time list (sec)]}
    scatter1[result_type] ={}
    last_viewed_result[result_type] =[]
    # Sort by last viewed result (tuple format: time, pos, last_viewed_rank)
    sorted_tuple_by_view_rank = sorted(time_and_pos_array, key = lambda x : x[2])
    for sorted_tuple in sorted_tuple_by_view_rank:
      click_rank = sorted_tuple[1] +1
      view_rank = sorted_tuple[2] +1
      last_viewed_result[result_type].append(view_rank)
      if click_rank not in scatter1[result_type]:
        scatter1[result_type][click_rank] = []
      # Time should be less than 100 seconds (not enforced here)
      scatter1[result_type][click_rank].append(sorted_tuple[0])
        
  for vert, dictionary in scatter1.items():
      print 'Kruskal-Wallis', vert, kruskalwallis(last_viewed_result[vert],last_viewed_result['o'])
      for rank, array in dictionary.items():
          if rank in scatter1['o']:
              print 'Kruskal-Wallis', vert,rank, kruskalwallis(array,scatter1['o'][rank])

  
  PlotVerticalLevelAttributeBoxPlot(last_viewed_result,'', 11, ['Lowest Examined Snippet'],\
      'Snippet Rank', '', 'last_viewed_snippet.png')
            # Determine the settings from the filename
            problem, dup, ordering, nodes, mut, seed = base.split('_')
            with open_file_method(filename)(filename, 'r') as f:
                data = json.load(f)
            version = dup, ordering, nodes, mut
            if (dup, ordering) == ('skip', 'normal'):
                control_group = version
            statify[version].append(data[1]['evals'])
            active[version].append(data[1]['phenotype'])
            best = data[1]['bests'][-1]
            test = data[1]['test_inputs']
            individual = Individual.reconstruct_individual(best, test)
            simplified = individual.new(Individual.simplify)
            reduced[version].append(len(simplified.active))
            filecount += 1
        except ValueError:
            print filename, "FAILED"

    # Kruskal-Wallis requires a rectangular matrix
    rect = make_rectangular(statify.values(), 10000001)

    print 'Files Successfully Loaded', filecount
    print 'Kruskal Wallis', kruskalwallis(rect)
    for version, data in statify.iteritems():
        print '--------- %s ---------' % str(version)
        print "MES, MAD", median_deviation(data)
        print 'Active', median_deviation(active[version])
        print 'Reduced', median_deviation(reduced[version])
        print 'Mann Whitney U against Control',
        print mannwhitneyu(statify[control_group], data)
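
    # make_rectangular (used above) is not shown in this excerpt; a minimal
    # sketch, assuming it pads ragged rows with the sentinel fill value
    # passed in (10000001 above) so every row has equal length:
    def make_rectangular(rows, fill):
        width = max(len(r) for r in rows)
        return [list(r) + [fill] * (width - len(r)) for r in rows]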
def uw_tier_histplots():
    sample['Underwriter Tier'] = sample['lead_underwriter_tier']
    sample['IPO Duration'] = sample['IPO_duration']
    ranks = ["-1", "0+", "7+", "9"]

    def uw_tier_duration(x):
        return sample[sample.lead_underwriter_tier==x]['IPO_duration']
    kwstat = kruskalwallis(*[uw_tier_duration(x) for x in ranks])

    # g = sb.FacetGrid(sample,
    #                 row="Underwriter Tier",
    #                 hue="Underwriter Tier",
    #                 palette=cp_four("cool_r"),
    #                 size=2, aspect=4,
    #                 hue_order=ranks, row_order=ranks,
    #                 legend=ranks, xlim=(0,1095))
    # g.map(sb.distplot, "IPO Duration")
    # plt.savefig("IPO_tiers_KP_survival.pdf", format='pdf', dpi=200)


    from lifelines.estimation import KaplanMeierFitter
    from lifelines.statistics import logrank_test
    import matplotlib.pyplot as plt

    ranks = ["-1", "0+", "7+", "9"]
    ranklabels = ['No Underwriter', 'Low Rank', 'Mid Rank', 'Rank 9 (elite)']
    kmf = KaplanMeierFitter()

    # Success
    f, ax = plt.subplots(1,1,figsize=(12, 4), sharex=True)
    T = 1 # annotation line thickness

    for rank, rlabel, color in zip(ranks, ranklabels, cp_four("cool_r")):
        uw = sample[sample.lead_underwriter_tier==rank]

        kmf.fit(uw['IPO_duration'],
                label='{} N={}'.format(rlabel, len(uw)),
                alpha=0.9)
        kmf.plot(ax=ax, c=color, alpha=0.7)

        quartiles = [int(np.percentile(kmf.durations, x)) for x in [25, 50, 75]][::-1]
        aprops = dict(facecolor=color, width=T, headwidth=T)

        if rank=="-1":
            plt.annotate("75%: {} days".format(quartiles[0]),
                        (quartiles[0], 0.25),
                        xytext=(quartiles[0]+145, 0.25+.04),
                        arrowprops=aprops)

            plt.annotate("50%: {} days".format(quartiles[1]),
                        (quartiles[1], 0.50),
                        xytext=(quartiles[1]+145, 0.50+.04),
                        arrowprops=aprops)

            plt.annotate("25%: {} days".format(quartiles[2]),
                        (quartiles[2], 0.75),
                        xytext=(quartiles[2]+145, 0.75+0.04),
                        arrowprops=aprops)
        elif rank=="9":
            plt.annotate("75%: {} days".format(quartiles[0]),
                        (quartiles[0], 0.25),
                        xytext=(quartiles[0]+415, 0.25+.1),
                        arrowprops=aprops)

            plt.annotate("50%: {} days".format(quartiles[1]),
                        (quartiles[1], 0.50),
                        xytext=(quartiles[1]+290, 0.50+.1),
                        arrowprops=aprops)

            plt.annotate("25%: {} days".format(quartiles[2]),
                        (quartiles[2], 0.75),
                        xytext=(quartiles[2]+165, 0.75+0.1),
                        arrowprops=aprops)

    plt.annotate("Kruskal-Wallis\nH: {:.3f}\nprob: {:.3f}".format(*kwstat),
                (960, 0.1))
    plt.ylim(0,1)
    plt.xlim(0,1095)
    plt.title("Kaplan-Meier survival times by bank tier")
    plt.xlabel("IPO Duration (days)")
    plt.ylabel(r"$S(t)=Pr(T>t)$")
    plt.savefig("IPO_tiers_KP_survival.pdf", format='pdf', dpi=200)
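
    # logrank_test is imported above but never used; a hypothetical follow-up
    # comparing the no-underwriter and elite tiers (not part of the original
    # analysis):
    d1 = sample[sample.lead_underwriter_tier == "-1"]['IPO_duration']
    d2 = sample[sample.lead_underwriter_tier == "9"]['IPO_duration']
    lr = logrank_test(d1, d2)
    print(lr.p_value)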
Exemple #51
#Plot the data:
my_colors = 'rgbkymc'  # red, green, blue, black, yellow, magenta, cyan

s.plot(
    kind='bar',
    color=my_colors,
)

plt.show()



from scipy.stats import mstats


Col_1 = [1,2,3]
Col_2 = [1000,20000,1000]
Col_3 = [100,203,109]
Col_4 = [1,3,5]

print("Kruskal Wallis H-test test:")

H, pval = mstats.kruskalwallis(Col_1, Col_2, Col_3, Col_4)

print("H-statistic:", H)
print("P-Value:", pval)

if pval < 0.05:
    print("Reject the null hypothesis - significant differences exist between groups.")
else:
    print("Fail to reject the null hypothesis - no significant difference between groups.")
Exemple #52
'''Example of a Kruskal-Wallis test (for not normally distributed data)

'''

'''
Author: Thomas Haslwanter
Date:   March-2013
Ver:    1.0
'''

from scipy.stats.mstats import kruskalwallis
from numpy import array

# And finally, give an example of the Kruskal-Wallis test
# Taken from http://www.brightstat.com/index.php?option=com_content&task=view&id=41&Itemid=1&limit=1&limitstart=2

# Get the data
city1 = array([68, 93, 123, 83, 108, 122])
city2 = array([119, 116, 101, 103, 113, 84])
city3 = array([70, 68, 54, 73, 81, 68])
city4 = array([61, 54, 59, 67, 59, 70])

# Perform the Kruskal-Wallis test
h, p = kruskalwallis(city1, city2, city3, city4)

# Print the results
if p<0.05:
    print('There is a significant difference between the cities.')
else:
    print('No significant difference between the cities.')
Exemple #53
stepsize = arg_map["stepsize"]
log("Computing Kruskal-Wallis test on windows of size %d and step %d" % (windowsize, stepsize))
testresults = []
for window in BSWindowGen(positions, windowsize, stepsize):

    covcheck = lambda c: c >= arg_map["mincov"]
    filt_pos = filter(lambda position: all(map(covcheck, map(baseCoverage, position.samples))), window.positions)

    if len(filt_pos) < arg_map["minwinsites"]:
        continue

    pos_by_sample = zip(*map(attrgetter("samples"), filt_pos))
    methyl_by_sample = map(lambda bases: map(methValue, bases), pos_by_sample)

    try:
        (h, p) = kruskalwallis(*methyl_by_sample)
    except Exception as e:
        sys.stderr.write("Error: %s \n" % e)
        continue

    testresults.append(KWResult(p, h, window))

log("Sorting Results")
testresults.sort()

log("Writing Output")
m = float(len(testresults))
q = arg_map["q"]
k = 0

for result in testresults:
Exemple #54
def KWtest(Matrixs, Words, WordLists, option='CustomP', Low=0.0, High=1.0):
    """
    Give the Kruskal-Wallis test result on the top words.
    :param Matrixs: every element is a group matrix of word counts; each row represents a segment.
    :param Words: all the words (Matrixs and Words are parallel)
    :param WordLists: a list of dictionaries mapping each word to its word count;
                        each dictionary represents the information inside one segment
    :param option: preset options for High and Low (see the documentation of High and Low)
                    1. using standard deviation to find outliers
                        TopStdE: only analyze the right outliers of words, determined by standard deviation
                                    (word frequency > average + 2 * Standard_Deviation)
                        MidStdE: only analyze the non-outliers of words, determined by standard deviation
                                    (average + 2 * Standard_Deviation > word frequency > average - 2 * Standard_Deviation)
                        LowStdE: only analyze the left outliers of words, determined by standard deviation
                                    (average - 2 * Standard_Deviation > word frequency)

                    2. using IQR to find outliers *THIS METHOD DOES NOT WORK WELL, BECAUSE THE DATA ARE USUALLY HIGHLY SKEWED*
                        TopIQR: only analyze the top outliers of words, determined by IQR
                                    (word frequency > median + 1.5 * IQR)
                        MidIQR: only analyze the non-outliers of words, determined by IQR
                                    (median + 1.5 * IQR > word frequency > median - 1.5 * IQR)
                        LowIQR: only analyze the left outliers of words, determined by IQR
                                    (median - 1.5 * IQR > word frequency)
    :param Low: only analyze words with frequency higher than this value
                    (this parameter is overwritten if the option is not 'Custom')
    :param High: only analyze words with frequency lower than this value
                    (this parameter is overwritten if the option is not 'Custom')

    :return:
          a sorted list of (word, p-value) tuples, ordered by p-value
    """
    # begin handle options
    MergeList = merge_list(WordLists)
    TotalWordCount = sum(MergeList.values())
    NumWord = len(MergeList)

    High, Low = wordfilter(option, Low, High, NumWord, TotalWordCount, MergeList)
    # end handle options

    Len = max(len(matrix) for matrix in Matrixs)
    # the length of the longest sample set (shorter sets are padded and masked to this length)

    word_pvalue_dict = {}  # maps each word to its p-value

    for i in range(1, len(Matrixs[0][0])):  # focusing on a specific word
        word = Words[i - 1]
        if word not in MergeList:
            continue
        if Low < MergeList[word] < High:
            samples = []
            for k in range(len(Matrixs)):  # focusing on a group
                sample = []
                for j in range(len(Matrixs[k])):  # focusing on all the segment of that group
                    # add the sample into the sample list
                    sample.append(Matrixs[k][j][i])

                # combine all the samples of each sample list
                # turn the short ones masked so that all the sample set has the same length
                samples.append(ma.masked_array(sample + [0] * (Len - len(sample)),
                                               mask=[0] * len(sample) + [1] * (Len - len(sample))))

            # do the KW test
            try:
                pvalue = kruskalwallis(samples)[1]
            except ValueError as error:
                if error.args[0] == 'All numbers are identical in kruskal':  # get the argument of the error
                    pvalue = 'Invalid'
                else:
                    raise ValueError(error)

            # put the result in the dict
            word_pvalue_dict.update({word.decode('utf-8'): pvalue})
    return sorted(word_pvalue_dict.items(), key=itemgetter(1))
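
# The padding-and-masking trick used above, in miniature: unequal-length
# groups are padded to a common length and the padding is masked, so
# scipy.stats.mstats.kruskalwallis can take the whole set as one 2D masked
# array. A standalone sketch with toy numbers (not data from the project):
import numpy.ma as ma
from scipy.stats.mstats import kruskalwallis

groups = [[68, 93, 123], [119, 116], [70, 68, 54, 73]]
Len = max(len(g) for g in groups)
samples = [ma.masked_array(g + [0] * (Len - len(g)),
                           mask=[0] * len(g) + [1] * (Len - len(g)))
           for g in groups]
h, p = kruskalwallis(samples)
print(p)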
    def classifiers(self):
        # Get performance data.  A less busy person would put this in its own function.
        if not self.perfsCalculated:
            self.calculatePerformances()

        accels = map(lambda x: x["averageAccel"], self.novicePerfs + self.intermediatePerfs + self.expertPerfs)
        durations = map(lambda x: x["duration"], self.novicePerfs + self.intermediatePerfs + self.expertPerfs)
        smooths = map(lambda x: x["motionSmoothness"], self.novicePerfs + self.intermediatePerfs + self.expertPerfs)
        distances = map(lambda x: x["angularDist"], self.novicePerfs + self.intermediatePerfs + self.expertPerfs)
        ambidexterities = map(
            lambda x: x["ambidextricity"][0], self.novicePerfs + self.intermediatePerfs + self.expertPerfs
        )

        # Make feature and target vectors
        X = np.array(zip(accels, durations, smooths, distances, ambidexterities))
        y = np.array(
            [0] * 10 + [0.25, 1.0 / 3, 1.0 / 3, 0.5, 0.5, 0.5, 2, 2, 2, 2] + [6, 6, 6, 8, 8, 8, 10, 10, 10, 6]
        )  # Experience levels in years
        # y = np.array([0]*10 + [1]*10 + [2]*10)

        """
		paired = zip(X,y)
		shuffle(paired)
		X,y = map(np.array, zip(*paired))
		"""

        def resolveClass(y):
            if y <= 0.1:
                return "r"
            if y <= 2:
                return "g"
            else:
                return "b"

        # Cross validation

        all_colours = []
        all_regressions = []
        indexes = []
        skf = cross_validation.StratifiedKFold(map(resolveClass, y), n_folds=10)
        for train_index, test_index in skf:
            indexes.extend(test_index)

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            linearRegressor = LinearRegression(normalize=True).fit(X_train, y_train)
            regressions = map(linearRegressor.decision_function, X_test)
            class_colours = map(resolveClass, y_test)

            all_regressions.extend(regressions)
            all_colours.extend(class_colours)

            print linearRegressor.intercept_, linearRegressor.coef_
            """
			# Plot
			classes = zip(regressions, class_colours)
			plt.figure()
			plt.title("Least Squares Fitted Performace")
			plt.xlabel("Trials")
			plt.ylabel("Score")
			zippedAndSorted = sorted(zip(regressions,class_colours))
			unzipped = zip(*zippedAndSorted)
			plt.scatter(range(len(unzipped[0])), unzipped[0], c=unzipped[1], s=60)
			nov = plt.scatter([], [], color='r')
			inter = plt.scatter([], [], color='g')
			exp = plt.scatter([], [], color='b')
			plt.legend((nov,inter,exp),['Novice','Intermediate','Expert'], loc=2)
			plt.xticks( [] )
			#plt.gca().set_xlim(-1,15)
			#plt.gca().set_ylim(-4,6)
			plt.tight_layout()
			plt.show()
			"""

            # Make test and training set
            # X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,y,test_size=0.5)
            # Fit linear regressor to weights
            # linearFit = LinearRegression().fit(X_train,y_train)
            # regressions = map( lambda x: linearFit.intercept_ + sum(np.array(x) * linearFit.coef_), X_test)

            # colours = map( resolveClass, y_test )

        classes = zip(all_regressions, all_colours)

        anova_perfs = kruskalwallis(
            [t[0] for t in classes if t[1] == "r"],
            [t[0] for t in classes if t[1] == "g"],
            [t[0] for t in classes if t[1] == "b"],
        )

        fig = plt.figure()
        ax = fig.add_subplot(111)
        plt.title("Linear Performance Score")
        plt.xlabel("Trials")
        plt.ylabel("Score")

        # Sort by value
        all_regressions, all_colours = zip(*sorted(zip(all_regressions, all_colours)))

        # Plot each experience level with different markers and colours
        nov_data = [(i, d) for (i, d, c) in zip(range(len(all_regressions)), all_regressions, all_colours) if c == "r"]
        inter_data = [
            (i, d) for (i, d, c) in zip(range(len(all_regressions)), all_regressions, all_colours) if c == "g"
        ]
        exp_data = [(i, d) for (i, d, c) in zip(range(len(all_regressions)), all_regressions, all_colours) if c == "b"]
        nov = ax.scatter(zip(*nov_data)[0], zip(*nov_data)[1], color="r", marker="o", s=60)
        inter = ax.scatter(zip(*inter_data)[0], zip(*inter_data)[1], color="g", marker="^", s=60)
        exp = ax.scatter(zip(*exp_data)[0], zip(*exp_data)[1], color="b", marker="*", s=60)

        plt.legend((nov, inter, exp), ["Novice", "Intermediate", "Expert"], loc=2)
        plt.xticks([])
        # plt.gca().set_xlim(-1,15)
        # plt.gca().set_ylim(-4,6)
        plt.tight_layout()
        with open("/Users/robertevans/repos/minf/keyhole_graphs/linFit.png", "w") as figOut:
            plt.savefig(figOut)

        plt.figure()
        plt.title("Kruskal-Wallis p-value: {0:.3g}".format(anova_perfs[1]))
        plt.ylabel("Score")
        plt.boxplot(
            [
                [t[0] for t in classes if t[1] == "r"],
                [t[0] for t in classes if t[1] == "g"],
                [t[0] for t in classes if t[1] == "b"],
            ]
        )
        plt.xticks(range(1, 4), ("Novices", "Intermediates", "Experts"))
        # plt.gca().set_ylim(0,9)
        plt.tight_layout()
        with open("/Users/robertevans/repos/minf/keyhole_graphs/linFit_box.png", "w") as figOut:
            plt.savefig(figOut)

        plt.show()
Exemple #58
            # Determine the settings from the filename
            problem, dup, ordering, nodes, mut, seed = base.split('_')
            with open_file_method(filename)(filename, 'r') as f:
                data = json.load(f)
            version = dup, ordering, nodes, mut
            if (dup, ordering) == ('skip', 'normal'):
                control_group = version
            statify[version].append(data[1]['evals'])
            active[version].append(data[1]['phenotype'])
            best = data[1]['bests'][-1]
            test = data[1]['test_inputs']
            individual = Individual.reconstruct_individual(best, test)
            simplified = individual.new(Individual.simplify)
            reduced[version].append(len(simplified.active))
            filecount += 1
        except ValueError:
            print(filename, "FAILED")

    # Kruskal-Wallis requires a rectangular matrix
    rect = make_rectangular(list(statify.values()), 10000001)

    print('Files Successfully Loaded', filecount)
    print('Kruskal Wallis', kruskalwallis(rect))
    for version, data in statify.items():
        print('--------- %s ---------' % str(version))
        print("MES, MAD", median_deviation(data))
        print('Active', median_deviation(active[version]))
        print('Reduced', median_deviation(reduced[version]))
        print('Mann Whitney U against Control', end=' ')
        print(mannwhitneyu(statify[control_group], data))
def kruskal_wallis(reduced_dataframe, populations_prefix='',
                   populations=['AFR', 'AMR', 'EAS', 'SAS', 'NFE', 'FIN']):
    """Calculate H-statistic and p-value using the Kruskal-Wallis test (non-parametric ANOVA)."""
    populations_data = [reduced_dataframe[populations_prefix + p] for p in populations]
    return kruskalwallis(populations_data)
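
# A hypothetical call to kruskal_wallis: each population column holds the
# per-row values to compare (column names follow the defaults above; the
# numbers are invented for illustration).
import pandas as pd

toy = pd.DataFrame({
    'AFR': [0.10, 0.22, 0.31, 0.18],
    'AMR': [0.20, 0.25, 0.40, 0.33],
    'EAS': [0.15, 0.12, 0.28, 0.19],
    'SAS': [0.21, 0.30, 0.27, 0.24],
    'NFE': [0.41, 0.38, 0.45, 0.50],
    'FIN': [0.44, 0.39, 0.52, 0.47],
})
h, p = kruskal_wallis(toy)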
def plot_statistics_pair(mydf, feature2_name, name1, name2, nsamples):

    if ( (feature2_name == 'Gene Expression') or (feature2_name == 'Somatic Copy Number') or  (feature2_name == 'Clinical Numeric') or (feature2_name == 'MicroRNA Expression') ): 
         
         label1 = name1.strip() + " (gene expression)"
         label2 = name2.strip() + " (" +  feature2_name + ")" 
         
         new_df = pd.DataFrame()
         new_df[label1] = pd.to_numeric( mydf['data1'] , errors='coerce') 
         new_df[label2] = pd.to_numeric( mydf['data2'] , errors='coerce') 
  
         new_df.dropna(axis = 0, how ='any', inplace = True)
         
         new_df.plot.scatter(x=label1, y=label2) 
         print(  stats.spearmanr(new_df[ label1],new_df[label2])  )

        
    elif (feature2_name == 'Somatic Mutation t-test' ): 
         label1 = name1.strip() + " (gene expression)"
         label2 = name2.strip() + " (Somatic Mutation)"
       
         mydf.rename(columns={ "data1": label1, "data2": label2 }, inplace=True)
        
         sns.violinplot( x=mydf[label2], y=mydf[label1], palette="Pastel1")
 
         print( mydf.groupby(label2).agg(['mean', 'count']) )
        
         Set1 = mydf[mydf[label2]==0]
         Set2 = mydf[mydf[label2]==1]
        
         print('\nT-test statistics : ')
         print( stats.ttest_ind(Set1[label1], Set2[label1], equal_var=False ) )
        
    elif (feature2_name == 'Somatic Mutation' ): 
         label1 = name1.strip() + " (gene expression)"
         label2 = name2.strip() + " (Somatic Mutation)"
       
         newdf = mydf.rename(columns={ "data1": label1, "data2": label2 })
        
         sns.violinplot( x=newdf[label2], y=newdf[label1], palette="Pastel1")
            
         # rank data 
          
         print( newdf.groupby(label2).agg(['mean', 'count']) )
        
         #Set1 = mydf[mydf[label2]==0]
         #Set2 = mydf[mydf[label2]==1]
        
         print('\nSpearman correlation : ')
         newdf['rnkdata']  = newdf[label1].rank(method='average') #average, min
         print( stats.pearsonr( newdf['rnkdata'] , newdf[label2] ) )  
        

    elif (feature2_name == 'Clinical Categorical' ) :
         new_data = mydf[ mydf.data2.str.contains(r'^\[.*\]$',na=True,regex=True) == False ]
         label1 = name1.strip() + " (gene expression)"
         label2 = name2.strip() + " (clinical)"
         new_data.rename(columns={ "data1": label1, "data2": label2 }, inplace=True)
        
         sns.violinplot( x=new_data[label2], y=new_data[label1], palette="Pastel1")
        
         print( new_data.groupby( label2 ).agg(['median', 'count']) )
        
         CategoryData = []
         CategoryNames = [] 
         for name, group in new_data.groupby( label2 ) :
             data =  group[ label1 ].values 
             if ( len( data ) > nsamples ) :
                  CategoryData.append( data )
                  CategoryNames.append( name )
                
         print('\nKruskal-Wallis test for groups with more than '+ str(nsamples) +' patients : ')        
         if len( CategoryData ) > 1 :
            print( mstats.kruskalwallis( *CategoryData ) )
         else :
            print( 'Number of groups less than 2 \n')
        
        
            
    return
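
# Hypothetical usage of plot_statistics_pair: the 'data1'/'data2' column names
# follow the function above; the gene and feature names and the numbers are
# invented, and the pandas/seaborn/scipy imports are assumed from the
# original notebook.
toy = pd.DataFrame({'data1': [2.1, 3.4, 1.8, 5.0, 4.2, 2.9],
                    'data2': ['stage i', 'stage ii', 'stage i',
                              'stage ii', 'stage i', 'stage ii']})
plot_statistics_pair(toy, 'Clinical Categorical', 'EGFR', 'tumor_stage', nsamples=2)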