Example #1
0
    def _detect_hotspots(self):
        """
        Flag each node as overloaded, underloaded, or neither.

        For every node the last ``K_VALUE`` readings are tested against
        ``THRESHOLD_OVERLOAD`` and, failing that, ``THRESHOLD_UNDERLOAD`` with
        a one-sample t-test. A flag is set only when the test is significant
        at ``ALPHA`` and the mean (computed with ``dtype=int``) lies on the
        matching side of the threshold.
        """
        # If the instance carries its own ``node_list`` (per the original
        # note: set by a subclass, hosts without the reserve node) use it;
        # otherwise fall back to the model's default host list.
        if not hasattr(self, "node_list"):
            candidates = self.model.get_hosts(types.NODE)
        else:
            candidates = self.node_list

        for host in candidates:
            recent_loads = host.get_readings()[-K_VALUE:]
            host.underloaded = False
            host.overloaded = False

            _, p_over = stats.ttest_1samp(recent_loads, THRESHOLD_OVERLOAD)
            mean_load = np.mean(recent_loads, dtype=int)
            if p_over < ALPHA and mean_load >= THRESHOLD_OVERLOAD:
                host.overloaded = True
                continue

            # Not overloaded: check the lower threshold.
            _, p_under = stats.ttest_1samp(recent_loads, THRESHOLD_UNDERLOAD)
            if p_under < ALPHA and mean_load <= THRESHOLD_UNDERLOAD:
                host.underloaded = True
Example #2
0
def output_crosstrain(specieslist, numEls = 1000, numtries = 1000):
    '''Cross-train every species in the list against every species in the list.

    For each (training, testing) pair the mean of the 'f1' and 'area' losses
    returned by ``cross_train`` is recorded, together with the one-sample
    t-test p-value against the null hypothesis loss=1 (no performance
    difference between training and cross-training). The four resulting
    matrices are written as tab-separated text files.
    '''
    n_species = len(specieslist)
    shape = (n_species, n_species)

    ctf1s = np.zeros(shape)
    ctf1s_pv = np.zeros(shape)
    ctars = np.zeros(shape)
    ctars_pv = np.zeros(shape)

    # Rows index the training species, columns the testing species.
    for row, train_species in enumerate(specieslist):
        for col, test_species in enumerate(specieslist):
            print('Training with ' + train_species + ', testing with ' + test_species)
            losses = cross_train(train_species, test_species, numEls, numtries)

            f1_losses = losses['f1']
            ctf1s[row, col] = np.mean(f1_losses)
            ctf1s_pv[row, col] = ttest_1samp(f1_losses, 1)[1]

            area_losses = losses['area']
            ctars[row, col] = np.mean(area_losses)
            ctars_pv[row, col] = ttest_1samp(area_losses, 1)[1]

    np.savetxt(crosstrain_f1_file, ctf1s, delimiter='\t')
    np.savetxt(crosstrain_f1_pv_file, ctf1s_pv, delimiter='\t')

    np.savetxt(crosstrain_roc_file, ctars, delimiter='\t')
    np.savetxt(crosstrain_roc_pv_file, ctars_pv, delimiter='\t')
Example #3
0
    def plot(self, signif=0.1):
        """
        Draw one panel per composite and return the figure.

        ``self.compos`` is built with ``self.composite()`` when missing. It
        may be a DataFrame (one panel per column) or a dict (one panel per
        key). Each panel's data is tested against zero with a one-sample
        t-test and the first p-value is handed to ``self._plot_df``.

        Parameters
        ----------
        signif : float
            Significance level forwarded to ``self._plot_df``.

        Returns
        -------
        The matplotlib figure holding all panels.

        Raises
        ------
        TypeError
            If ``self.compos`` is neither a DataFrame nor a dict (previously
            this surfaced as an UnboundLocalError on ``return f``).
        """
        if not hasattr(self, 'compos'):
            self.composite()

        if isinstance(self.compos, pd.core.frame.DataFrame):
            # Double brackets keep each column as a one-column DataFrame.
            frames = [self.compos[[k]] for k in self.compos.columns]
        elif isinstance(self.compos, dict):
            frames = [self.compos[k] for k in self.compos.keys()]
        else:
            raise TypeError(
                "compos must be a pandas DataFrame or a dict, got %s"
                % type(self.compos).__name__)

        l = len(frames)
        # squeeze=False always yields a 2-D array of Axes; without it a
        # single panel comes back as a bare Axes and ``flatten`` fails.
        f, axes = plt.subplots(nrows=1, ncols=l, figsize=(l, 6), sharey=True,
                               squeeze=False)
        f.subplots_adjust(wspace=0.0, left=0.15, bottom=0.05, top=0.87)
        axes = axes.flatten('F')

        for i, df in enumerate(frames):
            t, pval = ttest_1samp(df.values, 0)
            self._plot_df(df, axes[i], ax_n=i, pval=pval[0], signif=signif)

        return f
    def test_weightstats_2(self):
        """Weighted statistics must match statistics of the replicated sample."""
        x1, x2 = self.x1, self.x2
        w1, w2 = self.w1, self.w2

        plain1 = DescrStatsW(x1)
        weighted1 = DescrStatsW(x1, weights=w1)
        weighted2 = DescrStatsW(x2, weights=w2)
        # asrepeats() is compared below against the weighted statistics.
        repeated1 = weighted1.asrepeats()
        repeated2 = weighted2.asrepeats()

        # two-sample test: weighted version against scipy on the repeats
        weighted_result = ttest_ind(x1, x2, weights=(w1, w2))[:2]
        scipy_result = stats.ttest_ind(repeated1, repeated2)
        assert_almost_equal(weighted_result, scipy_result, 14)
        # not the same as new version with random weights/replication:
        # repeated.shape[0] is not asserted equal to sum_weights here.

        assert_almost_equal(repeated2.mean(0), weighted2.mean, 14)
        assert_almost_equal(repeated2.var(), weighted2.var, 14)
        assert_almost_equal(repeated2.std(), weighted2.std, 14)
        # note: the following is for 1d
        assert_almost_equal(np.cov(repeated2, bias=1), weighted2.cov, 14)
        # TODO: corrcoef is not compared; exception in corrcoef (scalar case)

        # one-sample tests, unweighted and weighted
        unweighted_mean_test = plain1.ttest_mean(3)[:2]
        assert_almost_equal(unweighted_mean_test, stats.ttest_1samp(x1, 3), 11)
        weighted_mean_test = weighted1.ttest_mean(3)[:2]
        assert_almost_equal(weighted_mean_test,
                            stats.ttest_1samp(repeated1, 3), 11)
Example #5
0
    def test_weightstats_2(self):
        """Weighted t-tests and moments must match the replicated sample."""
        x1, x2 = self.x1, self.x2
        w1, w2 = self.w1, self.w2

        plain1 = DescrStatsW(x1)
        weighted1 = DescrStatsW(x1, weights=w1)
        weighted2 = DescrStatsW(x2, weights=w2)
        repeated1 = weighted1.asrepeats()
        repeated2 = weighted2.asrepeats()

        # two-sample test: weighted version against scipy on the repeats
        weighted_result = ttest_ind(x1, x2, weights=(w1, w2))[:2]
        scipy_result = stats.ttest_ind(repeated1, repeated2)
        assert_almost_equal(weighted_result, scipy_result, 14)
        # not the same as new version with random weights/replication:
        # repeated.shape[0] is not asserted equal to sum_weights here.
        assert_almost_equal(repeated2.var(), weighted2.var, 14)
        assert_almost_equal(repeated2.std(), weighted2.std, 14)

        # one-sample tests, unweighted and weighted
        unweighted_mean_test = plain1.ttest_mean(3)[:2]
        assert_almost_equal(unweighted_mean_test, stats.ttest_1samp(x1, 3), 11)
        weighted_mean_test = weighted1.ttest_mean(3)[:2]
        assert_almost_equal(weighted_mean_test, stats.ttest_1samp(repeated1, 3), 11)
Example #6
0
def getDevMod(w,nulls,rep,use_c):
    """
    Get the deviation from random expectation of modularity.

    The null webs are traversed only once. For each null web the modules are
    found and both Qr and the first element of the module result (used as Qb)
    are collected; the observed values are then tested against the simulated
    ones with a one-sample t-test.

    Returns two lists, one for Qr and one for Qb, each laid out as
    [observed value, p-value, *gMIC(simulated values)].

    ``rep`` is accepted but not used by this function.
    """
    observed_modules = [w.modules.Q, w.modules.N,
                        w.modules.up_modules, w.modules.low_modules]
    observed_qr = Qr(w.web, observed_modules)
    observed_qb = w.modules.Q

    simulated_qb = []
    simulated_qr = []
    for null_web in nulls:
        null_modules = findModules(null_web, use_c = use_c)
        simulated_qr.append(Qr(null_web, null_modules))
        simulated_qb.append(null_modules[0])

    test_b = spp.ttest_1samp(simulated_qb, observed_qb)
    test_r = spp.ttest_1samp(simulated_qr, observed_qr)

    out_r = [observed_qr, test_r[1]]
    out_b = [observed_qb, test_b[1]]
    estimates_r = gMIC(simulated_qr)
    estimates_b = gMIC(simulated_qb)
    out_r.extend(estimates_r)
    out_b.extend(estimates_b)
    return [out_r, out_b]
Example #7
0
def getDevNest(w,list,use_c):
    """
    Get the deviation from random expectation of nestedness (NODF).

    ``nodf`` is evaluated on every web in ``list``; elements 0, 2 and 1 of
    its result are collected as the total, upper and lower series. The
    observed ``w.nodf``, ``w.nodf_up`` and ``w.nodf_low`` are tested against
    those series with a one-sample t-test.

    Returns three lists ordered [total, lower, upper], each laid out as
    [observed value, p-value, *gMIC(simulated values)].
    """
    sim_total, sim_up, sim_low = [], [], []
    for null_web in list:
        null_nodf = nodf(null_web, strict=w.nodf_strict, use_c=use_c)
        sim_total.append(null_nodf[0])
        sim_up.append(null_nodf[2])
        sim_low.append(null_nodf[1])

    test_total = spp.ttest_1samp(sim_total, w.nodf)
    test_up = spp.ttest_1samp(sim_up, w.nodf_up)
    test_low = spp.ttest_1samp(sim_low, w.nodf_low)

    out_total = [w.nodf, test_total[1]]
    out_up = [w.nodf_up, test_up[1]]
    out_low = [w.nodf_low, test_low[1]]

    estimates_total = gMIC(sim_total)
    estimates_low = gMIC(sim_low)
    estimates_up = gMIC(sim_up)
    out_total.extend(estimates_total)
    out_low.extend(estimates_low)
    out_up.extend(estimates_up)
    return [out_total, out_low, out_up]
Example #8
0
def tTest(list1,list2):
    """
    Return the p-value comparing two samples, picking the test by sample size.

    * both samples have more than one value: two-sample t-test;
    * exactly one sample has more than one value: one-sample t-test of that
      sample against the first value of the other;
    * neither sample has more than one value: ``None``.
    """
    many1 = len(list1) > 1
    many2 = len(list2) > 1
    if many1 and many2:
        return s.ttest_ind(list1, list2)[1]
    if many2:
        # list1 holds the single reference value
        return s.ttest_1samp(list2, list1[0])[1]
    if many1:
        # list2 holds the single reference value
        return s.ttest_1samp(list1, list2[0])[1]
    return None
    def generate_sequence_gene_expression_statistics(self, show_species_charts=True, show_chart=True):
        """
        Compute gene-expression statistics per network and save them as CSV.

        With ``self.multiple_networks`` set, every ``*.txt`` file in
        ``self.output_silix_nw_exp_data_folder_path`` is processed and the
        network number is the first run of digits found in the file path;
        otherwise the single file ``self.silix_nw_exp_data_filename`` is
        processed with network number 0. One row per non-empty file is
        appended to ``self.network_gene_expressions``, which is then converted
        to an array and written to ``gene_expression_statistics.csv``.

        Parameters
        ----------
        show_species_charts : bool
            Call ``self.generate_species_wise_gene_expression_statistics()``.
        show_chart : bool
            Call the all-network or single-network chart method.
        """

        def expression_statistics(mapping_data):
            # Columns 2 and 3 are the two series labelled "9mM CA" and
            # "Spike" in the CSV header; zeros are dropped from both.
            x = np.array(mapping_data[:, 2], dtype=float)
            y = np.array(mapping_data[:, 3], dtype=float)
            x_nonzero = x[x != 0]
            y_nonzero = y[y != 0]

            ca_stat = ca_pvalue = spike_stat = spike_pvalue = ind_stat = ind_pvalue = 0
            # The tests are skipped (left at 0) only when x is entirely zero.
            if not np.all(x == 0):
                ca_stat, ca_pvalue = stats.ttest_1samp(x_nonzero, 0)
                spike_stat, spike_pvalue = stats.ttest_1samp(y_nonzero, 0)
                ind_stat, ind_pvalue = stats.ttest_ind(x_nonzero, y_nonzero, equal_var=False)

            return [x_nonzero.mean(), x_nonzero.var(), x_nonzero.std(),
                    y_nonzero.mean(), y_nonzero.var(), y_nonzero.std(),
                    ca_stat, ca_pvalue,
                    spike_stat, spike_pvalue, ind_stat, ind_pvalue]

        folder = self.output_silix_nw_exp_data_folder_path
        if self.multiple_networks:
            for i, nw_ge_file in enumerate(glob.glob(folder + '/*.txt')):
                mapping_data = np.genfromtxt(nw_ge_file, delimiter=',', dtype=str)
                if len(mapping_data) > 0:
                    # Single-string form prints the same text on Python 2 and 3.
                    print('Network:  %s %s' % (i, mapping_data.shape))
                    row = expression_statistics(mapping_data)
                    # NOTE(review): the first digit run of the whole path is
                    # used, so digits in a directory name take precedence.
                    nw_number = int(re.findall(r'\d+', nw_ge_file)[0])
                    self.network_gene_expressions.append([nw_number] + row)
        else:
            mapping_data = np.genfromtxt(folder + self.silix_nw_exp_data_filename,
                                         delimiter=',', dtype=str)
            if len(mapping_data) > 0:
                print('Network:  %s' % (mapping_data.shape,))
                self.network_gene_expressions.append([0] + expression_statistics(mapping_data))

        # convert list into array
        self.network_gene_expressions = np.asarray(self.network_gene_expressions)

        # Save network gene expression statistics to csv file
        gene_expression_statistics_file = folder + 'gene_expression_statistics.csv'
        with open(gene_expression_statistics_file, 'w') as f_handle:
            f_handle.write(
                'Network, 9mM CA Mean, 9mM CA Var, 9mM CA SD, Spike Mean, Spike Var, Spike SD, 9mM CA ttest-stat, 9mM CA ttest-pvalue, Spike ttest-stat, Spike ttest-pvalue, Ind ttest-stat, Ind ttest-pvalue  \n')
            np.savetxt(f_handle, self.network_gene_expressions, delimiter=',')

        if show_species_charts:
            self.generate_species_wise_gene_expression_statistics()

        if show_chart:
            if self.multiple_networks:
                self.plot_all_nw_gene_expr_stats_chart()
            else:
                self.plot_single_network_gene_expr_stats_chart()
def createRegressionPlots(predictions,performance,coefs,fb_coefs,nfb_coefs,GroupDF,goodsubj,savefig=True):
    """
    Draw the four-panel regression summary figure and optionally save it.

    Panels: (1) predicted time series per condition with the ideal DMN
    contrast overlaid, (2) per-subject correlation ``R`` by condition,
    (3) coefficients per ``pe`` split by condition with FDR-significant
    entries starred, (4) t-values of the feedback minus no-feedback
    coefficient difference per ``pe`` with FDR-significant entries starred.

    ``fb_coefs['Coef']`` and ``nfb_coefs['Coef']`` are reshaped to
    (number of unique good subjects, number of unique ``pe``) before testing
    each column against zero.

    When ``savefig`` is true the figure is written to
    ``saveFigureLocation`` as ``RSN_LinearRegPrediction.pdf``.
    """
    f=plt.figure(figsize=(22,12))
    ax1=plt.subplot2grid((2,4),(0,0), colspan=3)
    ax2=plt.subplot2grid((2,4),(0,3))
    ax3=plt.subplot2grid((2,4),(1,0), colspan=2)
    ax4=plt.subplot2grid((2,4),(1,2), colspan=2)

    dmnIdeal=pd.read_csv('/home/jmuraskin/Projects/NFB/analysis/DMN_ideal_2.csv')

    # Sizes used repeatedly below; computed once.
    n_pe=len(unique(coefs['pe']))
    n_subj=len(unique(GroupDF[GroupDF.Subject_ID.isin(goodsubj)]['Subject_ID']))

    sns.tsplot(data=predictions,time='TR',value='predicted',unit='subj',condition='fb',ax=ax1)
    ax1.plot((dmnIdeal['Wander']-dmnIdeal['Focus'])/3,'k--')
    ax1.set_title('Average Predicted Time Series')

    sns.violinplot(data=performance,x='fb',y='R',split=True,bw=.3,inner='quartile',ax=ax2)

    g=sns.violinplot(data=coefs,x='pe',y='Coef',hue='fb',split=True,bw=.3,inner='quartile',ax=ax3)
    g.plot([-1,n_pe],[0,0],'k--')
    g.set_xlim([-.5,n_pe])
    ylim=g.get_ylim()

    # Paired comparison of R between the two conditions.
    r_feedback=np.array(performance[performance.fb=='FEEDBACK']['R'])
    r_nofeedback=np.array(performance[performance.fb=='NOFEEDBACK']['R'])
    t,p = ttest_1samp(r_feedback-r_nofeedback,0)
    ax2.set_title('Mean Subject Time Series Correlations-p=%0.2f' % p)

    # Convert to ndarray before reshaping: pandas Series no longer offers
    # .reshape, and the difference test below already did it this way.
    t,p = ttest_1samp(np.array(fb_coefs['Coef']).reshape(n_subj,n_pe),0)
    p05_FB,padj=fdr_correction(p,0.05)
    t,p = ttest_1samp(np.array(nfb_coefs['Coef']).reshape(n_subj,n_pe),0)
    p05_NFB,padj=fdr_correction(p,0.05)
    for idx,(pFDR_FB,pFDR_NFB) in enumerate(zip(p05_FB,p05_NFB)):
        if pFDR_FB:
            ax3.scatter(idx,ylim[1]-.05,marker='*',s=75)
        if pFDR_NFB:
            ax3.scatter(idx,ylim[0]+.05,marker='*',s=75)

    t,p=ttest_1samp(np.array(fb_coefs['Coef']-nfb_coefs['Coef']).reshape(n_subj,n_pe),0)
    p05,padj=fdr_correction(p,0.05)

    sns.barplot(x=range(len(t)),y=t,ax=ax4,color='Red')
    for idx,pFDR in enumerate(p05):
        if pFDR:
            # place the star just beyond the bar, on the bar's side of zero
            ax4.scatter(idx,t[idx]+ np.sign(t[idx])*0.2,marker='*',s=75)
    ax4.set_xlim([-0.5,n_pe])
    ax4.set_xlabel('pe')
    ax4.set_ylabel('t-value')
    ax4.set_title('FB vs. nFB PE')

    for ax in [ax1,ax2,ax3,ax4]:
        for item in ([ax.title, ax.xaxis.label, ax.yaxis.label]):
            item.set_fontsize(18)
        for item in (ax.get_xticklabels() + ax.get_yticklabels()):
            item.set_fontsize(12)

    f.tight_layout()
    if savefig:
        f.savefig('%s/RSN_LinearRegPrediction.pdf' % saveFigureLocation,dpi=300)
Example #11
0
def print_summary_wo_outliers(event_list, attribute, lag, length,
                              outlier_abs_threshold):
    """
    Print size, mean and t-test p-value of summed series, with and without outliers.

    For every event the series returned by
    ``concat_data.series_after(attribute, lag=lag, length=length)`` is summed.
    The summary is printed for all sums and then again for the sums whose
    absolute value is below ``outlier_abs_threshold``.
    """
    windows = []
    for event in event_list:
        windows.append(
            event.concat_data.series_after(attribute, lag=lag, length=length))
    totals = []
    for window in windows:
        totals.append(window.sum())

    def report(title, size_label, values):
        # size_label differs between the two sections ('Size:' vs 'Size: ').
        print(title)
        print(size_label, len(values))
        print('Mean:', np.mean(values))
        print(stats.ttest_1samp(values, 0)[1])

    report('Full sample', 'Size:', totals)
    print()
    kept = [total for total in totals if abs(total) < outlier_abs_threshold]
    report('Without outliers', 'Size: ', kept)
Example #12
0
def pairedt(pairs, numSamples):
    """
    Run a paired comparison on the two value lists stored in ``pairs``.

    ``pairs`` must contain exactly two items. The element-wise differences
    (first item minus second) are plotted with ``plotDiffs``, shuffled, and
    split into ``numSamples`` equally sized groups; the group means are
    checked for normality and tested against zero with a one-sample t-test.

    Returns a dict with the keys ``normal_p``, ``ttest_t``, ``ttest_p``,
    ``avg_diff`` (formatted strings rounded to 4 decimals), ``numSamples``,
    ``sampleSize`` and ``num_pairs``.

    NOTE(review): ``num_pairs`` is read from ``pairs['tor']``, so one of the
    two keys has to be ``'tor'``.
    """
    results = dict()
    first, second = pairs.items()
    diffs = [first[1][x] - second[1][x] for x in range(len(first[1]))]
    plotDiffs(diffs)
    sampleSize = int(len(diffs)/numSamples)

    # shuffle needs a mutable sequence; a bare range() is immutable on
    # Python 3 and made random.shuffle raise TypeError.
    indices = list(range(len(diffs)))
    random.shuffle(indices)

    # Consecutive, non-overlapping slices of the shuffled indices form the
    # groups; leftover indices beyond numSamples * sampleSize are unused.
    mean_diffs = []
    for sample in range(numSamples):
        group = indices[sample * sampleSize:(sample + 1) * sampleSize]
        total_diff = sum(diffs[index] for index in group)
        mean_diffs.append(total_diff/float(sampleSize))

    #normality check
    nt = stats.normaltest(mean_diffs)
    results['normal_p'] =  format(round(nt[1],4))

    #ttest
    t_prob = stats.ttest_1samp(mean_diffs, 0)
    results['ttest_t'] =  format(round(t_prob[0],4))
    results['ttest_p'] =  format(round(t_prob[1],4))

    #other stats
    results['avg_diff'] =  format(round(np.mean(diffs),4))
    results['numSamples'] = numSamples
    results['sampleSize'] = sampleSize
    results['num_pairs'] = len(pairs['tor'])

    return results
Example #13
0
def linear_harvey_collier(res):
    '''Harvey Collier test for linearity

    The Null hypothesis is that the regression is correctly modeled as linear.

    Parameters
    ----------
    res : Result instance

    Returns
    -------
    tvalue : float
        test statistic, based on ttest_1sample
    pvalue : float
        pvalue of the test

    Notes
    -----
    TODO: add sort_by option

    This test is a t-test that the mean of the recursive ols residuals is zero.
    Calculating the recursive residuals might take some time for large samples.

    '''
    from scipy import stats

    # The original author suspects this uses a different ddof than
    # B.H. Baltagi, Econometrics, 2011, chapter 8,
    # but it matches Gretl and R:lmtest, pvalue at decimal=13.
    n_skip = 3
    recursive = recursive_olsresiduals(res, skip=n_skip, alpha=0.95)
    # Element 3 of the result is tested, without its first n_skip entries.
    tested_residuals = recursive[3][n_skip:]

    return stats.ttest_1samp(tested_residuals, 0)
Example #14
0
 def ttest(self):
     """
     T-test ``self.arc`` against ``self.bac`` and save the mean t and p values.

     ``self.bac`` is passed as the population mean of a one-sample t-test on
     ``self.arc``. The t and p results are each wrapped in a leading axis of
     length one, averaged over axis 1, and written to ``<name>_t.txt`` and
     ``<name>_p.txt`` inside ``self.plot_dir``.

     Side effect: the process working directory is changed to
     ``self.plot_dir`` and not restored.
     """
     archaeology = self.arc
     background = self.bac

     # Parenthesised single-argument prints give identical output on
     # Python 2 and Python 3.
     print('ttestloop')
     print(archaeology.shape)
     print(background.shape)

     t_test = stats.ttest_1samp(archaeology, background)
     #t_test = stats.ttest_ind(archaeology, background)

     t_list = np.array([t_test[0]])
     print('TSHAPE %s' % (t_list.shape,))

     p_list = np.array([t_test[1]])
     print('PSHAPE %s' % (p_list.shape,))

     os.chdir(self.plot_dir)
     np.savetxt(self.name+'_t.txt',np.mean(t_list, axis=1), delimiter=',')
     np.savetxt(self.name+'_p.txt',np.mean(p_list, axis=1), delimiter=',')
Example #15
0
def is_directionally_biased(ase, gene, bias_direction=None, style='ttest', ase_level=0.33,
                            min_slices=10, too_few_slices_val=99,
                            frac_for_biased=0.65, two_tailed=False, alpha=.05):
    """
    Decide, per genotype, whether ``gene`` is directionally biased.

    Genotypes are the column-name prefixes before the first underscore. The
    gene's row is multiplied by ``bias_direction`` (all ones by default) and
    restricted to the genotype's columns.

    style='ttest': one-sample t-test against 0 with NaNs omitted; the p-value
    is multiplied by ``len(ase)`` before comparison with ``alpha``.
    Two-tailed results are the sign of t times the significance flag;
    one-tailed results are the flag for a positive mean. A masked p-value
    yields ``too_few_slices_val``.

    style='cutoff': ``too_few_slices_val`` when fewer than ``min_slices``
    values are present; otherwise -1 or 1 when more than
    ``max(frac_for_biased * count, min_slices)`` values exceed ``ase_level``
    in that direction (-1 is checked first), else 0.

    Returns a dict mapping genotype to its result. Any other style raises
    NotImplementedError.
    """
    if bias_direction is None:
        bias_direction = [1] * len(ase.columns)

    genotypes = set(col.split('_')[0] for col in ase.columns)
    n_tests = len(ase)

    biases = {}
    for genotype in genotypes:
        oriented = ase.ix[gene] * bias_direction
        genease = oriented.select(startswith(genotype))

        if style == 'ttest':
            tstat, pval = ttest_1samp(genease, 0, nan_policy='omit')
            if isinstance(pval, np.ma.core.MaskedConstant):
                biases[genotype] = too_few_slices_val
                continue
            if two_tailed:
                significant = pval * n_tests < alpha
                biases[genotype] = np.sign(tstat) * significant
            else:
                # convert the two-sided p-value into the upper-tail one
                if tstat > 0:
                    pval = pval/2
                else:
                    pval = 1-pval/2
                biases[genotype] = pval * n_tests  < alpha

        elif style == 'cutoff':
            slices_with_aseval = genease.count()
            if slices_with_aseval < min_slices:
                biases[genotype] = too_few_slices_val
                continue
            needed = max(frac_for_biased * slices_with_aseval, min_slices)
            biases[genotype] = next(
                (direction for direction in (-1, 1)
                 if (direction * genease > ase_level).sum() > needed),
                0)
        else:
            raise NotImplementedError("Don't know how to use test style '{}'".format(style))
    return biases
def arg_lys_ratio_comp(gene_list, hit_list):
    """ returns a letter for the one sample t-test of RK ratio:
    Z - two or fewer distinct hit RK ratios remain, can't run the test
    S - no significant difference
    A - psychrophilic query is significantly different and higher than targets
    B - psychrophilic query is significantly different and lower than targets
    N - RK ratio for query couldn't be calculated due to K = 0 for query
    """
    query_ratio = gene_list[43]
    # 'N/A' marks a ratio that was not calculated due to K = 0
    if query_ratio == 'N/A':
        return 'N'
    query_ratio = float(query_ratio)

    # keep only the hits whose RK ratio exists (no lysines -> 'N/A')
    target_ratios = []
    for hit in hit_list:
        if hit[43] != 'N/A':
            target_ratios.append(float(hit[43]))

    # Z covers: no usable hits, all values identical, or only two distinct
    # values left.
    if len(set(target_ratios)) <= 2:
        return 'Z'

    with np.errstate(divide='ignore'):
        p_value = stats.ttest_1samp(target_ratios, query_ratio)[1]

    # written as a negated <= so a NaN p-value still yields 'S'
    if not p_value <= 0.05:
        return 'S'
    return 'A' if query_ratio > np.mean(target_ratios) else 'B'
Example #17
0
    def ttest(self, threshold_dict=None):
        """ Calculate one sample t-test across each voxel (two-sided)

        The test is run on ``self.data`` against 0 along axis 0.

        Args:
            self: Brain_Data instance
            threshold_dict: a dictionary of threshold parameters {'unc':.001} or {'fdr':.05}.
                With 'unc', t values whose p value exceeds the threshold are
                set to NaN. 'fdr' is accepted but not implemented yet: the
                maps are returned unthresholded.

        Returns:
            out: dictionary {'t', 'p'} of deep copies of self whose ``data``
                holds the t statistics and p values

        Raises:
            ValueError: if threshold_dict is given but is not a dictionary.

        """

        t = deepcopy(self)
        p = deepcopy(self)
        t.data, p.data = ttest_1samp(self.data, 0, 0)

        if threshold_dict is not None:
            # isinstance also accepts dict subclasses such as OrderedDict
            if not isinstance(threshold_dict, dict):
                raise ValueError("threshold_dict is not a dictionary.  Make sure it is in the form of {'unc':.001} or {'fdr':.05}")
            if 'unc' in threshold_dict:
                # Uncorrected thresholding
                t.data[p.data > threshold_dict['unc']] = np.nan
            # TODO: 'fdr' thresholding is not implemented; a request for it
            # currently leaves the t map untouched.

        return {'t': t, 'p': p}
Example #18
0
def paired_data():
    '''Analysis of paired data
    Compare mean daily intake over 10 pre-menstrual and 10 post-menstrual days (in kJ).

    Prints the p-values of a paired t-test and of a Wilcoxon signed-rank test
    on the post-minus-pre differences, and returns the Wilcoxon p-value.
    '''

    # Get the data:  daily intake of energy in kJ for 11 women
    data = getData('altman_93.txt', subDir=r'..\Data\data_altman')

    # Column-wise mean and sample SD are evaluated but their results are not
    # kept.
    mean(data, axis=0)
    std(data, axis=0, ddof=1)

    pre, post = data[:,0], data[:,1]
    change = post - pre

    # --- >>> START stats <<< ---
    # paired t-test: doing two measurements on the same experimental unit
    # e.g., before and after a treatment
    t_statistic, p_value = stats.ttest_1samp(change, 0)

    # p < 0.05 => alternative hypothesis:
    # the difference in mean is not equal to 0
    print(("paired t-test", p_value))

    # alternative to paired t-test when data has an ordinary scale or when not
    # normally distributed
    rankSum, p_value = stats.wilcoxon(change)
    # --- >>> STOP stats <<< ---
    print(("Wilcoxon-Signed-Rank-Sum test", p_value))

    return p_value # should be 0.0033300139117459797
Example #19
0
def testPermutation(testResult, permutationResult):
    '''
    Method that essentially runs a t-test to determine if the test result
    has significantly better error than the permutation set
    '''
    # test data
    trueTest = testResult[:, 0]
    predTest = testResult[:, 1]
    # permutation data
    truePermut = permutationResult[:, 0, :]
    predPermut = permutationResult[:, 1, :]

    # Get the MSE for the empirical values
    empMSE = np.mean(np.square(trueTest - predTest))
    # Now make a distribution of MSE from the permutations
    distMSE = np.mean(np.square(truePermut - predPermut), axis=0)
    # Run a one sample t-test on this thing - if significant, then our
    # empirical MSE is significantly different from the permuted one
    tValue, pValue = st.ttest_1samp(distMSE, empMSE)
    # Do zarrars p value
    smallerEmp = distMSE < empMSE
    nSmaller = np.sum(smallerEmp)
    pZarrar = float(nSmaller) / len(distMSE)

    # return empMSE, distMSE, tValue, pValue
    return empMSE, distMSE, pZarrar
Example #20
0
def xcorr_stability(x1,x2,window_len=200,overlap=190):
    """Go through the arrays x1,x2 in a moving-window-fashion.
    For each window, calculate the crosscorrelation function,
    determine tau (time delay).
    Then, for the tau of all windows, calculate t-stats to test
    H0 that the taus are zero. If not, we have sync (and
    directionality?)

    Windows start every ``window_len - overlap`` samples and
    ``(len(x1) - window_len) // (window_len - overlap)`` windows are used.

    Returns ``((t, p), taus, tanh(t))``.

    Raises ValueError if the inputs are not 1d or differ in length.
    """
    if not len(x1.shape)==len(x2.shape)==1:
        raise ValueError("Input arrays must be 1d")
    if not x1.shape[0]==x2.shape[0]:
        raise ValueError("Input arrays must have same length.")
    #TODO: check sanity of window_len and overlap
    ndp = x1.shape[0] #number of datapoints
    step = window_len-overlap
    # Floor division: under Python 3 a plain '/' gives a float, which
    # np.zeros rejects as a size and which shifts tau by fractions.
    n_windows = (ndp-window_len)//step
    taus = np.zeros(n_windows)
    for i in range(n_windows):
        start = i*step
        ar1 = x1[start:start+window_len]
        ar2 = x2[start:start+window_len]
        xcorr = np.correlate(ar1,ar2,mode="same")
        # lag of the correlation peak relative to the window centre
        taus[i] = xcorr.argmax()-xcorr.shape[0]//2
    t,p = ttest_1samp(taus,0)
    return (t,p), taus, np.tanh(t)
Example #21
0
 def summarize(self, path):
     """
     Save the union of the subjects' maps, their mean, and t/p maps.

     ``path`` is stored on the instance as ``self.path`` and used as the
     output directory. Four NIfTI files are written there: the stacked
     per-subject maps (subject axis moved last), the mean across that axis,
     and the uncorrected t and p maps of a one-sample t-test of the stacked
     maps against ``self._p_value`` along the subject axis.
     """
     # The description above used to sit after this assignment, where it was
     # a no-op string statement instead of the method's docstring.
     self.path = path

     threshold_value = self._p_value
     total_map = self._total_map
     affine = self._affine
     radius = self._radius

     # move axis 0 to the last position so it can be addressed as axis 3
     total_map = np.rollaxis(np.array(total_map), 0, 4)
     total_img = ni.Nifti1Image(total_map, affine=affine)

     fname = "accuracy_map_radius_%s_searchlight_all_subj.nii.gz" % radius
     ni.save(total_img, os.path.join(self.path,fname))

     mean_img = ni.Nifti1Image(total_map.mean(3), affine=affine)

     fname = "accuracy_map_radius_%s_searchlight_mean_subj.nii.gz" % radius
     ni.save(mean_img, os.path.join(self.path,fname))

     logging.info('Summarizer writed in '+self.path)

     t, p = ttest_1samp(total_map, threshold_value, axis=3)

     fname = "t_map_vs_threshold_%s_uncorrected.nii.gz" % threshold_value
     _img = ni.Nifti1Image(t, affine=affine)
     ni.save(_img, os.path.join(self.path, fname))

     fname = "p_map_vs_threshold_%s_uncorrected.nii.gz" % threshold_value
     _img = ni.Nifti1Image(p, affine=affine)
     ni.save(_img, os.path.join(self.path, fname))
def compareWithNormal():
    '''Show how close the t-test probability is to the normal approximation.

    A fixed-seed sample of 100 values from a normal distribution (mean 7,
    SD 3) is tested against the value 6.5 with a one-sample t-test. The same
    probability is then computed from a normal distribution centred on the
    sample mean with the standard error as scale. Both are printed and the
    normal-based probability is returned.
    '''

    # generate the data (seeds the global NumPy random state)
    np.random.seed(12345)
    sample = stats.norm(loc=7, scale=3).rvs(100)
    reference = 6.5

    # T-test
    # --- >>> START stats <<< ---
    t, tProb = stats.ttest_1samp(sample, reference)
    # --- >>> STOP stats <<< ---

    # Comparison with corresponding normal distribution
    sample_mean = np.mean(sample)
    standard_error = np.std(sample, ddof=1)/np.sqrt(len(sample))
    # the one-tail probability is doubled
    normProb = stats.norm.cdf(reference, loc=sample_mean,
            scale=standard_error)*2

    # compare
    summary = 'The probability from the t-test is ' + '{0:5.4f}, and from the normal distribution {1:5.4f}'.format(tProb, normProb)
    print(summary)

    return normProb # should be 0.054201154690070759
Example #23
0
def match_to_key_fgrams_paired(raw_fgram, key_fgrams, floworder):
    """
    Return the index of the key flowgram that best matches ``raw_fgram``.

    For each key, every nucleotide in ``floworder`` is paired with one flow
    index where the key expects 0 and one where it expects 1 (the last such
    index of each kind wins). Nucleotides lacking either index are dropped.
    The differences ``raw[expected-1 flow] - raw[expected-0 flow]`` are tested
    against 0 with a one-sample t-test, and the key with the smallest
    one-sided p-value (mean difference greater than 0) is selected; ties go
    to the lower index.

    Raises IndexError when ``key_fgrams`` is empty.
    """
    from scipy import stats
    n_nucs = len(floworder)
    pvals = []
    for keyindex, kf in enumerate(key_fgrams):
        # pairs[nuc] == [index of an expected-0 flow, index of an expected-1 flow]
        pairs = {nuc: [None, None] for nuc in floworder}
        for ndx, expected in enumerate(kf):
            pairs[floworder[ndx % n_nucs]][int(expected)] = ndx

        diffs = [raw_fgram[one] - raw_fgram[zero]
                 for zero, one in pairs.values()
                 if zero is not None and one is not None]

        t, pval2side = stats.ttest_1samp(diffs, 0.0)
        # One-sided p-value for "mean difference > 0". Halving regardless of
        # the sign of t would rate a key whose signal is consistently
        # inverted as highly as the true match.
        if t > 0:
            pval1side = pval2side / 2
        else:
            pval1side = 1 - pval2side / 2
        # NaN (too few pairs, zero variance) compares unpredictably when
        # sorting, so rank an undefined test as the worst possible match.
        if pval1side != pval1side:
            pval1side = 1.0
        pvals.append((pval1side, keyindex))
    pvals.sort()
    return pvals[0][-1]
    def test_weightstats_3(self):
        """2-d weighted statistics must match statistics of the replicated sample."""
        x1_2d, x2_2d = self.x1_2d, self.x2_2d
        w1, w2 = self.w1, self.w2

        weighted1 = DescrStatsW(x1_2d, weights=w1)
        weighted2 = DescrStatsW(x2_2d, weights=w2)
        repeated1 = weighted1.asrepeats()
        repeated2 = weighted2.asrepeats()

        # column-wise moments
        assert_almost_equal(repeated2.mean(0), weighted2.mean, 14)
        assert_almost_equal(repeated2.var(0), weighted2.var, 14)
        assert_almost_equal(repeated2.std(0), weighted2.std, 14)
        # np.cov / np.corrcoef are given the transposed repeats
        repeated2_t = repeated2.T
        assert_almost_equal(np.cov(repeated2_t, bias=1), weighted2.cov, 14)
        assert_almost_equal(np.corrcoef(repeated2_t), weighted2.corrcoef, 14)

        # one-sample test; scipy.stats.ttest_1samp is also vectorized
        t, p, d = weighted1.ttest_mean(3)
        assert_almost_equal([t, p], stats.ttest_1samp(repeated1, 3), 11)

        # two-sample test
        cm = CompareMeans(weighted1, weighted2)
        result_statsmodels = cm.ttest_ind()
        result_scipy = stats.ttest_ind(repeated1, repeated2)
        assert_almost_equal(result_statsmodels[:2], result_scipy, 14)
Example #25
0
def gen_bonferroni_corrected_graph(indata, alpha = .01):
    """ given indata (path to a saved array that is
    subjects X nnodes X nnodes once singleton dimensions are removed)
    and alpha (rejection level for bonferroni correction)
    calculate significant connections across cohort.

    A one-sample t-test against 0 is run across subjects for every
    connection. Bonferroni correction is applied separately to the upper and
    to the lower triangle (diagonal excluded); connections with a negative
    t-value are never kept.

    Returns (mask, tvals, sparsity): the boolean map of kept connections, the
    t-value map with all other entries set to 0, and
    ``calc_sparsity`` of the upper-triangle rejections.
    """

    data = np.load(indata)
    data = data.squeeze() # remove singular dims (eg we only have 1 block)
    ind = util.triu_indices(data.shape[-1], 1)# ind of upper tri minus diag
    lowind = util.tril_indices(data.shape[-1], -1) # ind of lower triag minusdiag
    tvals, pvals = ss.ttest_1samp(data, 0, axis = 0) ## rfx ttest across subjects
    ## for each region
    reject = sms.multipletests(pvals[ind],
                               alpha = alpha,
                               method='bonferroni')[0]
    lreject = sms.multipletests(pvals[lowind],
                                alpha = alpha,
                                method='bonferroni')[0]
    sparsity = calc_sparsity(reject)
    # single-string form prints the same text on Python 2 and Python 3
    print('sparsity %s' % (sparsity,))

    # builtin bool: the np.bool alias no longer exists in current NumPy
    mask = np.zeros(tvals.shape, dtype = bool)
    mask[ind] = reject
    mask[lowind] = lreject
    mask[tvals < 0] = False # remove negative correlations
    tvals[~mask] = 0
    return mask, tvals, sparsity
Example #26
0
    def new_returning(self):
        """
        Compare sentiment values of new and returning WAs.

        Rows of ``self.data`` are split by ``status`` ('NEW' / 'RET'). For
        every ID present in both groups the difference between the new
        ``s_value`` and the mean returning ``s_value`` is collected. Count,
        mean and variance are reported through ``self.print_descriptively``
        for each group and for the paired differences; a one-sample t-test of
        the differences against 0 is computed as well.

        NOTE(review): both filters AND the status mask with the raw
        ``s_value`` column - presumably to keep rows with a usable sentiment
        value; confirm the intended condition.
        NOTE(review): the t-test result is computed but not reported.
        """
        print("\n\n"
              "##############################"
              "Sentiments of (New - Returning) WAs"
              "##################################")

        new_wa = self.data[(self.data.status == 'NEW') & self.data.s_value]
        ret_wa = self.data[(self.data.status == 'RET') & self.data.s_value]

        n_ids = new_wa['ID'].values
        r_ids = ret_wa['ID'].values

        # set membership instead of scanning the array for every new ID
        returning_ids = set(r_ids)
        pair_deltas = []
        for wa_id in n_ids:
            if wa_id in returning_ids:
                new_values = new_wa[new_wa.ID == wa_id].s_value.values
                # column-wise mean: a frame-wide mean() also tries to average
                # the non-numeric columns
                ret_mean = ret_wa[ret_wa.ID == wa_id].s_value.mean()
                pair_deltas.append(new_values - ret_mean)

        self.print_descriptively(label="Numbers for New WAs",
                                 data=(len(n_ids), new_wa.s_value.mean(), new_wa.s_value.var()))
        self.print_descriptively(label="Numbers for Returning WAs",
                                 data=(len(r_ids), ret_wa.s_value.mean(), ret_wa.s_value.var()))

        # The gender sub-selections that used to be built here were never
        # used, relied on ``Series in array`` and contained a syntax error,
        # so they have been dropped.
        t_pair_deltas, p_value_pair_deltas = stats.ttest_1samp(pair_deltas, 0)

        self.print_descriptively(label="New vs Returning WAs",
                                 data=(len(pair_deltas),np.mean(pair_deltas),np.var(pair_deltas)))
Example #27
0
def test_permutation_t_test():
    """Check permutation_t_test significance calls and agreement with scipy."""
    np.random.seed(10)
    n_samples, n_tests = 30, 5
    X = np.random.randn(n_samples, n_tests)
    # Shift the first two columns so that only they have a non-zero mean.
    X[:, :2] += 1

    # (tail, expected significance per column) for the 1 sample t-test
    expected_by_tail = (
        (0, [True, True, False, False, False]),
        (1, [True, True, False, False, False]),
        (-1, [False, False, False, False, False]),
    )
    for tail, expected in expected_by_tail:
        _, p_values, _ = permutation_t_test(X, n_permutations=999, tail=tail)
        assert_array_equal(p_values < 0.05, expected)

    # With all permutations enumerated the result should match scipy.
    X = np.random.randn(18, 1)
    T_obs, p_values, _ = permutation_t_test(X[:, [0]], n_permutations='all')
    T_obs_scipy, p_values_scipy = stats.ttest_1samp(X[:, 0], 0)
    assert_almost_equal(T_obs[0], T_obs_scipy, 8)
    assert_almost_equal(p_values[0], p_values_scipy, 2)
Example #28
0
def check_mean():
    '''Data from Altman, check for significance of mean value.
    Compare average daily energy intake (kJ) over 10 days of 11 healthy women, and
    compare it to the recommended level of 7725 kJ.

    Loads 'altman_91.txt' through getData and prints the mean and SD, a 95%
    confidence interval, and the outcome of a one-sample t-test and of a
    Wilcoxon signed rank test against the reference value. Returns nothing.
    '''
    # Get data from Altman
    data = getData('altman_91.txt')

    # Watch out: by default the SD is calculated with 1/N!
    myMean = np.mean(data)
    mySD = np.std(data, ddof=1)
    # Single-argument print calls behave identically on Python 2 and 3.
    print('Mean and SD: {0:4.2f} and {1:4.2f}'.format(myMean, mySD))

    # Confidence intervals
    tf = stats.t(len(data)-1)
    ci = myMean + stats.sem(data)*np.array([-1,1])*tf.isf(0.025)
    print('The confidence intervals are {0:4.2f} to {1:4.2f}.'.format(ci[0], ci[1]))

    # Check for significance
    checkValue = 7725
    t, prob = stats.ttest_1samp(data, checkValue)
    if prob < 0.05:
        print('{0:4.2f} is significantly different from the mean (p={1:5.3f}).'.format(checkValue, prob))

    # For not normally distributed data, use the Wilcoxon signed rank test
    (rank, pVal) = stats.wilcoxon(data-checkValue)
    if pVal < 0.05:
        issignificant = 'unlikely'
    else:
        issignificant = 'likely'

    print('It is ' + issignificant + ' that the value is {0:d}'.format(checkValue))
    def quantify(self, samples=None):
        """
        Store the average ratio and the ratio t-test p-value on the instance.

        Ratio t-test: the ratios are log-transformed and tested against 0.
        samples: optional subset of sample names to average across, e.g. when
        both heavy and light samples are present and only one set should be
        used. A falsy value selects every sample.

        Sets ``self.avg_ratio`` (mean of the kept, untransformed ratios) and
        ``self.p_value``.
        """
        from scipy import stats

        selected = self.quantification.items()
        if samples:
            selected = [(name, quant) for name, quant in selected
                        if name in samples]

        # Zero ratios are dropped first, NaN ratios afterwards.
        nonzero = np.array([quant['ratio'] for _, quant in selected
                            if quant['ratio'] != 0])
        valid = nonzero[~np.isnan(nonzero)]

        _, p_val = stats.ttest_1samp(np.log(valid), 0)

        self.avg_ratio = np.mean(valid)
        self.p_value = p_val
Example #30
0
def one_sample_ttest(category, base):
    """Print the t and p values of a one-sample t-test for one category.

    The ratings returned by ``load_rating_data(category)`` are tested
    against the reference mean ``base``. Nothing is returned.
    """
    raw_rating, raw_population = load_rating_data(category)
    rating = np.array(raw_rating)
    # Converted like the ratings, but not used by the test below.
    population = np.array(raw_population)
    t4, prob4 = stats.ttest_1samp(rating, base)
    for label, value in (("t value of ", t4), ("p value of ", prob4)):
        print(label + category + str(value))
# NOTE(review): show_data, samples and num_samples are defined outside this
# fragment; show_data presumably displays and/or saves the current figure
# under out_file -- confirm against its definition.
out_file = 'BMI_data_China.svg'
show_data(out_file, out_dir='.')

# Plot for the 1-sample T-test: the China sample against the reference
# value 25, which is drawn as a dashed horizontal line.
plt.plot(samples['China'], 'o', label='China')
plt.hlines(25, 0, num_samples, linestyles='--')
plt.xlim(0, 50)
plt.xlabel('Subject-Nr')
plt.ylabel('BMI')
plt.legend()
plt.tight_layout()
out_file = 'BMI_China.jpg'
show_data(out_file, out_dir='.')

# ... and do the corresponding T-test (two-sided test of the sample mean
# against popmean=25; only the p-value is reported)
t, p = stats.ttest_1samp(samples['China'], popmean=25)
print(f'Is China just at the limit of over-weight (BMI=25)? p={p}')

# Comparison between two independent groups: plot the German sample and
# mark its mean with a dashed blue line.
plt.plot(samples['Germany'], 'b*', ms=3, label='Germans')
plt.hlines(np.mean(samples['Germany']),
           0,
           num_samples,
           linestyles='--',
           color='b',
           label='mean-Germany')

plt.plot(np.arange(num_samples, 2 * num_samples),
         samples['Austria'],
         'ro',
         ms=3,
Example #32
0
    def compute_transitions(self):
        """Compute group-level transition scores from single-cell edges.

        Reads the adjacency in ``self._adata.uns['velocyto_transitions']`` and
        the group codes in ``self._adata.obs[self._groups_key]``, and writes
        two matrices: ``self.transitions_ttest`` and
        ``self.transitions_confidence``. Both are stored transposed, so that
        entry ``ij`` means a transition from ``j`` to ``i``.
        """
        # analogous code using networkx
        # membership = adata.obs['clusters'].cat.codes.tolist()
        # partition = defaultdict(list)
        # for n, p in zip(list(range(len(G))), membership):
        #     partition[p].append(n)
        # partition = partition.values()
        # g_abstracted = nx.quotient_graph(g, partition, relabel=True)
        # for some reason, though, edges aren't oriented in the quotient
        # graph...
        import igraph
        # cluster graph that keeps every single-cell edge (no edge combining)
        g = utils.get_igraph_from_adjacency(
            self._adata.uns['velocyto_transitions'], directed=True)
        vc = igraph.VertexClustering(
            g, membership=self._adata.obs[self._groups_key].cat.codes.values)
        cg_full = vc.cluster_graph(combine_edges=False)

        # boolean adjacency: summing the combined edges counts the edges
        # between each pair of groups
        g_bool = utils.get_igraph_from_adjacency(
            self._adata.uns['velocyto_transitions'].astype('bool'), directed=True)
        vc_bool = igraph.VertexClustering(
            g_bool, membership=self._adata.obs[self._groups_key].cat.codes.values)
        cg_bool = vc_bool.cluster_graph(combine_edges='sum')  # collapsed version
        transitions = utils.get_sparse_from_igraph(cg_bool, weight_attr='weight')
        # translate this into a confidence measure
        # the number of outgoing edges
        # total_n = np.zeros(len(vc.sizes()))
        # # (this is not the convention of standard stochastic matrices)
        # total_outgoing = transitions.sum(axis=1)
        # for i in range(len(total_n)):
        #     total_n[i] = vc.subgraph(i).ecount()
        #     total_n[i] += total_outgoing[i, 0]
        # use the topology based reference, the velocity one might have very small numbers
        total_n = self._neighbors.n_neighbors * np.array(vc_bool.sizes())
        transitions_ttest = transitions.copy()
        transitions_confidence = transitions.copy()
        from scipy.stats import ttest_1samp
        for i in range(transitions.shape[0]):
            # no symmetry in transitions, hence we should not restrict to
            # upper triangle
            neighbors = transitions[i].nonzero()[1]
            for j in neighbors:
                forward = cg_full.es.select(_source=i, _target=j)['weight']
                backward = cg_full.es.select(_source=j, _target=i)['weight']
                # backward direction: add minus sign
                values = np.array(list(forward) + list(-np.array(backward)))
                # require some minimal number of observations
                if len(values) < 5:
                    transitions_ttest[i, j] = 0
                    transitions_ttest[j, i] = 0
                    transitions_confidence[i, j] = 0
                    transitions_confidence[j, i] = 0
                    continue
                # test the signed edge weights against a mean of zero
                t, prob = ttest_1samp(values, 0.0)
                # NOTE: the order of the paired assignments below matters
                # when i == j, because the second write overwrites the first.
                if t > 0:
                    # number of outgoing edges greater than number of ingoing edges
                    # i.e., transition from i to j
                    # p-values are floored at 1e-10, so the score is at most 10
                    transitions_ttest[i, j] = -np.log10(max(prob, 1e-10))
                    transitions_ttest[j, i] = 0
                else:
                    transitions_ttest[j, i] = -np.log10(max(prob, 1e-10))
                    transitions_ttest[i, j] = 0
                # geom_mean
                geom_mean = np.sqrt(total_n[i] * total_n[j])
                # signed difference in edge counts, scaled by the geometric mean
                diff = (len(forward) - len(backward)) / geom_mean
                if diff > 0:
                    transitions_confidence[i, j] = diff
                    transitions_confidence[j, i] = 0
                else:
                    transitions_confidence[j, i] = -diff
                    transitions_confidence[i, j] = 0
        transitions_ttest.eliminate_zeros()
        transitions_confidence.eliminate_zeros()
        # transpose in order to match convention of stochastic matrices
        # entry ij means transition from j to i
        self.transitions_ttest = transitions_ttest.T
        self.transitions_confidence = transitions_confidence.T
# NOTE(review): df, ss, plt, sns and matplotlib are defined/imported outside
# this fragment. DataFrame.ix was deprecated and later removed from pandas;
# on a current pandas these calls need .loc / .iloc -- confirm the target
# pandas version before running.
print(df.tail())
print(df.columns) # extracting column names
print(df.index)   # extracting row names or the index
print(df.T)       # transpose data
print(df.sort)    # NOTE(review): not called, so this prints the attribute itself rather than sorted data
print(df.ix[:,0].head()) # extracting a specific column
print(df.ix[10:20,0:3])
print(df.drop(df.columns[[1,2]], axis = 1).head())
 # axis=1 tells drop to remove columns; with axis=0 it removes index (row) labels

 # Descriptive statistics
print(df.describe())

 # Hypothesis testing
 # perform a one-sample t-test using 1500 as the true mean
print(ss.ttest_1samp(a=df.ix[:,'Abra'],popmean=1500))

# Visualization
matplotlib.rcdefaults()

plt.show(df.plot(kind = 'box'))

pd.options.display.mpl_style = 'default' # Sets the plotting display theme to ggplot2
df.plot(kind = 'box')

sns.boxplot(data=df,width=0.5)
sns.violinplot(df,width=3.5)

plt.show(sns.distplot(df.ix[:,2], rug = True, bins = 15))

with sns.axes_style("white"):
Example #34
0
from scipy import stats
from scipy.stats import t

__author__ = 'zzt'


def ttest(a, mu):
    """Return ``[t, p]`` of a two-sided one-sample t-test of ``a`` against ``mu``.

    The standard deviation uses ddof=1 and the p-value comes from the
    Student t distribution with ``len(a) - 1`` degrees of freedom.
    """
    n_obs = len(a)
    sample_sd = std(a, ddof=1)
    t_value = (mean(a) - mu) / sample_sd * sqrt(n_obs)
    # Two-sided: double the upper-tail probability of |t|.
    p_value = t.sf(abs(t_value), n_obs - 1) * 2
    return [t_value, p_value]


class Solution():
    """Wrapper around ``ttest`` that rounds the result to six decimals."""

    def ttest_1samp(self, a, popmean):
        """Return rounded ``[t, p]``, or ``[None, None]`` for empty input."""
        if len(a) != 0:
            return self.res(ttest(a, popmean))
        return [None, None]

    def res(self, a):
        """Round every entry of ``a`` to six decimal places."""
        rounded = []
        for value in a:
            rounded.append(round(value, 6))
        return rounded


if __name__ == '__main__':
    # Print scipy's reference result, then the hand-rolled one, for the
    # same sample.
    rvs = [2, 2, 3]
    reference = stats.ttest_1samp(rvs, 2)
    print(reference)
    # print(ttest(rvs, 2))
    print(Solution().ttest_1samp(rvs, 2))
Example #35
0
import csv

import numpy as np
import pandas as pd
from scipy import stats

# I. Load data
iq_data = np.loadtxt("iqdata.csv")
iq1 = iq_data[0:10000]

# The population statistics do not change between test cases, so compute
# them (and the fitted normal distribution) once.
iq_mean = np.mean(iq1)
iq_std = np.std(iq1)
iq_dist = stats.norm(iq_mean, iq_std)

# II. Process the test cases. The first line of testcaseiq.txt holds the
# number of cases; each case then contributes three lines: lower bound,
# upper bound and the base name of a sample CSV file. The context manager
# closes the handle when the loop is done.
with open("testcaseiq.txt") as f:
    nooftestcases = f.readline().strip()

    for i in range(1, int(nooftestcases) + 1):
        with open("output{}.csv".format(i), "w") as out:
            writer = csv.writer(out)
            writer.writerow([str(round(iq_mean, 2))])
            writer.writerow([str(round(iq_std, 2))])

            # Percentage of the fitted normal that lies between the bounds.
            lower_value = f.readline().strip()
            upper_value = f.readline().strip()
            probability = np.subtract(
                iq_dist.cdf(int(upper_value)),
                iq_dist.cdf(int(lower_value))) * 100
            writer.writerow([str(np.round(probability, 3))])

            # One-sample t-test of the sample file against the population
            # mean; [1][0] is the p-value of the first column.
            sample_name = f.readline().strip()
            sample = pd.read_csv("{}.csv".format(sample_name))
            test_result = stats.ttest_1samp(a=sample, popmean=iq_mean)
            if test_result[1][0] < 0.05:
                writer.writerow(["Reject"])
            else:
                writer.writerow(["Accept"])
Example #36
0
def check_sample_mean(sample, popmean):
    """Assert that the sample mean is not implausibly far from ``popmean``.

    Raises AssertionError unless the one-sample t-test p-value is above 0.01.
    """
    outcome = stats.ttest_1samp(sample, popmean)
    assert outcome.pvalue > 0.01
Example #37
0
def Ses_test(Yt, columns):
    """Run four heuristic seasonality checks on a time series.

    Parameters
    ----------
    Yt : pandas.Series
        Observations. ``reset_index`` is called on it, so a pandas object is
        required; it is then passed through ``Out_function_1``.
    columns : sequence
        Labels stored in the ``date`` column, which becomes the index of the
        frame handed to the STL decomposition.

    Returns
    -------
    list of bool
        One flag per check, in this order: seasonal-index t-test,
        periodogram frequency window, ACF of the detrended series, and BIC
        comparison of a seasonal against a trend-only OLS model.
    """
    seasonal = [False, False, False, False]
    Yt = Yt.reset_index(drop=True)
    Yt, _ = Out_function_1(Yt)
    # Exact zeros are replaced by 1 before decomposing (presumably so that
    # the multiplicative model can be fitted -- TODO confirm).
    Yt[np.where((Yt == 0) == True)[0]] = 1

    # Check 0: statistical test of the seasonal index against 1.
    if len(Yt) > 5:
        try:
            result = seasonal_decompose(Yt, model='multiplicative', freq=5)
        except Exception:
            # Fall back to the additive model when the multiplicative one
            # cannot be fitted.
            result = seasonal_decompose(Yt, model='additive', freq=5)
        _, p_value = stats.ttest_1samp(result.seasonal, 1)
        if p_value < 0.05:
            seasonal[0] = True

    # Candidate period ``p``: the first lag at which the ACF drops from
    # above to below the upper bound, capped at 12; 12 when there is none.
    res = acf(Yt)
    ub = 1.96 / np.sqrt(len(Yt))
    for i in range(1, len(res) - 1):
        if res[i] > ub and res[i + 1] < ub:
            p = min(i, 12)
            break
    else:
        p = 12
    ts_data = pd.DataFrame({'date': columns, 'data': Yt})
    ts_data.set_index('date', inplace=True)

    # Check 1: periodogram.
    # Frequency grid with a step of 12 / len(Yt), truncated to the length
    # of the estimated spectrum.
    freq = [0] * len(Yt)
    freq[0] = 12 / len(Yt)
    for i in range(1, len(Yt)):
        freq[i] = freq[i - 1] + freq[0]
    freq = np.array(freq)

    _, spec = signal.periodogram(Yt)
    freq = freq[:len(spec)]
    # NOTE(review): this is the largest grid frequency, not the frequency
    # of the spectral peak -- confirm that this is intended.
    freq_max = max(freq)
    if p - 1.5 < freq_max < p + 1.5:
        seasonal[1] = True

    # Check 2: auto-correlation function of the detrended series.
    try:
        trend_frame = stldecompose.decompose(ts_data).trend  # extract the trend element
    except Exception:
        # The previous fallback was a plain list, which crashed on
        # ``reset_index``; without a trend the check is simply skipped.
        Tt = None
    else:
        Tt = trend_frame.reset_index(drop=True)['data']
    if Tt is not None and sum(Tt.isnull()) == 0:
        At = Yt - Tt  # detrend time series
        acf_val = acf(At)
        lag_val = [0] * 75
        for i in range(1, len(lag_val)):
            lag_val[i] = lag_val[i - 1] + (1 / 12)
        ind = np.where(acf_val == np.min(acf_val))[0]
        lag = lag_val[ind[0]]
        # Same +/- 1.5 window around ``p`` as the periodogram check; the
        # previous condition tested ``lag < p - 1.5`` on both sides.
        seasonal[2] = p - 1.5 < lag < p + 1.5

    # Check 3: seasonal model.
    # ``seas`` cycles 1..12 and ``trend`` grows by 1/12 per observation.
    seas = [0] * len(Yt)
    j = 1
    for i in range(len(Yt)):
        seas[i] = j
        j += 1
        if j > 12:
            j = 1
    trend = [0] * len(Yt)
    trend[0] = 1
    for i in range(1, len(Yt)):
        trend[i] = trend[i - 1] + (1 / 12)

    df = pd.DataFrame({'Yt': Yt, 'seas': seas, 'trend': trend})
    y = df["Yt"]
    m1 = sm.OLS(y, df[["seas", "trend"]]).fit()
    m2 = sm.OLS(y, df[["trend"]]).fit()
    bic = [m1.bic, m2.bic]
    # Flag seasonality when the model with the seasonal term has the
    # lowest BIC.
    arrind = np.where(bic == np.min(bic))[0][0]
    if arrind == 0:
        seasonal[3] = True
    return seasonal
Example #38
0
plt.show()

# scipy
# justpaste.it/6ov7y
# justpaste.it/5woh8

# Hypothesis testing
# A hypothesis is a claim which can be true or false.
# According to KQ the mean flight transaction is 56000.
# Null hypothesis (H0) - the mean (flight trans) is equal to 56000
# Alternative hypothesis (H1) - our sample mean is not equal to 56000

import scipy  # must be installed first
from scipy.stats import ttest_1samp

# One-sample t-test of the FlightTrans column against the claimed mean.
# NOTE(review): df is defined outside this fragment.
statistics, pvalue = ttest_1samp(df['FlightTrans'], 56000)
print('p value is :', pvalue)
# Significance level used for the decision below.
alpha = 0.05
if pvalue < alpha:
    print('Reject Null Hypothesis')
    print('Accept the alternative')
    print('Alternative Hypothesis(H1)-our sample mean is not equal to 56000')
else:
    print('Accept Null Hypothesis')
    print(
        'Null Hypothesis(H0) - Our sample mean(FlightTrans) is equal to 56000')

# Further topics: sample tests, ANOVA, chi-square
# Exercise: pick any data set and do a few plots
# on either classification, regression or clustering;
# hand in a 1-page document explaining your work and a link to your code.
Example #39
0
import random
import numpy as np
import scipy.stats as ss
import matplotlib.pyplot as plt

# Two random samples of 100 distinct integers drawn from 0..500.
# Single-argument print calls are used throughout so the script runs
# unchanged on Python 2 and Python 3.
x = random.sample(range(0, 501), 100)
print(x)

y = random.sample(range(0, 501), 100)
print(y)

# The following code tests our random sample x against the mean of 250.

print('H0 = There is no statistical significance between the random sample x and a mean of the value 250.')
print('HA = There is a statistical significance between the random sample x and a mean of the value 250.')
ttest_sampx = ss.ttest_1samp(x, 250)
print(ttest_sampx)

# The following code uses the output of the t-test to print the correct analysis.
# ttest_sampx[1] is the two-sided p-value.

if ttest_sampx[1] > 0.05:
    print("The H0 is not rejected. There is no statistical evidence against the H0.")
elif ttest_sampx[1] < 0.01:
    # A small p-value is evidence against H0; the earlier wording said "for".
    print("H0 is rejected. There is statistical evidence against the H0.")
else:
    print("There is a weak correlation between the samples x and a mean of 250, therefore the H0 is rejected.")
print("__________________________________")

# The following code tests our random sample y against the mean of 250.

print("___________________________________")
Example #40
0
# -*- coding: utf-8 -*-
"""
Created on Wed Jul  4 23:09:05 2018

@author: nandavari
"""
import numpy as np
from scipy import stats
from scipy.stats import norm, uniform

# Cumulative distribution function of the standard normal, evaluated
# element-wise on an array.
print(norm.cdf(np.array([1, -1, 0, 1, 3, 4, -2, 6])))

# The Percent Point Function (PPF) is the inverse of the CDF, so the median
# of a distribution is its PPF at 0.5.
print(norm.ppf(0.5))

# The size keyword argument produces a sequence of random variates.
print(norm.rvs(size=5))

# Uniform distribution: CDF of the uniform distribution with loc=1 and
# scale=4.
print(uniform.cdf([0, 1, 2, 3, 4, 5], loc=1, scale=4))

# ttest_1samp: two columns of 50 normal variates (loc=5, scale=10), each
# tested against a population mean of 5.0.
rvs = stats.norm.rvs(loc=5, scale=10, size=(50, 2))
print(stats.ttest_1samp(rvs, 5.0))
Example #41
0
 def compute_transitions_old(self):
     """Compute group-level transition scores from single-cell edges.

     Reads the adjacency in ``self._adata.uns['velocyto_transitions']`` and
     the group codes in ``self._adata.obs[self._groups_key]``, and writes
     ``self.transitions_ttest`` and ``self.transitions_confidence``. Both
     are stored transposed, so that entry ``ij`` means a transition from
     ``j`` to ``i``.
     """
     import igraph
     g = utils.get_igraph_from_adjacency(
         self._adata.uns['velocyto_transitions'], directed=True)
     vc = igraph.VertexClustering(
         g, membership=self._adata.obs[self._groups_key].cat.codes.values)
     # this stores all single-cell edges in the cluster graph
     cg_full = vc.cluster_graph(combine_edges=False)
     # this is the boolean version that simply counts edges in the clustered graph
     g_bool = utils.get_igraph_from_adjacency(
         self._adata.uns['velocyto_transitions'].astype('bool'),
         directed=True)
     vc_bool = igraph.VertexClustering(
         g_bool,
         membership=self._adata.obs[self._groups_key].cat.codes.values)
     cg_bool = vc_bool.cluster_graph(
         combine_edges='sum')  # collapsed version
     transitions = utils.get_sparse_from_igraph(cg_bool,
                                                weight_attr='weight')
     # reference count per group: number of neighbors times group size
     total_n = self._neighbors.n_neighbors * np.array(vc_bool.sizes())
     transitions_ttest = transitions.copy()
     transitions_confidence = transitions.copy()
     from scipy.stats import ttest_1samp
     for i in range(transitions.shape[0]):
         # every non-zero entry of row i is visited; the matrix is not
         # assumed to be symmetric
         neighbors = transitions[i].nonzero()[1]
         for j in neighbors:
             forward = cg_full.es.select(_source=i, _target=j)['weight']
             backward = cg_full.es.select(_source=j, _target=i)['weight']
             # backward direction: add minus sign
             values = np.array(list(forward) + list(-np.array(backward)))
             # require some minimal number of observations
             if len(values) < 5:
                 transitions_ttest[i, j] = 0
                 transitions_ttest[j, i] = 0
                 transitions_confidence[i, j] = 0
                 transitions_confidence[j, i] = 0
                 continue
             # test the signed edge weights against a mean of zero
             t, prob = ttest_1samp(values, 0.0)
             # NOTE: the order of the paired assignments below matters when
             # i == j, because the second write overwrites the first.
             if t > 0:
                 # number of outgoing edges greater than number of ingoing edges
                 # i.e., transition from i to j
                 # p-values are floored at 1e-10, so the score is at most 10
                 transitions_ttest[i, j] = -np.log10(max(prob, 1e-10))
                 transitions_ttest[j, i] = 0
             else:
                 transitions_ttest[j, i] = -np.log10(max(prob, 1e-10))
                 transitions_ttest[i, j] = 0
             # geom_mean
             geom_mean = np.sqrt(total_n[i] * total_n[j])
             # signed difference in edge counts, scaled by the geometric mean
             diff = (len(forward) - len(backward)) / geom_mean
             if diff > 0:
                 transitions_confidence[i, j] = diff
                 transitions_confidence[j, i] = 0
             else:
                 transitions_confidence[j, i] = -diff
                 transitions_confidence[i, j] = 0
     transitions_ttest.eliminate_zeros()
     transitions_confidence.eliminate_zeros()
     # transpose in order to match convention of stochastic matrices
     # entry ij means transition from j to i
     self.transitions_ttest = transitions_ttest.T
     self.transitions_confidence = transitions_confidence.T
### SMOTE
# Only numeric/boolean and non-null values can be used as input, so this is
# better tried AFTER missing-value imputation and encoding.
# NOTE(review): the original note said "TSNE model", but the code below
# oversamples with SMOTE -- confirm which one was meant.
# NOTE(review): train, data and stats are defined outside this fragment.
# fit_sample is the older imbalanced-learn name; newer releases call it
# fit_resample -- confirm against the installed version.
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_train_new, y_train_new = sm.fit_sample(train.dropna().iloc[:, 1:44],
                                         train.dropna()['Dependent_Variable'])

##### Check if the sample represents the population: Central Limit Theorem, https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test

# Hypothesis testing, degrees of freedom, t-statistics, Student's t-test etc.
# References: http://www.scipy-lectures.org/packages/statistics/index.html#pairplot-scatter-matrices
# scipy.stats.ttest_1samp() tests whether the population mean of the data is
# likely to be equal to a given value (technically, whether the observations
# are drawn from a Gaussian distribution with the given population mean).
# It returns the T statistic and the p-value.

## 1-sample t-test
# The results of these calls are not stored, so they are only shown in an
# interactive session.
stats.ttest_1samp(data['VIQ'], 0)
stats.ttest_1samp(train['N32'].dropna(), 0)

# 2-sample t-test
female_viq = data[data['Gender'] == 'Female']['VIQ']
male_viq = data[data['Gender'] == 'Male']['VIQ']
stats.ttest_ind(female_viq, male_viq)

# Paired t-test
# NOTE(review): ttest_ind treats the two columns as independent samples; the
# paired comparison is the 1-sample test on the differences on the next line.
stats.ttest_ind(data['FSIQ'], data['PIQ'])
stats.ttest_1samp(data['FSIQ'] - data['PIQ'], 0)

# Skewness and kurtosis
# Skewness is a measure of asymmetry. Kurtosis is a more subtle measure of
# peakedness compared to a Gaussian distribution.
from scipy.stats import kurtosis, skew
# NOTE(review): notnull() returns a boolean mask, so this is the kurtosis of
# the mask rather than of the N35 values -- confirm whether dropna() was meant.
kurtosis(train['N35'].notnull())
Example #43
0
 def tek_orneklem_t_testi(self, beklenen_deger):
     """Return the one-sample t-test p-value of ``self.choice_array``.

     ``beklenen_deger`` is the population mean to test against. The result
     is converted to a plain Python float.
     """
     sonuc = stats.ttest_1samp(self.choice_array, popmean=beklenen_deger)
     return float(sonuc.pvalue)
    # NOTE(review): this fragment starts inside a function whose header is
    # not visible; group, index, t_vals, p_vals, event, structure, speed,
    # PATHS and sig_marks are defined outside it.
    for col in group.columns:

        # only columns whose name starts with 'k_' are tested
        if col.startswith('k_'):

            # plot the quantiles plot to see if the data is normally distributed
            fig = qqplot(group[col], line='45')
            plot_dir = os.path.join(PATHS['figures_dir'], 'quantile-plots',
                                    event, structure, '{:1.1f}'.format(speed))
            plot_dir = utils.mkdir(plot_dir)
            fig.savefig(os.path.join(plot_dir, '{}.png'.format(col)))
            # close the figure once it has been written
            plt.close(fig)

            # compute the t statistic to see if the value is significantly
            # different than zero
            t_stat, p_val = ttest_1samp(group[col], 0.0)

            index.append(col)
            t_vals.append(t_stat)
            p_vals.append(p_val)

    #mark = np.zeros((num_schedules, num_sensors, num_actuators), dtype=bool)
    # NOTE(review): the shape is hard-coded; going by the commented line
    # above it presumably stands for (schedules, sensors, actuators) --
    # confirm.
    mark = np.zeros((20, 6, 12), dtype=bool)

    for gain, p_val in zip(index, p_vals):
        if p_val < 0.05:
            # column names look like 'k_<i>_<j>_<k>': drop the 'k_' prefix
            # and use the three integers as the position in mark
            i, j, k = [int(n) for n in gain[2:].split('_')]
            mark[i, j, k] = True

    sig_marks[(speed, event)] = mark
Example #45
0
# seaborn: a visualization package built on Matplotlib that adds various color themes and statistical charts.

# One-sample t-test
# Test whether the mean of a single group is 0 (using random numbers).
# Null hypothesis: the mean of the data is 0.
# Alternative hypothesis: the mean of the data is not 0.
np.random.seed(123)
mu = 0
n = 10  # the more data, the closer the sample mean gets to 0, e.g. 1000, 10000
x = stats.norm(mu).rvs(n)  # norm: normal distribution, rvs: draw a random sample
print(x, np.mean(x))  # mean : -0.26951611032632805

# sns.distplot(x, kde=False, rug=True, fit=stats.norm)  # visualization; passing kde=False skips the density curve
# plt.show()

result = stats.ttest_1samp(x, popmean=0)  # (data, expected mean)
print('result : ', result)
# Ttest_1sampResult(statistic=-0.6540040368674593, pvalue=0.5294637946339893) statistic: the test statistic
# Interpretation: p-value (0.529463) > 0.05 (significance level), so the null hypothesis is retained: the mean of the data is 0.

# * Hypothesis test for the mean of a single population (one-sample t-test)
# Practice example 1)
# Read the file holding the exam results of class 1, grade 1 of middle school A (test of the mean Korean-language score): student.csv
# Null hypothesis: the mean Korean-language score is 80.
# Alternative hypothesis: the mean Korean-language score is not 80.

data = pd.read_csv('../testdata/student.csv')
print(data.head())
print(data.describe())
# The column tested below holds the Korean-language scores.
result2 = stats.ttest_1samp(data.국어, popmean=80)
print('result2 : ', result2)
Example #46
0
def descstats(data, cols=None, axis=0):
    '''
    Build a text report of descriptive statistics for one or multiple variables.

    Parameters
    ------------
    data: numpy array
        The data; it is converted with ``np.array`` before use.

    cols: list, optional
        A list of the column number or field names (for a recarray) of variables.
        Default is all columns. In the univariate report it is only used as
        the printed variable name.

    axis: 1 or 0
        axis order of data.  Default is 0 for column-ordered data.
        NOTE: the body below never reads this argument.

    Returns
    -------
    desc: str
        The formatted report. For a single column it contains the summary
        statistics, percentiles and tests of location; for several columns
        it is one line per column.

    Raises
    ------
    ValueError
        If the second dimension of the data is neither 1 nor greater than 1.

    Examples
    --------
    >>> descstats(data.exog,cols=['x_1','x_2','x_3'])
    '''

    x = np.array(data)  # or rather, the data we're interested in
    if cols is None:
#       if isinstance(x, np.recarray):
#            cols = np.array(len(x.dtype.names))
        # promote 1-D input to a single column so x.shape[1] exists below
        if not isinstance(x, np.recarray) and x.ndim == 1:
            x = x[:,None]

    # NOTE(review): 1-D input combined with an explicit ``cols`` skips the
    # reshape above, so x.shape[1] would fail -- confirm the intended usage.
    if x.shape[1] == 1:
        # NOTE(review): stats.ss is not available in recent SciPy releases --
        # confirm the SciPy version this targets.
        desc = '''
    ---------------------------------------------
    Univariate Descriptive Statistics
    ---------------------------------------------

    Var. Name   %(name)12s
    ----------
    Obs.          %(nobs)22i  Range                  %(range)22s
    Sum of Wts.   %(sum)22s  Coeff. of Variation     %(coeffvar)22.4g
    Mode          %(mode)22.4g  Skewness                %(skewness)22.4g
    Repeats       %(nmode)22i  Kurtosis                %(kurtosis)22.4g
    Mean          %(mean)22.4g  Uncorrected SS          %(uss)22.4g
    Median        %(median)22.4g  Corrected SS            %(ss)22.4g
    Variance      %(variance)22.4g  Sum Observations        %(sobs)22.4g
    Std. Dev.     %(stddev)22.4g
    ''' % {'name': cols, 'sum': 'N/A', 'nobs': len(x), 'mode': \
    stats.mode(x)[0][0], 'nmode': stats.mode(x)[1][0], \
    'mean': x.mean(), 'median': np.median(x), 'range': \
    '('+str(x.min())+', '+str(x.max())+')', 'variance': \
    x.var(), 'stddev': x.std(), 'coeffvar': \
    stats.variation(x), 'skewness': stats.skew(x), \
    'kurtosis': stats.kurtosis(x), 'uss': stats.ss(x),\
    'ss': stats.ss(x-x.mean()), 'sobs': np.sum(x)}

#    ''' % {'name': cols[0], 'sum': 'N/A', 'nobs': len(x[cols[0]]), 'mode': \
#    stats.mode(x[cols[0]])[0][0], 'nmode': stats.mode(x[cols[0]])[1][0], \
#    'mean': x[cols[0]].mean(), 'median': np.median(x[cols[0]]), 'range': \
#    '('+str(x[cols[0]].min())+', '+str(x[cols[0]].max())+')', 'variance': \
#    x[cols[0]].var(), 'stddev': x[cols[0]].std(), 'coeffvar': \
#    stats.variation(x[cols[0]]), 'skewness': stats.skew(x[cols[0]]), \
#    'kurtosis': stats.kurtosis(x[cols[0]]), 'uss': stats.ss(x[cols[0]]),\
#    'ss': stats.ss(x[cols[0]]-x[cols[0]].mean()), 'sobs': np.sum(x[cols[0]])}

        # percentile table: one value per listed percentile, in order
        desc+= '''

    Percentiles
    -------------
    1  %%          %12.4g
    5  %%          %12.4g
    10 %%          %12.4g
    25 %%          %12.4g

    50 %%          %12.4g

    75 %%          %12.4g
    90 %%          %12.4g
    95 %%          %12.4g
    99 %%          %12.4g
    ''' % tuple([stats.scoreatpercentile(x,per) for per in (1,5,10,25,
                50,75,90,95,99)])
        # tests of location against zero: Student's t, sign test and
        # Wilcoxon signed rank
        t,p_t=stats.ttest_1samp(x,0)
        M,p_M=sign_test(x)
        S,p_S=stats.wilcoxon(np.squeeze(x))

        desc+= '''

    Tests of Location (H0: Mu0=0)
    -----------------------------
    Test                Statistic       Two-tailed probability
    -----------------+-----------------------------------------
    Student's t      |  t %7.5f   Pr > |t|   <%.4f
    Sign             |  M %8.2f   Pr >= |M|  <%.4f
    Signed Rank      |  S %8.2f   Pr >= |S|  <%.4f

    ''' % (t,p_t,M,p_M,S,p_S)
# Should this be part of a 'descstats'
# in any event these should be split up, so that they can be called
# individually and only returned together if someone calls summary
# or something of the sort

    elif x.shape[1] > 1:
        desc ='''
    Var. Name   |     Obs.        Mean    Std. Dev.           Range
    ------------+--------------------------------------------------------'''+\
            os.linesep

# for recarrays with columns passed as names
#        if isinstance(cols[0],str):
#            for var in cols:
#                desc += "%(name)15s %(obs)9i %(mean)12.4g %(stddev)12.4g \
#%(range)20s" %  {'name': var, 'obs': len(x[var]), 'mean': x[var].mean(),
#        'stddev': x[var].std(), 'range': '('+str(x[var].min())+', '\
#                +str(x[var].max())+')'+os.linesep}
#        else:
        # one report line per column; the column index is printed as its name
        for var in range(x.shape[1]):
                desc += "%(name)15s %(obs)9i %(mean)12.4g %(stddev)12.4g \
%(range)20s" % {'name': var, 'obs': len(x[:,var]), 'mean': x[:,var].mean(),
                'stddev': x[:,var].std(), 'range': '('+str(x[:,var].min())+', '+\
                str(x[:,var].max())+')'+os.linesep}
    else:
        raise ValueError("data not understood")

    return desc
Example #47
0
    'long_short'] = ewret_transposed.winners - ewret_transposed.losers

# Compute Long-Short Portfolio Cumulative Returns
# Each cumulative return compounds the period returns: prod(1 + r) - 1.
# NOTE(review): ewret_transposed is built outside this fragment.
ewret_transposed['cumret_winners'] = (1 +
                                      ewret_transposed.winners).cumprod() - 1
ewret_transposed['cumret_losers'] = (1 + ewret_transposed.losers).cumprod() - 1
ewret_transposed['cumret_long_short'] = (
    1 + ewret_transposed.long_short).cumprod() - 1

# Portfolio Summary #
# Mean #
# Mean return per portfolio, reshaped into a frame with a 'mean' column.
mom_mean = ewret_transposed[['winners', 'losers',
                             'long_short']].mean().to_frame()
mom_mean = mom_mean.rename(columns={0: 'mean'}).reset_index()
# T-Value and P-Value #
# One-sample t-test of each return series against 0.0; every result becomes
# a one-row frame whose columns 0 and 1 hold the statistic and the p-value.
t_losers = pd.Series(stats.ttest_1samp(ewret_transposed['losers'],
                                       0.0)).to_frame().T
t_winners = pd.Series(stats.ttest_1samp(ewret_transposed['winners'],
                                        0.0)).to_frame().T
t_long_short = pd.Series(stats.ttest_1samp(ewret_transposed['long_short'],
                                           0.0)).to_frame().T

# Label each row with its portfolio name.
t_losers['momr'] = 'losers'
t_winners['momr'] = 'winners'
t_long_short['momr'] = 'long_short'

# Stack the three rows and give the two result columns readable names.
t_output = pd.concat([t_winners, t_losers, t_long_short]).rename(columns={
    0: 't-stat',
    1: 'p-value'
})

# Combine mean, t and p and format output
Example #48
0
def t_statistic(df):
    """Test the OldTown living area against the overall mean living area.

    Runs a one-sample t-test of the ``GrLivArea`` values of the rows whose
    ``Neighborhood`` is ``'OldTown'`` against the mean ``GrLivArea`` of all
    rows. Returns ``(p_value, p_value > threshold)`` where the threshold is
    the 0.90 quantile of the standard normal distribution.
    """
    living_area = df['GrLivArea']
    old_town_area = df.loc[df['Neighborhood'] == 'OldTown', 'GrLivArea']
    _, p_value = stats.ttest_1samp(a=old_town_area, popmean=living_area.mean())
    # NOTE(review): the threshold is a z quantile (about 1.28), so a p-value
    # can never exceed it -- confirm what the flag is meant to express.
    threshold = stats.norm.ppf(.90)
    exceeds_threshold = p_value > threshold
    return p_value, exceeds_threshold
# NOTE(review): my_test, brain, brain1, brain2, sx, alphas, thresh and
# data_dir are defined outside this fragment, which is also cut off inside
# the final ``if`` body.
# Choose the per-feature test from the name in my_test.
if my_test.find('i') >= 0:
    # Pearson correlation of each feature with sx, ignoring NaN entries
    pvals = []
    nfeats = brain.shape[1]
    nsubjs = brain.shape[0]
    for i in range(nfeats):
        vec = brain[:, i]
        keep = ~np.isnan(vec)
        # 2-tailed p-value by default
        pvals.append(stats.pearsonr(sx[keep], vec[keep])[1])
    pvals = np.array(pvals)
elif my_test.find('VS') >= 0:
    # two-group comparison: independent t-test with equal variances
    nsubjs = brain1.shape[0] + brain2.shape[0]
    pvals = stats.ttest_ind(brain1, brain2, axis=0, equal_var=True)[1]
else:
    # single group: one-sample t-test of each feature against 0
    nsubjs = brain.shape[0]
    pvals = stats.ttest_1samp(brain, popmean=0, axis=0)[1]

# FDR correction is applied only to the features with a defined p-value.
idx = ~np.isnan(pvals)
for a in alphas:
    reject_fdr, pval_fdr = mne.stats.fdr_correction(pvals[idx],
                                                    alpha=a,
                                                    method='indep')
    num_good = np.sum(reject_fdr)
    if num_good > 0:
        print '\n\nGood voxels at %.1e: %d\n\n' % (a, num_good)
        # if we have any good voxels left, put them in their original positions
        # convert the two-sided p-values back to t magnitudes
        tvals = -stats.distributions.t.ppf(pvals / 2, nsubjs - 1)
        # NOTE(review): reject_fdr has one entry per non-NaN p-value while
        # tvals has one per feature; the mask only fits when no p-value is
        # NaN -- confirm.
        tvals[~reject_fdr] = 0
        tvals[~idx] = 0
        # make .nii with p-values
        fname = data_dir + my_test + '_FDR_a%.2ft%.2f' % (a, thresh)
Example #50
0
def ttest_1samp(X):
    """Return the T-values of a one-sample t-test of ``X`` against 0."""
    t_values, _ = stats.ttest_1samp(X, 0)
    return t_values
Example #51
0
import numpy
from scipy import stats
import math
#   1.  Generate 100 normally distributed random variables with mean=0.5
#       and variance=9 (i.e. standard deviation 3).

n = 100
sample = numpy.random.normal(loc=0.5, scale=3, size=n)

#   2.  Compute the sample mean and sample variance.
#       ddof=1 gives the unbiased sample variance, which is what the
#       t-statistic is defined with; the default ddof=0 would not reproduce
#       the result of stats.ttest_1samp below.

x_bar = numpy.mean(sample)
s_squared = numpy.var(sample, ddof=1)

#   3.  Compute the test statistic for H0: mean = 0.

t_val = (x_bar - 0) / math.sqrt(s_squared / n)

#   4.  Compute the two-sided p-value from the upper tail of |t| with
#       n - 1 degrees of freedom.
#       R equivalent: p_value = pt(abs(t), n-1, lower.tail = FALSE)*2
p_value = 2 * stats.t.sf(abs(t_val), n - 1)
print(p_value)

#   5.  Use the existing function or procedure to test the hypothesis.
#       Compare the two results.

results = stats.ttest_1samp(sample, popmean=0)
print(results)
            # Draw the full curve for this regressor in its assigned colour.
            pl.plot(x, y, color=plotColors[colorCounter], label=regressorName)

            # Take every 40th point of y, x and stErrs for the error bars.
            # NOTE: all three use indices derived from len(y), so x and stErrs
            # are assumed to be at least as long as y.
            downSampledY = [y[index] for index in range(0, len(y), 40)]
            downSampledX = [x[index] for index in range(0, len(y), 40)]
            downSampledErr = [stErrs[index] for index in range(0, len(y), 40)]

            # Error bars only (ls='none' suppresses the connecting line).
            pl.errorbar(downSampledX,
                        downSampledY,
                        yerr=downSampledErr,
                        color=plotColors[colorCounter],
                        ls='none')

            # For each down-sampled time point, gather that time point's value
            # from every obsIndex entry of this GLM/regressor and keep the
            # p-value ([1]) of a one-sample t-test against zero.
            tTestpValsVs0 = [
                ttest_1samp([
                    allPerObsData[glmIndex][regressorIndex][obsIndex]
                    [timePointIndex] for obsIndex in range(
                        len(allPerObsData[glmIndex][regressorIndex]))
                ], 0)[1] for timePointIndex in range(0, len(y), 40)
            ]

            # Collect the down-sampled points whose p-value is below .01.
            xForSignificantOnes = []
            yForSignificantOnes = []
            for candidateIndex in range(len(downSampledY)):
                if tTestpValsVs0[candidateIndex] < .01:
                    xForSignificantOnes = xForSignificantOnes + [
                        downSampledX[candidateIndex]
                    ]
                    yForSignificantOnes = yForSignificantOnes + [
                        downSampledY[candidateIndex]
                    ]
Example #53
0
def bootstrap(x1, x2=None, paired=True, statfunction=None, smoothboot=False,
              alpha_level=0.05, reps=5000):
    '''
    Computes summary statistics and bootstrapped confidence intervals for a
    single sample, a pair of paired samples, or two independent samples.

    Keywords:
    x1: 1D array-like
        Reference (or only) sample. NaNs are dropped.

    x2: 1D array-like or None, default None
        Second sample. NaNs are dropped. When None, x1 is analysed on its
        own and `paired` is ignored.

    paired: boolean, default True
        Whether x1 and x2 are paired samples. Paired samples must have the
        same length after NaN removal.

    statfunction: function
        Summary statistic to call on data. Default is np.mean

    smoothboot: boolean, default False
        Passed as `smooth` to seaborn's bootstrap routine.

    alpha_level: float, default 0.05
        alpha = 0.05 gives 95 percent confidence interval

    reps: int, default = 5000
        number of bootstrap replicates

    Returns:

    dictionary of statistics with the keys 'ci', 'pct_ci_low',
    'pct_ci_high', 'pct_low_high_indices', 'bca_ci_low', 'bca_ci_high',
    'bca_low_high_indices', 'pvalue_1samp_ttest', 'pvalue_2samp_ind_ttest',
    'pvalue_2samp_paired_ttest', 'pvalue_wilcoxon', 'pvalue_mann_whitney'
    and 'effect_size'. P-values of tests that do not apply to the chosen
    design are reported as the string 'NIL'.

    Raises:

    ValueError if alpha_level is outside [0, 1], or if paired samples
    differ in length.
    '''

    # Function-local imports, as in the original implementation.
    import warnings

    import numpy as np
    import pandas as pd
    import seaborn as sns
    from scipy.stats import mannwhitneyu, wilcoxon
    from scipy.stats import ttest_1samp, ttest_ind, ttest_rel

    # Compute two-sided alphas.
    if alpha_level > 1. or alpha_level < 0.:
        raise ValueError("alpha_level must be between 0 and 1.")
    alphas = np.array([alpha_level / 2., 1 - alpha_level / 2.])

    # Initialise statfunction
    if statfunction is None:
        statfunction = np.mean

    sns_bootstrap_kwargs = {'func': statfunction,
                            'n_boot': reps,
                            'smooth': smoothboot}

    # Turn to pandas series.
    x1 = pd.Series(x1).dropna()

    # Tests that do not apply to the chosen design keep the 'NIL' marker.
    ttest_single = 'NIL'
    ttest_2_ind = 'NIL'
    ttest_2_paired = 'NIL'
    wilcoxonresult = 'NIL'
    mannwhitneyresult = 'NIL'

    if x2 is None:
        # Single sample: bootstrap x1 itself and test its mean against 0.
        tdata = (x1,)
        summ_stat = statfunction(x1)
        statarray = sns.algorithms.bootstrap(x1, **sns_bootstrap_kwargs)
        statarray.sort()
        ttest_single = ttest_1samp(x1, 0)[1]
        effsize = np.mean(x1)
    else:
        x2 = pd.Series(x2).dropna()
        if paired:
            if len(x1) != len(x2):
                raise ValueError('x1 and x2 are not the same length.')
            # Paired samples: bootstrap the element-wise differences.
            tx = x2 - x1
            tdata = (tx,)
            summ_stat = statfunction(tx)
            statarray = sns.algorithms.bootstrap(tx, **sns_bootstrap_kwargs)
            statarray.sort()
            ttest_2_paired = ttest_rel(x1, x2)[1]
            wilcoxonresult = wilcoxon(x1, x2)[1]
            effsize = np.mean(tx)
        else:
            # Independent samples: bootstrap each sample separately and use
            # the differences between the replicate statistics.
            # NOTE(review): bca() is defined elsewhere; the replicate
            # differences are handed to it as its data for this design -
            # confirm that matches what it expects.
            ref_statarray = sns.algorithms.bootstrap(
                x1, **sns_bootstrap_kwargs)
            exp_statarray = sns.algorithms.bootstrap(
                x2, **sns_bootstrap_kwargs)
            boot_diffs = exp_statarray - ref_statarray
            tdata = (boot_diffs,)
            summ_stat = statfunction(x2) - statfunction(x1)
            statarray = np.sort(boot_diffs)
            ttest_2_ind = ttest_ind(x1, x2)[1]
            mannwhitneyresult = mannwhitneyu(
                x1, x2, alternative='two-sided')[1]
            effsize = np.mean(x2) - np.mean(x1)

    # Get Percentile indices
    pct_low_high = np.round((reps - 1) * alphas)
    pct_low_high = np.nan_to_num(pct_low_high).astype('int')

    # Get Bias-Corrected Accelerated indices convenience function invoked.
    bca_low_high = bca(tdata, alphas, statarray,
                       statfunction, summ_stat, reps)

    # Warnings for unstable or extreme indices.
    for ind in [pct_low_high, bca_low_high]:
        if np.any(ind == 0) or np.any(ind == reps - 1):
            warnings.warn("Some values used extremal samples;"
                          " results are probably unstable.")
        elif np.any(ind < 10) or np.any(ind >= reps - 10):
            warnings.warn("Some values used top 10 low/high samples;"
                          " results may be unstable.")

    stat_dict = {
        'ci': (1 - alpha_level) * 100,
        'pct_ci_low': statarray[pct_low_high[0]],
        'pct_ci_high': statarray[pct_low_high[1]],
        'pct_low_high_indices': pct_low_high,
        'bca_ci_low': statarray[bca_low_high[0]],
        'bca_ci_high': statarray[bca_low_high[1]],
        'bca_low_high_indices': bca_low_high,
        'pvalue_1samp_ttest': ttest_single,
        'pvalue_2samp_ind_ttest': ttest_2_ind,
        'pvalue_2samp_paired_ttest': ttest_2_paired,
        'pvalue_wilcoxon': wilcoxonresult,
        'pvalue_mann_whitney': mannwhitneyresult,
        'effect_size': effsize,
    }

    return stat_dict
Example #54
0
# Mathieu Blondel, February 2012
# License: BSD 3 clause

# Port to Python of examples in chapter 5 of
# "Introductory Statistics with R" by Peter Dalgaard

import numpy as np
from scipy.stats import ttest_1samp, wilcoxon, ttest_ind, mannwhitneyu

# daily intake of energy in kJ for 11 women
daily_intake = np.array(
    [5260, 5470, 5640, 6180, 6390, 6515, 6805, 7515, 7515, 8230, 8770])

# one sample t-test
# null hypothesis: expected value = 7725
t_statistic, p_value = ttest_1samp(daily_intake, 7725)

# p_value < 0.05 => alternative hypothesis:
# data deviate significantly from the hypothesis that the mean
# is 7725 at the 5% level of significance
# A single pre-formatted argument prints the same text under Python 2 and 3.
print("one-sample t-test %s" % p_value)

# one sample wilcoxon-test
# (applied to the differences from the hypothesised value; p_value is
# deliberately reused for this second test)
z_statistic, p_value = wilcoxon(daily_intake - 7725)
print("one-sample wilcoxon-test %s" % p_value)

energ = np.array([
    # energy expenditure in mJ and stature (0=obese, 1=lean)
    [9.21, 0],
    [7.53, 1],
    [7.48, 1],
# Bare expression: it has no effect when run as a script (it only displays a
# value in an interactive session).
game.history_quantity

from matplotlib import pyplot as plt

# NOTE(review): temp is not used anywhere in the visible code.
temp = np.array(game.history_quantity.tolist())
# Plot the recorded quantity history.
plt.plot(game.history_quantity)
plt.title("Quantity")
plt.show()

# Plot the recorded profit history.
plt.plot(game.history_profits)
plt.title("profits")
plt.show()

# Aggregates are only plotted when there is more than one player.
if len(players) > 1:
    # total quantity produced (sum along axis 1)
    plt.plot(np.sum(game.history_quantity, 1))
    plt.title("total quantity produced")
    plt.show()

    # aggregated profits (sum along axis 1)
    plt.plot(np.sum(game.history_profits, 1))
    plt.title("aggregated profits")
    plt.show()

# Bare expression: the mean is computed but discarded when run as a script.
np.mean(np.sum(game.history_quantity, 1))
from scipy.stats import ttest_1samp
# Total quantity (sum along axis 1), restricted to the last 500 entries.
qty = np.sum(game.history_quantity, 1)
qty = qty[-500:]
print(np.mean(qty))
# One-sample t-test of those totals against the value returned by
# game.get_competitive_equilibrium_production().
print(ttest_1samp(qty, popmean=game.get_competitive_equilibrium_production()))
Example #56
0
from scipy import stats
# Ten measurements to test against a hypothesised mean of 175.3.
one_sample_data = [
    177.3, 182.7, 169.6, 176.3, 180.3, 179.4, 178.5, 177.2, 181.8, 176.5
]

# One-sample t-test of the data against a population mean of 175.3.
one_sample = stats.ttest_1samp(one_sample_data, 175.3)

# tuple() hands the (statistic, p-value) pair to the % operator explicitly;
# the parenthesised print works under both Python 2 and Python 3.
print("The t-statistic is %.3f and the p-value is %.3f." % tuple(one_sample))
Example #57
0
    # Error bars at x = 0, 1, 2 showing the mean and standard deviation of
    # n_pows along axis 1; the positions are shifted by 0.05 * (exp_j - 1),
    # presumably so that different experiments do not overlap - confirm.
    plt.errorbar(
        np.arange(3) + 0.05 * (-1 + exp_j), n_pows.mean(1), n_pows.std(1))
    # Label the three x positions with their condition names.
    plt.xticks([0, 1, 2], ['background', 'random border', 'sin border'])
    plt.xlim(-0.5, 2.5)
    plt.ylabel('power / background power')
    plt.title(
        'Relative mean power in {} with rejections: ICA_artifacts and CSP_alpha'
        .format(channel))  # with rejections: ICA_artifacts and CSP_alpha
    # Print the per-condition mean and standard deviation, then keep this
    # array so it can be combined with the others later.
    print(n_pows.mean(1))
    print(n_pows.std(1))
    all_pows.append(n_pows)

# Combine the collected power arrays into a single array.
all_pows = np.array(all_pows)

print(all_pows[:, 0])
# One-sample t-test of each of the three conditions against a mean of 1.
for condition in range(3):
    print(ttest_1samp(all_pows[:, condition].flatten(), 1))
plt.legend(experiments)
plt.savefig(channel + '.png', dpi=200)

plt.show()

# Histogram (30 bins) of the flattened values of the first two conditions,
# each drawn in its own figure.
for condition in (0, 1):
    plt.figure()
    plt.hist(all_pows[:, condition].flatten(), bins=30)
    plt.show()
Example #58
0
    103, 111, 104, 111, 89, 78, 100, 89, 85, 88
],
                    [
                        137, 105, 133, 108, 115, 170, 103, 145, 78, 107, 84,
                        148, 147, 87, 166, 146, 123, 135, 112, 93, 76, 116, 78,
                        101, 123
                    ]])

# Element-wise difference: second row of data minus the first row.
dataDiff = data[1, :] - data[0, :]
# Bare expression: the mean and standard deviation are computed but discarded
# when run as a script (they are only displayed in an interactive session).
dataDiff.mean(), dataDiff.std()

# Use a wide figure for the histogram of the differences.
plt.rcParams['figure.figsize'] = (15.0, 5.0)
plt.hist(dataDiff)
plt.show()

# One-sample t-test of the differences against a zero mean.
t_stat, p_value = ttest_1samp(dataDiff, 0.0)
# Half of the two-sided p-value, i.e. a one-sided p-value.
# NOTE(review): halving is only valid when the sign of t_stat agrees with the
# direction of the alternative hypothesis - confirm.
print(p_value / 2.0)

# Fit a normal distribution to the differences.
mean, std = norm.fit(dataDiff)
print(mean, std)
print('#', 50 * "-")
# -----------------------
from scipy.stats import gaussian_kde

# Normalised histogram with the fitted normal density drawn over it.
plt.hist(dataDiff, density=1)
x = numpy.linspace(dataDiff.min(), dataDiff.max(), 1000)
pdf = norm.pdf(x, mean, std)
plt.plot(x, pdf)

# pdf is reused: first it holds the kernel density estimator, then the
# estimator's values on the grid x.
pdf = gaussian_kde(dataDiff)
pdf = pdf.evaluate(x)
Example #59
0
# Read the comma-separated samples and compute per-column summary statistics.
dataset = loadtxt('data.csv', delimiter=',')
means = np.mean(dataset, axis=0)
stds = np.std(dataset, axis=0)
N = dataset.shape[0]

# For truly random hash bits, each bit should be 0 half of the time and 1
# the other half, giving a mean of 0.5 and a variance of 1/4 (standard
# deviation 1/2).
#
# Every bit (column) gets its own t-test of the sample mean against 0.5.

print('********** t-test **********')

num_improbable = 0

for variable in range(dataset.shape[1]):
    # H0: "mean is 0.5"; H1: "mean is not 0.5".
    # H0 is rejected when p < 0.05 (and, here, additionally |t| > 2).
    bit_samples = dataset[:, variable]
    stat, p = stats.ttest_1samp(bit_samples, 0.5)
    looks_biased = abs(stat) > 2 and p < 0.05
    if not looks_biased:
        continue
    print('Mean of bit %d is probably not 0.5' % variable)
    num_improbable += 1

print('num_improbable = %d' % num_improbable)
assert num_improbable > 0, 'I am crazy, cannot derive a relationship between bits in a sha256 hash'
# However, regenerating the dataset flags different bits each time: the
# bits whose mean is "not 0.5" are not consistent across random datasets.
# NOTE(review): with many bits each tested at the 0.05 level and no
# multiple-comparison correction, some rejections are expected by chance.
Example #60
0
    print("Wiederholbarkeit ausreichend")
else:
    print("Wiederholbarkeit ist nicht ausreichend")
# Sample standard deviation of the repeat measurements, reused below.
repeat_std = np.std(y_repeat_test, ddof=1)

# Capability index C_gk: a tenth of the tolerance reduced by the absolute
# deviation, relative to three sample standard deviations.
c_gk = (0.1 * Y_TOLERANCE - np.abs(y_deviation)) / 3 / repeat_std
print("")
print("C_gk = ", round(c_gk, 3))
# Both indices are judged against the same 1.33 limit.
if c_gk >= 1.33:
    capability_verdict = "Wiederholbarkeit und sytematische Abweichung ausreichend"
elif c_g >= 1.33:
    capability_verdict = "Systematische Abweichung zu groß"
else:
    capability_verdict = "Auflösung und systematische Abweichung nicht ausreichend"
print(capability_verdict)

# Hypothesis test with H0: mean of y_repeat_test = Y_REPEAT_REFERENCE
hypo_test = stats.ttest_1samp(y_repeat_test, Y_REPEAT_REFERENCE)
hypo_p_value = hypo_test[1]
print("")
print("Hypothesentest auf Abweichung mit p-value = ",
      round(float(hypo_p_value), 4))
# The deviation is reported as significant at the 5 % level.
print("Abweichung signifikant" if hypo_p_value <= 0.05
      else "Abweichung nicht signifikant")

# Confidence bounds for y_repeat_test
GAMMA = 0.95
t_dof = y_repeat_len - 1
c1 = stats.t.ppf((1 - GAMMA) / 2, t_dof)
c2 = stats.t.ppf((1 + GAMMA) / 2, t_dof)
# Lower bound: mean plus the lower t-quantile times the standard error.
y_repeat_min = (np.mean(y_repeat_test)
                + c1 * repeat_std / np.sqrt(y_repeat_len))
y_repeat_max = np.mean(y_repeat_test) + c2*np.std(y_repeat_test, ddof=1)\