Beispiel #1
0
def box_plot(df, val, factors=None, where=None,
        fname=None, output_dir='', quality='medium'):
    """
    Makes a box plot and saves it to an image file

    args:
       df:
          a pyvttbl.DataFrame object

       val:
          the label of the dependent variable

    kwds:
       factors:
          a list (or any other iterable) of factor labels to include
          in the boxplot. One box is drawn per entry returned by
          df.pivot(val, rows=factors, ...). When omitted a single box
          of val is drawn.

       where:
          a string, list of strings, or list of tuples
          applied to the DataFrame before plotting

       fname:
          output file name, must end with .png or .svg (any case).
          When omitted a name of the form 'box(val~f1_X_f2).png'
          is built from val and factors.

       output_dir:
          directory that is joined in front of fname

       quality:
          {'low' | 'medium' | 'high'} specifies image file dpi.
          'medium' saves with dpi=200 and 'high' with dpi=300, any
          other value uses the savefig default. Ignored for .svg
          files.

    returns:
       a dict describing what was plotted when df.TESTMODE is true,
       otherwise None

    raises:
       KeyError if val or one of the factors is not a label of df

       TypeError if factors is not iterable or fname is neither None
       nor a string

       Exception if df is empty, its columns have unequal lengths,
       a label is specified more than once, or fname does not end
       with .png or .svg
    """

    if factors is None:
        factors = []

    if where is None:
        where = []

    # check to see if there is any data in the table
    if df == {}:
        raise Exception('Table must have data to print data')

    # check to see if data columns have equal lengths
    if not df._are_col_lengths_equal():
        raise Exception('columns have unequal lengths')

    # check the supplied arguments
    if val not in df.keys():
        raise KeyError(val)

    if not hasattr(factors, '__iter__'):
        raise TypeError("'%s' object is not iterable"
                        % type(factors).__name__)

    # Work on a real list from here on: the checks below concatenate
    # with a list and test for emptiness, which would otherwise break
    # for tuples and other non-list iterables.
    factors = list(factors)

    for k in factors:
        if k not in df.keys():
            raise KeyError(k)

    # check for duplicate names
    dup = Counter([val] + factors)
    del dup[None]  # Counter ignores deletion of a missing key
    if any(count != 1 for count in dup.values()):
        raise Exception('duplicate labels specified as plot parameters')

    # check fname
    if fname is not None:
        if not isinstance(fname, _strobj):
            raise TypeError('fname must be None or string')

        if not fname.lower().endswith(('.png', '.svg')):
            raise Exception('fname must end with .png or .svg')

    test = {}

    # gather the data and the labels before any figure is created
    if factors:
        D = df.pivot(val, rows=factors,
                     where=where,
                     aggregate='tolist')

        samples = [np.array(_flatten(d)) for d in D]
        xlabels = ['\n'.join('%s = %s'%fc for fc in c) for c in D.rnames]
        figure_kwds = {'figsize': (6*len(factors), 6)}
        tick_kwds = {'rotation': 35, 'verticalalignment': 'top'}

        test['d'] = samples
        test['xlabels'] = xlabels

    else:
        d = df.select_col(val, where=where)
        samples = np.array(d)
        xlabels = [val]
        figure_kwds = {}
        tick_kwds = {}

        test['d'] = d
        test['val'] = val

    # '%s' formatting (as already used for val) lets non-string factor
    # labels appear in the title
    maintitle = '%s'%val
    if factors:
        maintitle += ' by '
        maintitle += ' * '.join('%s'%(f,) for f in factors)

    test['maintitle'] = maintitle

    if fname is None:
        fname = 'box(%s'%val
        if factors:
            fname += '~' + '_X_'.join([str(f) for f in factors])
        fname += ').png'

    fname = os.path.join(output_dir, fname)

    test['fname'] = fname

    # dpi only applies to raster output; the extension test is case
    # insensitive to agree with the fname validation above
    dpi = None
    if not fname.lower().endswith('.svg'):
        if quality == 'medium':
            dpi = 200
        elif quality == 'high':
            dpi = 300

    fig = pylab.figure(**figure_kwds)
    try:
        if factors:
            fig.subplots_adjust(left=.05, right=.97, bottom=0.24)

        pylab.boxplot(samples)
        xticks = pylab.xticks()[0]
        pylab.xticks(xticks, xlabels, **tick_kwds)

        fig.text(0.5, 0.95, maintitle,
                 horizontalalignment='center',
                 verticalalignment='top')

        # save figure
        if dpi is None:
            pylab.savefig(fname)
        else:
            pylab.savefig(fname, dpi=dpi)

    finally:
        # release the figure even when plotting or saving fails
        pylab.close(fig)

    if df.TESTMODE:
        return test

    return None
Beispiel #2
0
def box_plot(df,
             val,
             factors=None,
             where=None,
             fname=None,
             output_dir='',
             quality='medium'):
    """
    Makes a box plot and saves it to an image file

    args:
       df:
          a pyvttbl.DataFrame object

       val:
          the label of the dependent variable

    kwds:
       factors:
          a list (or any other iterable) of factor labels to include
          in the boxplot. One box is drawn per entry returned by
          df.pivot(val, rows=factors, ...). When omitted a single box
          of val is drawn.

       where:
          a string, list of strings, or list of tuples
          applied to the DataFrame before plotting

       fname:
          output file name, must end with .png or .svg (any case).
          When omitted a name of the form 'box(val~f1_X_f2).png'
          is built from val and factors.

       output_dir:
          directory that is joined in front of fname

       quality:
          {'low' | 'medium' | 'high'} specifies image file dpi.
          'medium' saves with dpi=200 and 'high' with dpi=300, any
          other value uses the savefig default. Ignored for .svg
          files.

    returns:
       a dict describing what was plotted when df.TESTMODE is true,
       otherwise None

    raises:
       KeyError if val or one of the factors is not a label of df

       TypeError if factors is not iterable or fname is neither None
       nor a string

       Exception if df is empty, its columns have unequal lengths,
       a label is specified more than once, or fname does not end
       with .png or .svg
    """

    if factors is None:
        factors = []

    if where is None:
        where = []

    # check to see if there is any data in the table
    if df == {}:
        raise Exception('Table must have data to print data')

    # check to see if data columns have equal lengths
    if not df._are_col_lengths_equal():
        raise Exception('columns have unequal lengths')

    # check the supplied arguments
    labels = df.keys()
    if val not in labels:
        raise KeyError(val)

    if not hasattr(factors, '__iter__'):
        raise TypeError("'%s' object is not iterable" % type(factors).__name__)

    # Work on a real list from here on: the checks below concatenate
    # with a list and test for emptiness, which would otherwise break
    # for tuples and other non-list iterables.
    factors = list(factors)

    for k in factors:
        if k not in labels:
            raise KeyError(k)

    # check for duplicate names
    dup = Counter([val] + factors)
    del dup[None]  # Counter ignores deletion of a missing key
    if any(count != 1 for count in dup.values()):
        raise Exception('duplicate labels specified as plot parameters')

    # check fname
    if fname is not None:
        if not isinstance(fname, _strobj):
            raise TypeError('fname must be None or string')

        if not fname.lower().endswith(('.png', '.svg')):
            raise Exception('fname must end with .png or .svg')

    test = {}

    # gather the data and the labels before any figure is created
    if factors:
        D = df.pivot(val, rows=factors, where=where, aggregate='tolist')

        samples = [np.array(_flatten(d)) for d in D]
        xlabels = ['\n'.join('%s = %s' % fc for fc in c) for c in D.rnames]
        figure_kwds = {'figsize': (6 * len(factors), 6)}
        tick_kwds = {'rotation': 35, 'verticalalignment': 'top'}

        test['d'] = samples
        test['xlabels'] = xlabels

    else:
        d = df.select_col(val, where=where)
        samples = np.array(d)
        xlabels = [val]
        figure_kwds = {}
        tick_kwds = {}

        test['d'] = d
        test['val'] = val

    # '%s' formatting (as already used for val) lets non-string factor
    # labels appear in the title
    maintitle = '%s' % val
    if factors:
        maintitle += ' by '
        maintitle += ' * '.join('%s' % (f, ) for f in factors)

    test['maintitle'] = maintitle

    if fname is None:
        fname = 'box(%s' % val
        if factors:
            fname += '~' + '_X_'.join([str(f) for f in factors])
        fname += ').png'

    fname = os.path.join(output_dir, fname)

    test['fname'] = fname

    # dpi only applies to raster output; the extension test is case
    # insensitive to agree with the fname validation above
    dpi = None
    if not fname.lower().endswith('.svg'):
        if quality == 'medium':
            dpi = 200
        elif quality == 'high':
            dpi = 300

    fig = pylab.figure(**figure_kwds)
    try:
        if factors:
            fig.subplots_adjust(left=.05, right=.97, bottom=0.24)

        pylab.boxplot(samples)
        xticks = pylab.xticks()[0]
        pylab.xticks(xticks, xlabels, **tick_kwds)

        fig.text(0.5,
                 0.95,
                 maintitle,
                 horizontalalignment='center',
                 verticalalignment='top')

        # save figure
        if dpi is None:
            pylab.savefig(fname)
        else:
            pylab.savefig(fname, dpi=dpi)

    finally:
        # release the figure even when plotting or saving fails
        pylab.close(fig)

    if df.TESTMODE:
        return test

    return None
Beispiel #3
0
    def run(self,
            A,
            B=None,
            pop_mean=None,
            paired=False,
            equal_variance=True,
            alpha=0.05,
            aname=None,
            bname=None):
        r"""
        Runs a t-test and stores the results in this object.

        Compares the data in A to the data in B or, when B is None,
        compares the data in A to pop_mean (one sample test). If A or
        B are multidimensional they are flattened before testing.

        args:
           A:
              a list-like object holding the first sample

        kwds:
           B:
              a list-like object holding the second sample, or None
              to run a one sample test

           pop_mean:
              the value A is tested against, required when B is None

           paired:
              when true A and B are compared as paired samples

           equal_variance:
              selects the equal or the unequal variance test when B
              is given and paired is false

           alpha:
              used for the critical t values and the post-hoc power
              analysis

           aname, bname:
              stored as self.aname and self.bname, default to 'A'
              and 'B'

        raises:
           TypeError if B cannot be converted to a flat list or if
           both B and pop_mean are None

           Exception if paired is true and A and B have unequal
           lengths

        When paired is True, the equal_variance parameter has
        no effect, an exception is raised if A and B are not
        of equal length.
          t = \frac{\overline{X}_D - \mu_0}{s_D/\sqrt{n}}
          where:
            \overline{X}_D is the difference of the averages
            s_D is the standard deviation of the differences

          \mathrm{d.f.} = n_1 - 1

        When paired is False and equal_variance is True.
          t = \frac{\bar {X}_1 - \bar{X}_2}{S_{X_1X_2} \cdot \sqrt{\frac{1}{n_1}+\frac{1}{n_2}}}
          where:
            S_{X_1X_2} is the pooled standard deviation
            computed as:
            S_{X_1X_2} = \sqrt{\frac{(n_1-1)S_{X_1}^2+(n_2-1)S_{X_2}^2}{n_1+n_2-2}}

          \mathrm{d.f.} = n_1 + n_2 - 2

        When paired is False and equal_variance is False.
          t = {\overline{X}_1 - \overline{X}_2 \over s_{\overline{X}_1 - \overline{X}_2}}
          where:
            s_{\overline{X}_1 - \overline{X}_2} = \sqrt{{s_1^2 \over n_1} + {s_2^2  \over n_2}}
            where:
            s_1^2 and s_2^2 are the unbiased variance estimates

          \mathrm{d.f.} = \frac{(s_1^2/n_1 + s_2^2/n_2)^2}{(s_1^2/n_1)^2/(n_1-1) + (s_2^2/n_2)^2/(n_2-1)}
        """

        A = _flatten(list(copy(A)))

        # Identity test instead of `!= None`: for array-like input the
        # comparison is element-wise and has no single truth value,
        # which used to end in the TypeError below for valid data.
        if B is not None:
            try:
                B = _flatten(list(copy(B)))
            except Exception:
                raise TypeError('B must be a list-like object')

        self.aname = 'A' if aname is None else aname
        self.bname = 'B' if bname is None else bname

        self.A = A
        self.B = B
        self.paired = paired
        self.equal_variance = equal_variance
        self.alpha = alpha

        def store_statistic(t, prob2):
            # test statistic with its two and one tailed probabilities
            self['t'] = t
            self['p2tail'] = prob2
            self['p1tail'] = prob2 / 2.

        def store_critical_values(df):
            # NOTE(review): 'tc2tail' holds the (1 - alpha) quantile and
            # 'tc1tail' the (1 - alpha/2) quantile, which looks swapped
            # relative to the usual naming. The keys are kept as they
            # are because the power analysis below (and possibly
            # callers) read them under these names -- confirm.
            self['tc2tail'] = scipy.stats.t.ppf((1. - alpha), df)
            self['tc1tail'] = scipy.stats.t.ppf((1. - alpha / 2.), df)

        def store_power(df, cohen_d, scale):
            # post-hoc power analysis, delta is the scaled effect size
            self['cohen_d'] = cohen_d
            self['delta'] = scale * self['cohen_d']
            self['power1tail'] = 1. - nctcdf(self['tc2tail'], df,
                                             self['delta'])
            self['power2tail'] = 1. - nctcdf(self['tc1tail'], df,
                                             self['delta'])

        def store_independent_power(df, n1, n2, mu1, mu2, v1, v2):
            # http://www.psycho.uni-duesseldorf.de/abteilungen/aap/gpower3/download-and-register/Dokumente/GPower3-BRM-Paper.pdf
            #
            # the pooled standard deviation is calculated as:
            #     sqrt((v1+v2)/2.)
            # although wikipedia suggests a more sophisticated estimate might be preferred:
            #     sqrt(((n1-1)*v1 + (n2-1)*v2)/(n1+n2))
            #
            # the biased estimate is used so that the results agree with G*power
            s = math.sqrt((v1 + v2) / 2.)

            # float() keeps the ratio from being truncated when n1 and
            # n2 are ints and `/` is integer division
            scale = math.sqrt(float(n1 * n2) / (n1 + n2))
            store_power(df, abs(mu1 - mu2) / s, scale)

        if B is None:
            if pop_mean is None:
                # fail before any result is stored instead of deep
                # inside the effect size arithmetic
                raise TypeError('pop_mean must be specified when B is None')

            t, prob2, n, df, mu, v = _stats.lttest_1samp(A, pop_mean)

            self.type = 't-Test: One Sample for means'
            store_statistic(t, prob2)
            self['n'] = n
            self['df'] = df
            self['mu'] = mu
            self['pop_mean'] = pop_mean
            self['var'] = v
            store_critical_values(df)

            store_power(df, abs((pop_mean - mu) / math.sqrt(v)),
                        math.sqrt(n))

        elif paired:
            if len(A) != len(B):
                raise Exception('A and B must have equal lengths '
                                'for paired comparisons')

            t, prob2, n, df, mu1, mu2, v1, v2 = _stats.ttest_rel(A, B)
            r, rprob2 = _stats.pearsonr(A, B)

            self.type = 't-Test: Paired Two Sample for means'
            store_statistic(t, prob2)
            self['n1'] = n
            self['n2'] = n
            self['r'] = r
            self['df'] = df
            self['mu1'] = mu1
            self['mu2'] = mu2
            self['var1'] = v1
            self['var2'] = v2
            store_critical_values(df)

            # http://www.psycho.uni-duesseldorf.de/abteilungen/aap/gpower3/download-and-register/Dokumente/GPower3-BRM-Paper.pdf
            sd1, sd2 = math.sqrt(v1), math.sqrt(v2)
            cohen_d = abs(mu1 - mu2) / math.sqrt(v1 + v2 -
                                                 2 * r * sd1 * sd2)
            store_power(df, cohen_d, math.sqrt(n))

        elif equal_variance:
            t, prob2, n1, n2, df, mu1, mu2, v1, v2, svar = _stats.ttest_ind(
                A, B)

            self.type = 't-Test: Two-Sample Assuming Equal Variances'
            store_statistic(t, prob2)
            self['n1'] = n1
            self['n2'] = n2
            self['df'] = df
            self['mu1'] = mu1
            self['mu2'] = mu2
            self['var1'] = v1
            self['var2'] = v2
            self['vpooled'] = svar
            store_critical_values(df)

            store_independent_power(df, n1, n2, mu1, mu2, v1, v2)

        else:
            t, prob2, n1, n2, df, mu1, mu2, v1, v2 = _stats.ttest_ind_uneq(
                A, B)

            self.type = 't-Test: Two-Sample Assuming Unequal Variances'
            store_statistic(t, prob2)
            self['n1'] = n1
            self['n2'] = n2
            self['df'] = df
            self['mu1'] = mu1
            self['mu2'] = mu2
            self['var1'] = v1
            self['var2'] = v2
            store_critical_values(df)

            store_independent_power(df, n1, n2, mu1, mu2, v1, v2)
Beispiel #4
0
    def run(self, A, B=None, pop_mean=None, paired=False, equal_variance=True,
                 alpha=0.05, aname=None, bname=None):
        r"""
        Runs a t-test and stores the results in this object.

        Compares the data in A to the data in B or, when B is None,
        compares the data in A to pop_mean (one sample test). If A or
        B are multidimensional they are flattened before testing.

        args:
           A:
              a list-like object holding the first sample

        kwds:
           B:
              a list-like object holding the second sample, or None
              to run a one sample test

           pop_mean:
              the value A is tested against, required when B is None

           paired:
              when true A and B are compared as paired samples

           equal_variance:
              selects the equal or the unequal variance test when B
              is given and paired is false

           alpha:
              used for the critical t values and the post-hoc power
              analysis

           aname, bname:
              stored as self.aname and self.bname, default to 'A'
              and 'B'

        raises:
           TypeError if B cannot be converted to a flat list or if
           both B and pop_mean are None

           Exception if paired is true and A and B have unequal
           lengths

        When paired is True, the equal_variance parameter has
        no effect, an exception is raised if A and B are not
        of equal length.
          t = \frac{\overline{X}_D - \mu_0}{s_D/\sqrt{n}}
          where:
            \overline{X}_D is the difference of the averages
            s_D is the standard deviation of the differences

          \mathrm{d.f.} = n_1 - 1

        When paired is False and equal_variance is True.
          t = \frac{\bar {X}_1 - \bar{X}_2}{S_{X_1X_2} \cdot \sqrt{\frac{1}{n_1}+\frac{1}{n_2}}}
          where:
            S_{X_1X_2} is the pooled standard deviation
            computed as:
            S_{X_1X_2} = \sqrt{\frac{(n_1-1)S_{X_1}^2+(n_2-1)S_{X_2}^2}{n_1+n_2-2}}

          \mathrm{d.f.} = n_1 + n_2 - 2

        When paired is False and equal_variance is False.
          t = {\overline{X}_1 - \overline{X}_2 \over s_{\overline{X}_1 - \overline{X}_2}}
          where:
            s_{\overline{X}_1 - \overline{X}_2} = \sqrt{{s_1^2 \over n_1} + {s_2^2  \over n_2}}
            where:
            s_1^2 and s_2^2 are the unbiased variance estimates

          \mathrm{d.f.} = \frac{(s_1^2/n_1 + s_2^2/n_2)^2}{(s_1^2/n_1)^2/(n_1-1) + (s_2^2/n_2)^2/(n_2-1)}
        """

        A = _flatten(list(copy(A)))

        # Identity test instead of `!= None`: for array-like input the
        # comparison is element-wise and has no single truth value,
        # which used to end in the TypeError below for valid data.
        if B is not None:
            try:
                B = _flatten(list(copy(B)))
            except Exception:
                raise TypeError('B must be a list-like object')

        self.aname = 'A' if aname is None else aname
        self.bname = 'B' if bname is None else bname

        self.A = A
        self.B = B
        self.paired = paired
        self.equal_variance = equal_variance
        self.alpha = alpha

        def store_statistic(t, prob2):
            # test statistic with its two and one tailed probabilities
            self['t'] = t
            self['p2tail'] = prob2
            self['p1tail'] = prob2 / 2.

        def store_critical_values(df):
            # NOTE(review): 'tc2tail' holds the (1 - alpha) quantile and
            # 'tc1tail' the (1 - alpha/2) quantile, which looks swapped
            # relative to the usual naming. The keys are kept as they
            # are because the power analysis below (and possibly
            # callers) read them under these names -- confirm.
            self['tc2tail'] = scipy.stats.t.ppf((1.-alpha), df)
            self['tc1tail'] = scipy.stats.t.ppf((1.-alpha/2.), df)

        def store_power(df, cohen_d, scale):
            # post-hoc power analysis, delta is the scaled effect size
            self['cohen_d'] = cohen_d
            self['delta'] = scale * self['cohen_d']
            self['power1tail'] = 1. - nctcdf(self['tc2tail'], df, self['delta'])
            self['power2tail'] = 1. - nctcdf(self['tc1tail'], df, self['delta'])

        def store_independent_power(df, n1, n2, mu1, mu2, v1, v2):
            # http://www.psycho.uni-duesseldorf.de/abteilungen/aap/gpower3/download-and-register/Dokumente/GPower3-BRM-Paper.pdf
            #
            # the pooled standard deviation is calculated as:
            #     sqrt((v1+v2)/2.)
            # although wikipedia suggests a more sophisticated estimate might be preferred:
            #     sqrt(((n1-1)*v1 + (n2-1)*v2)/(n1+n2))
            #
            # the biased estimate is used so that the results agree with G*power
            s = math.sqrt((v1+v2)/2.)

            # float() keeps the ratio from being truncated when n1 and
            # n2 are ints and `/` is integer division
            scale = math.sqrt(float(n1*n2)/(n1+n2))
            store_power(df, abs(mu1 - mu2) / s, scale)

        if B is None:
            if pop_mean is None:
                # fail before any result is stored instead of deep
                # inside the effect size arithmetic
                raise TypeError('pop_mean must be specified when B is None')

            t, prob2, n, df, mu, v = _stats.lttest_1samp(A, pop_mean)

            self.type = 't-Test: One Sample for means'
            store_statistic(t, prob2)
            self['n'] = n
            self['df'] = df
            self['mu'] = mu
            self['pop_mean'] = pop_mean
            self['var'] = v
            store_critical_values(df)

            store_power(df, abs( (pop_mean - mu) / math.sqrt(v) ),
                        math.sqrt(n))

        elif paired:
            if len(A) != len(B):
                raise Exception('A and B must have equal lengths '
                                'for paired comparisons')

            t, prob2, n, df, mu1, mu2, v1, v2 = _stats.ttest_rel(A, B)
            r, rprob2 = _stats.pearsonr(A,B)

            self.type = 't-Test: Paired Two Sample for means'
            store_statistic(t, prob2)
            self['n1'] = n
            self['n2'] = n
            self['r'] = r
            self['df'] = df
            self['mu1'] = mu1
            self['mu2'] = mu2
            self['var1'] = v1
            self['var2'] = v2
            store_critical_values(df)

            # http://www.psycho.uni-duesseldorf.de/abteilungen/aap/gpower3/download-and-register/Dokumente/GPower3-BRM-Paper.pdf
            sd1,sd2 = math.sqrt(v1), math.sqrt(v2)
            cohen_d = abs(mu1 - mu2) / math.sqrt(v1 + v2 - 2*r*sd1*sd2)
            store_power(df, cohen_d, math.sqrt(n))

        elif equal_variance:
            t, prob2, n1, n2, df, mu1, mu2, v1, v2, svar = _stats.ttest_ind(A, B)

            self.type = 't-Test: Two-Sample Assuming Equal Variances'
            store_statistic(t, prob2)
            self['n1'] = n1
            self['n2'] = n2
            self['df'] = df
            self['mu1'] = mu1
            self['mu2'] = mu2
            self['var1'] = v1
            self['var2'] = v2
            self['vpooled'] = svar
            store_critical_values(df)

            store_independent_power(df, n1, n2, mu1, mu2, v1, v2)

        else:
            t, prob2, n1, n2, df, mu1, mu2, v1, v2 = _stats.ttest_ind_uneq(A, B)

            self.type = 't-Test: Two-Sample Assuming Unequal Variances'
            store_statistic(t, prob2)
            self['n1'] = n1
            self['n2'] = n2
            self['df'] = df
            self['mu1'] = mu1
            self['mu2'] = mu2
            self['var1'] = v1
            self['var2'] = v2
            store_critical_values(df)

            store_independent_power(df, n1, n2, mu1, mu2, v1, v2)