Example #1
    def _call(self, dataset, labels=None):
        # This code is based on SciPy's stats.f_oneway()
        # Copyright (c) Gary Strangman.  All rights reserved
        # License: BSD
        #
        # However, it got tweaked and optimized to better fit into PyMVPA.

        # number of groups
        if labels is None:
            labels = dataset.targets

        ul = np.unique(labels)

        na = len(ul)
        bign = float(dataset.nsamples)
        alldata = dataset.samples

        # total squares of sums
        sostot = np.sum(alldata, axis=0)
        sostot *= sostot
        sostot /= bign

        # total sum of squares
        sstot = np.sum(alldata * alldata, axis=0) - sostot

        # between group sum of squares
        ssbn = 0
        for l in ul:
            # all samples for the respective label
            d = alldata[labels == l]
            sos = np.sum(d, axis=0)
            sos *= sos
            ssbn += sos / float(len(d))

        ssbn -= sostot
        # within
        sswn = sstot - ssbn

        # degrees of freedom
        dfbn = na - 1
        dfwn = bign - na

        # mean sums of squares
        msb = ssbn / float(dfbn)
        msw = sswn / float(dfwn)
        f = msb / msw
        # assure there are no NaNs -- otherwise, instead of a sane
        # unittest failure (a check for NaNs), it later leads to a crazy
        #   File "mtrand.pyx", line 1661, in mtrand.shuffle
        #   TypeError: object of type 'numpy.int64' has no len()
        # without any usable backtrace
        f[np.isnan(f)] = 0

        if externals.exists('scipy'):
            from scipy.stats import fprob
            return Dataset(f[np.newaxis], fa={'fprob': fprob(dfbn, dfwn, f)})
        else:
            return Dataset(f[np.newaxis])
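
Note: the examples on this page rely on scipy.stats.fprob, which was deprecated and later removed from SciPy. On modern SciPy the same upper-tail probability is the survival function of the F distribution, so a minimal stand-in (an assumption about your SciPy version, not part of the original code) is:

import scipy.stats

def fprob(dfn, dfd, F):
    # upper-tail probability of the F distribution with dfn/dfd degrees
    # of freedom -- equivalent to the removed scipy.stats.fprob
    return scipy.stats.f.sf(F, dfn, dfd)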
Example #2
# imports inferred from usage; this snippet originally lived inside a larger module
import logging
from collections import defaultdict
from functools import reduce  # Python 3: reduce lives in functools

import numpy as np
from scipy import stats  # stats.fprob existed in old SciPy only
from sklearn.utils import safe_sqr

logger = logging.getLogger(__name__)


def if_classif(X_y, n_features):
    """Compute the ANOVA F-value for the provided samples.

    Parameters
    ----------
    X_y : iterable of (X, y) tuples where
          X : array-like, shape = [n_features]
              The feature vector of a single sample.
          y : hashable
              The class label of that sample.

    Returns
    -------
    F : array, shape = [n_features,]
        The set of F values
    pval : array, shape = [n_features,]
        The set of p-values
    """
    
    n_samples = 0
    n_samples_per_class = defaultdict(lambda: 0)
    
    sums_args_d = defaultdict(lambda: np.zeros(n_features))
    ss_alldata = np.zeros(n_features)
    
    for X, y in X_y:
        if n_samples % 100 == 0:
            logger.info("Processing doc #%d..." % n_samples)
            
        n_samples += 1
        n_samples_per_class[y] += 1
        
        ss_alldata[:] += X[:]**2
        sums_args_d[y][:] += X[:]
        
    n_classes = len(sums_args_d)
    
    # Convert the per-class sums dictionary to a numpy array
    sums_args = np.array(list(sums_args_d.values()))
    
    square_of_sums_alldata = safe_sqr(reduce(lambda x, y: x + y, sums_args))
    square_of_sums_args = [safe_sqr(s) for s in sums_args]
    sstot = ss_alldata - square_of_sums_alldata / float(n_samples)
    ssbn = 0.
    for k, y in enumerate(sums_args_d):  # same iteration order as sums_args
        ssbn += square_of_sums_args[k] / n_samples_per_class[y]
    ssbn -= square_of_sums_alldata / float(n_samples)
    sswn = sstot - ssbn
    dfbn = n_classes - 1
    dfwn = n_samples - n_classes
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)
    f = msb / msw
    # flatten matrix to vector in sparse case
    f = np.asarray(f).ravel()
    prob = stats.fprob(dfbn, dfwn, f)
    return f, prob
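
A hypothetical usage sketch for the streaming variant above; the sample stream and n_features are made up, and stats.fprob (old SciPy, see the note after Example #1) is assumed:

import numpy as np

# four samples, three features, two classes; each tuple is one document
stream = [
    (np.array([1.0, 2.0, 0.5]), 0),
    (np.array([1.2, 1.8, 0.4]), 0),
    (np.array([3.0, 0.9, 2.1]), 1),
    (np.array([2.8, 1.1, 2.3]), 1),
]
F, pval = if_classif(stream, n_features=3)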
Example #3
    def _call(self, dataset):
        # This code is based on SciPy's stats.f_oneway()
        # Copyright (c) Gary Strangman.  All rights reserved
        # License: BSD
        #
        # However, it got tweaked and optimized to better fit into PyMVPA.

        # number of groups
        targets_sa = dataset.sa[self.get_space()]
        labels = targets_sa.value
        ul = targets_sa.unique

        na = len(ul)
        bign = float(dataset.nsamples)
        alldata = dataset.samples

        # total squares of sums
        sostot = np.sum(alldata, axis=0)
        sostot *= sostot
        sostot /= bign

        # total sum of squares
        sstot = np.sum(alldata * alldata, axis=0) - sostot

        # between group sum of squares
        ssbn = 0
        for l in ul:
            # all samples for the respective label
            d = alldata[labels == l]
            sos = np.sum(d, axis=0)
            sos *= sos
            ssbn += sos / float(len(d))

        ssbn -= sostot
        # within
        sswn = sstot - ssbn

        # degrees of freedom
        dfbn = na - 1
        dfwn = bign - na

        # mean sums of squares
        msb = ssbn / float(dfbn)
        msw = sswn / float(dfwn)
        f = msb / msw
        # assure there are no NaNs -- otherwise, instead of a sane
        # unittest failure (a check for NaNs), it later leads to a crazy
        #   File "mtrand.pyx", line 1661, in mtrand.shuffle
        #   TypeError: object of type 'numpy.int64' has no len()
        # without any usable backtrace
        f[np.isnan(f)] = 0

        if externals.exists('scipy'):
            from scipy.stats import fprob
            return Dataset(f[np.newaxis], fa={'fprob': fprob(dfbn, dfwn, f)})
        else:
            return Dataset(f[np.newaxis])
Example #4
import numpy as np
from scipy.stats import fprob  # as in the docstring example; removed from modern SciPy

def f_oneway_repeated_measures(M):
    """Calculate a one-way ANOVA for repeated measures.

    Models the difference between 'subjects' as a random effect.
    Example code from Roger Lew:
    ---------------
    import numpy as np
    from scipy.stats import fprob

    # M contains subjects as rows and conditions as columns
    M=np.array([[21,22,8,6,6],
                [20,19,10,4,4],
                [17,15,5,4,5],
                [25,30,13,12,17],
                [30,27,13,8,6],
                [19,27,8,7,4],
                [26,16,5,2,5],
                [17,18,8,1,5],
                [26,24,14,8,9]],dtype='float')

    mu=np.mean(M)
    SS_total=np.sum([[(v-mu)**2 for v in row] for row in M])
    SS_subjects=np.sum([ M.shape[1]*(np.mean(row)-mu)**2 for row in M])
    SS_conditions=np.sum([ M.shape[0]*(np.mean(row)-mu)**2 for row in M.T])
    SS_error=SS_total-SS_subjects-SS_conditions

    df_total=M.size-1
    df_conditions=M.shape[1]-1
    df_subjects=M.shape[0]-1
    df_error=df_total-df_subjects-df_conditions

    F=(SS_conditions/df_conditions)/(SS_error/df_error)
    p=fprob(df_conditions,df_error,F)
    print(F, p)
    ------------------
    :Parameters:
      M: array-like, 2d
        The array containing the data, 2d, subjects as rows, 
        conditions as columns
    """
    mu = np.mean(M)
    SS_total = np.sum([[(v - mu) ** 2 for v in row] for row in M])
    SS_subjects = np.sum([M.shape[1] * (np.mean(row) - mu) ** 2 for row in M])
    SS_conditions = np.sum([M.shape[0] * (np.mean(row) - mu) ** 2 for row in M.T])
    SS_error = SS_total - SS_subjects - SS_conditions

    df_total = M.size - 1
    df_conditions = M.shape[1] - 1
    df_subjects = M.shape[0] - 1
    df_error = df_total - df_subjects - df_conditions

    F = (SS_conditions / df_conditions) / (SS_error / df_error)
    p = fprob(df_conditions, df_error, F)
    return F, p
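
A usage sketch with the Roger Lew data quoted in the docstring:

M = np.array([[21, 22,  8,  6,  6],
              [20, 19, 10,  4,  4],
              [17, 15,  5,  4,  5],
              [25, 30, 13, 12, 17],
              [30, 27, 13,  8,  6],
              [19, 27,  8,  7,  4],
              [26, 16,  5,  2,  5],
              [17, 18,  8,  1,  5],
              [26, 24, 14,  8,  9]], dtype='float')
F, p = f_oneway_repeated_measures(M)
print(F, p)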
Example #5
import numpy as np
from scipy.stats import fprob  # as in the docstring example; removed from modern SciPy

def f_oneway_repeated_measures(M):
    """Calculate a one-way ANOVA for repeated measures.

    Models the difference between 'subjects' as a random effect.
    Example code from Roger Lew:
    ---------------
    import numpy as np
    from scipy.stats import fprob

    # M contains subjects as rows and conditions as columns
    M=np.array([[21,22,8,6,6],
                [20,19,10,4,4],
                [17,15,5,4,5],
                [25,30,13,12,17],
                [30,27,13,8,6],
                [19,27,8,7,4],
                [26,16,5,2,5],
                [17,18,8,1,5],
                [26,24,14,8,9]],dtype='float')

    mu=np.mean(M)
    SS_total=np.sum([[(v-mu)**2 for v in row] for row in M])
    SS_subjects=np.sum([ M.shape[1]*(np.mean(row)-mu)**2 for row in M])
    SS_conditions=np.sum([ M.shape[0]*(np.mean(row)-mu)**2 for row in M.T])
    SS_error=SS_total-SS_subjects-SS_conditions

    df_total=M.size-1
    df_conditions=M.shape[1]-1
    df_subjects=M.shape[0]-1
    df_error=df_total-df_subjects-df_conditions

    F=(SS_conditions/df_conditions)/(SS_error/df_error)
    p=fprob(df_conditions,df_error,F)
    print(F, p)
    ------------------
    :Parameters:
      M: array-like, 2d
        The array containing the data, 2d, subjects as rows, 
        conditions as columns
    """
    mu = np.mean(M)
    SS_total = np.sum([[(v - mu) ** 2 for v in row] for row in M])
    SS_subjects = np.sum([M.shape[1] * (np.mean(row) - mu) ** 2 for row in M])
    SS_conditions = np.sum([M.shape[0] * (np.mean(row) - mu) ** 2 for row in M.T])
    SS_error = SS_total - SS_subjects - SS_conditions

    df_total = M.size - 1
    df_conditions = M.shape[1] - 1
    df_subjects = M.shape[0] - 1
    df_error = df_total - df_subjects - df_conditions

    F = (SS_conditions / df_conditions) / (SS_error / df_error)
    p = fprob(df_conditions, df_error, F)
    return F, p
Example #6
 def __call__(self, dataset, labels=None):
     """Actually calculate the p-values."""
     f = OneWayAnova()(dataset)
     # number of groups
     if labels is None:
         labels = dataset.labels
     # Calculate degrees of freedom
     ul = np.unique(labels)
     na = len(ul)
     bign = float(dataset.nsamples)
     dfbn = na - 1
     dfwn = bign - na
     # Now probabilities
     ps = fprob(dfbn, dfwn, f)
     return ps
Example #7
def compare_models(c1, c2):
    """ Compares if classifiaction model c1 is significantly better
    than model c2. The comparison is based on F-test, the p-value
    is returned.

    :param c1, c2: linear regression model objects.
    :type lr: :class:`LinearRegression`

    """
    if c1 is None or c2 is None:
        return 1.0
    p1, p2, n = c1.m, c2.m, c1.n
    RSS1, RSS2 = c1.sse, c2.sse
    if RSS1 <= RSS2 or p2 <= p1 or n <= p2 or RSS2 <= 0:
        return 1.0
    F = ((RSS1 - RSS2) / (p2 - p1)) / (RSS2 / (n - p2))
    return stats.fprob(int(p2 - p1), int(n - p2), F)
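
A hypothetical usage sketch; the stub class below stands in for the :class:`LinearRegression` objects and only provides the attributes the function reads (old SciPy's stats.fprob is assumed):

class ModelStub(object):
    def __init__(self, m, n, sse):
        self.m = m      # number of parameters
        self.n = n      # number of samples
        self.sse = sse  # residual sum of squares (RSS)

simple = ModelStub(m=2, n=50, sse=120.0)
richer = ModelStub(m=4, n=50, sse=90.0)
# a small p-value means the richer model fits significantly better
p = compare_models(simple, richer)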
Example #8
 def __init__(self, *args):
     super(ANOVA, self).__init__()
     samples = [np.asarray(x) for x in args]
     all_samples = np.concatenate(samples)
     self.grand_mean = np.mean(all_samples)
     self.sst = np.sum([(x - self.grand_mean)**2 for x in all_samples])
     self.ssb = np.sum(
         [len(x) * (np.mean(x) - self.grand_mean)**2 for x in samples])
     self.ssw = self.sst - self.ssb
     self.N = len(all_samples)
     self.k = len(samples)
     self.ssbdf = self.k - 1
     self.sswdf = self.N - self.k
     self.mssb = self.ssb / self.ssbdf
     self.mssw = self.ssw / self.sswdf
     self.fstat = self.mssb / self.mssw
     self.pvalue = fprob(self.ssbdf, self.sswdf, self.fstat)
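
A usage sketch (the group data is made up; the ANOVA class above must be in scope together with numpy as np and old SciPy's fprob):

g1 = [4.2, 5.1, 3.8, 4.6, 4.0]
g2 = [6.3, 5.8, 7.1, 6.5, 6.0]
g3 = [5.0, 5.5, 4.8, 5.2, 5.9]
result = ANOVA(g1, g2, g3)
print(result.fstat, result.pvalue)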
Example #9
def compare_models(c1, c2):
    """ Compares if classifiaction model c1 is significantly better
    than model c2. The comparison is based on F-test, the p-value
    is returned.

    :param c1, c2: linear regression model objects.
    :type lr: :class:`LinearRegression`     

    """
    if c1 is None or c2 is None:
        return 1.0
    p1, p2, n = c1.m, c2.m, c1.n
    RSS1, RSS2 = c1.sse, c2.sse
    if RSS1 <= RSS2 or p2 <= p1 or n <= p2 or RSS2 <= 0:
        return 1.0
    F = ((RSS1 - RSS2) / (p2 - p1)) / (RSS2 / (n - p2))
    return stats.fprob(int(p2 - p1), int(n - p2), F)
Example #10
 def __init__(self, *args):
     super(ANOVA, self).__init__()
     samples = [np.asarray(x) for x in args]
     all_samples = np.concatenate(samples)
     self.grand_mean = np.mean(all_samples)
     self.sst = np.sum([(x - self.grand_mean)**2 for x in all_samples])
     self.ssb = np.sum(
         [len(x) * (np.mean(x) - self.grand_mean)**2 for x in samples])
     self.ssw = self.sst - self.ssb
     self.N = len(all_samples)
     self.k = len(samples) 
     self.ssbdf = self.k - 1
     self.sswdf = self.N - self.k
     self.mssb = self.ssb / self.ssbdf
     self.mssw = self.ssw / self.sswdf
     self.fstat = self.mssb / self.mssw
     self.pvalue = fprob(self.ssbdf, self.sswdf, self.fstat) 
Example #11
import numpy as np
import scipy.stats as st  # st.ss and st.fprob existed in old SciPy only

def repeated_oneway(data):

    n = data.shape[0]
    k = data.shape[1]
    grand_mean = np.mean(data)
    measurement_mean = np.mean(data, axis=0)
    subject_mean = np.mean(data, axis=1)
    ssb = n * st.ss(measurement_mean - grand_mean)
    #   ssw = st.ss(data-measurement_mean)
    ssw = np.sum(st.ss(data - measurement_mean))
    sss = k * st.ss(subject_mean - grand_mean)
    sse = ssw - sss
    dfb = k - 1
    dfe = (n - 1) * (k - 1)
    msb = ssb / float(dfb)
    mse = sse / float(dfe)
    f = msb / mse
    p = st.fprob(dfb, dfe, f)
    return f, p
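
A usage sketch with a made-up 4 x 3 matrix (subjects as rows, repeated measurements as columns; requires the old-SciPy helpers imported above):

data = np.array([[7.0, 5.5, 4.0],
                 [8.0, 6.5, 5.0],
                 [6.5, 5.0, 4.5],
                 [9.0, 7.5, 5.5]])
f, p = repeated_oneway(data)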
Example #12
def repeated_oneway(data):

    n = data.shape[0]
    k = data.shape[1]
    grand_mean = np.mean(data)
    measurement_mean = np.mean(data, axis=0)
    subject_mean = np.mean(data, axis=1)
    ssb = n * st.ss(measurement_mean - grand_mean)
    #   ssw = st.ss(data-measurement_mean)
    ssw = np.sum(st.ss(data - measurement_mean))
    sss = k * st.ss(subject_mean - grand_mean)
    sse = ssw - sss
    dfb = k - 1
    dfe = (n - 1) * (k - 1)
    msb = ssb / float(dfb)
    mse = sse / float(dfe)
    f = msb / mse
    p = st.fprob(dfb, dfe, f)
    return f, p
Example #13
import numpy as np
from functools import reduce  # Python 3: reduce lives in functools
from scipy import stats  # stats.fprob existed in old SciPy only

def _f_oneway(*args):
    """
    Performs a 1-way ANOVA.

    The one-way ANOVA tests the null hypothesis that 2 or more groups have
    the same population mean. The test is applied to samples from two or
    more groups, possibly with differing sizes.

    Parameters
    ----------
    sample1, sample2, ... : array_like
        The sample measurements should be given as arguments.

    Returns
    -------
    F-value : float
        The computed F-value of the test
    p-value : float
        The associated p-value from the F-distribution

    Notes
    -----
    The ANOVA test has important assumptions that must be satisfied in order
    for the associated p-value to be valid.

    1. The samples are independent
    2. Each sample is from a normally distributed population
    3. The population standard deviations of the groups are all equal.  This
       property is known as homoscedasticity.

    If these assumptions are not true for a given set of data, it may still be
    possible to use the Kruskal-Wallis H-test (`stats.kruskal`_) although with
    some loss of power.

    The algorithm is from Heiman[2], pp.394-7.

    See ``scipy.stats.f_oneway``, which should give the same results while
    being less efficient.

    References
    ----------
    .. [1] Lowry, Richard.  "Concepts and Applications of Inferential
           Statistics". Chapter 14.
           http://faculty.vassar.edu/lowry/ch14pt1.html

    .. [2] Heiman, G.W.  Research Methods in Statistics. 2002.

    """
    n_classes = len(args)
    n_samples_per_class = np.array([len(a) for a in args])
    n_samples = np.sum(n_samples_per_class)
    ss_alldata = reduce(lambda x, y: x + y,
                        [np.sum(a ** 2, axis=0) for a in args])
    sums_args = [np.sum(a, axis=0) for a in args]
    square_of_sums_alldata = reduce(lambda x, y: x + y, sums_args) ** 2
    square_of_sums_args = [s ** 2 for s in sums_args]
    sstot = ss_alldata - square_of_sums_alldata / float(n_samples)
    ssbn = 0
    for k, _ in enumerate(args):
        ssbn += square_of_sums_args[k] / n_samples_per_class[k]
    ssbn -= square_of_sums_alldata / float(n_samples)
    sswn = sstot - ssbn
    dfbn = n_classes - 1
    dfwn = n_samples - n_classes
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)
    f = msb / msw
    prob = stats.fprob(dfbn, dfwn, f)
    return f, prob
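
A usage sketch with made-up groups; per the docstring, the result should match scipy.stats.f_oneway (stats.fprob from old SciPy is still required for the p-value):

import numpy as np
from scipy.stats import f_oneway as scipy_f_oneway

g1 = np.array([6.9, 5.4, 5.8, 4.6, 4.0])
g2 = np.array([8.3, 6.8, 7.8, 9.2, 6.5])
g3 = np.array([8.0, 10.5, 8.1, 6.9, 9.3])
F, p = _f_oneway(g1, g2, g3)
F_ref, p_ref = scipy_f_oneway(g1, g2, g3)  # should agree up to rounding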
Example #14
 def err(p, fp_ref):
     # objective for numerically inverting fprob: the absolute difference
     # between the target tail probability fp_ref and fprob at F = p[0]
     #print p,fp_ref
     return abs(fp_ref - fprob(df1, df2, p[0]))
Example #15
import numpy as np
from functools import reduce  # Python 3: reduce lives in functools
from scipy import stats  # stats.fprob existed in old SciPy only
from sklearn.utils import safe_asarray, safe_sqr  # helpers from old scikit-learn

def f_oneway(*args):
    """Performs a 1-way ANOVA.

    The one-way ANOVA tests the null hypothesis that 2 or more groups have
    the same population mean. The test is applied to samples from two or
    more groups, possibly with differing sizes.

    Parameters
    ----------
    sample1, sample2, ... : array_like, sparse matrices
        The sample measurements should be given as arguments.

    Returns
    -------
    F-value : float
        The computed F-value of the test.
    p-value : float
        The associated p-value from the F-distribution.

    Notes
    -----
    The ANOVA test has important assumptions that must be satisfied in order
    for the associated p-value to be valid.

    1. The samples are independent
    2. Each sample is from a normally distributed population
    3. The population standard deviations of the groups are all equal. This
       property is known as homoscedasticity.

    If these assumptions are not true for a given set of data, it may still be
    possible to use the Kruskal-Wallis H-test (`scipy.stats.kruskal`_) although
    with some loss of power.

    The algorithm is from Heiman[2], pp.394-7.

    See ``scipy.stats.f_oneway``, which should give the same results while
    being less efficient.

    References
    ----------

    .. [1] Lowry, Richard.  "Concepts and Applications of Inferential
           Statistics". Chapter 14.
           http://faculty.vassar.edu/lowry/ch14pt1.html

    .. [2] Heiman, G.W.  Research Methods in Statistics. 2002.

    """
    n_classes = len(args)
    args = [safe_asarray(a) for a in args]
    n_samples_per_class = np.array([a.shape[0] for a in args])
    n_samples = np.sum(n_samples_per_class)
    ss_alldata = reduce(lambda x, y: x + y,
                        [safe_sqr(a).sum(axis=0) for a in args])
    sums_args = [a.sum(axis=0) for a in args]
    square_of_sums_alldata = safe_sqr(reduce(lambda x, y: x + y, sums_args))
    square_of_sums_args = [safe_sqr(s) for s in sums_args]
    sstot = ss_alldata - square_of_sums_alldata / float(n_samples)
    ssbn = 0.
    for k, _ in enumerate(args):
        ssbn += square_of_sums_args[k] / n_samples_per_class[k]
    ssbn -= square_of_sums_alldata / float(n_samples)
    sswn = sstot - ssbn
    dfbn = n_classes - 1
    dfwn = n_samples - n_classes
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)
    f = msb / msw
    # flatten matrix to vector in sparse case
    f = np.asarray(f).ravel()
    prob = stats.fprob(dfbn, dfwn, f)
    return f, prob
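
Since this variant accepts scipy.sparse inputs, a sketch with sparse groups (made-up data; each argument holds one group's samples as rows, and the old scikit-learn/SciPy helpers imported above are assumed):

import numpy as np
from scipy.sparse import csr_matrix

g1 = csr_matrix(np.array([[1.0, 0.0], [2.0, 0.5], [1.5, 0.0]]))
g2 = csr_matrix(np.array([[3.0, 1.0], [2.5, 0.0], [3.5, 1.5]]))
F, p = f_oneway(g1, g2)  # F and p are length-2 arrays, one value per feature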
Example #16
import warnings

import numpy as np
from scipy import stats  # stats.fprob existed in old SciPy only
from sklearn.utils import as_float_array, safe_sqr

def f_oneway(*args):
    """Performs a 1-way ANOVA.

    The one-way ANOVA tests the null hypothesis that 2 or more groups have
    the same population mean. The test is applied to samples from two or
    more groups, possibly with differing sizes.

    Parameters
    ----------
    sample1, sample2, ... : array_like, sparse matrices
        The sample measurements should be given as arguments.

    Returns
    -------
    F-value : float
        The computed F-value of the test.
    p-value : float
        The associated p-value from the F-distribution.

    Notes
    -----
    The ANOVA test has important assumptions that must be satisfied in order
    for the associated p-value to be valid.

    1. The samples are independent
    2. Each sample is from a normally distributed population
    3. The population standard deviations of the groups are all equal. This
       property is known as homoscedasticity.

    If these assumptions are not true for a given set of data, it may still be
    possible to use the Kruskal-Wallis H-test (`scipy.stats.kruskal`_) although
    with some loss of power.

    The algorithm is from Heiman[2], pp.394-7.

    See ``scipy.stats.f_oneway``, which should give the same results while
    being less efficient.

    References
    ----------

    .. [1] Lowry, Richard.  "Concepts and Applications of Inferential
           Statistics". Chapter 14.
           http://faculty.vassar.edu/lowry/ch14pt1.html

    .. [2] Heiman, G.W.  Research Methods in Statistics. 2002.

    """
    n_classes = len(args)
    args = [as_float_array(a) for a in args]
    n_samples_per_class = np.array([a.shape[0] for a in args])
    n_samples = np.sum(n_samples_per_class)
    ss_alldata = sum(safe_sqr(a).sum(axis=0) for a in args)
    sums_args = [np.asarray(a.sum(axis=0)) for a in args]
    square_of_sums_alldata = sum(sums_args) ** 2
    square_of_sums_args = [s ** 2 for s in sums_args]
    sstot = ss_alldata - square_of_sums_alldata / float(n_samples)
    ssbn = 0.
    for k, _ in enumerate(args):
        ssbn += square_of_sums_args[k] / n_samples_per_class[k]
    ssbn -= square_of_sums_alldata / float(n_samples)
    sswn = sstot - ssbn
    dfbn = n_classes - 1
    dfwn = n_samples - n_classes
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)
    constant_features_idx = np.where(msw == 0.)[0]
    if (np.nonzero(msb)[0].size != msb.size and constant_features_idx.size):
        warnings.warn("Features %s are constant." % constant_features_idx,
                      UserWarning)
    f = msb / msw
    # flatten matrix to vector in sparse case
    f = np.asarray(f).ravel()
    prob = stats.fprob(dfbn, dfwn, f)
    return f, prob
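
This version additionally warns when a feature is constant across all samples (msw == 0). A made-up sketch that triggers the warning:

import numpy as np

g1 = np.array([[1.0, 5.0], [2.0, 5.0], [1.5, 5.0]])  # second feature is constant
g2 = np.array([[3.0, 5.0], [2.5, 5.0], [3.5, 5.0]])
F, p = f_oneway(g1, g2)  # warns "Features [1] are constant." and yields F[1] = nan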
Example #17
from numpy import mean
from scipy.stats import fprob  # deprecated and removed in modern SciPy

def ftest(Ernull, Eralt):
    """
    Extra-sum-of-squares F test for two groups.

    This function compares two NESTED models using the F test.
    The extra sum-of-squares F test is based on the difference between the
    sums of squares (residuals) of the two models. It also takes into account
    the number of data points and the number of parameters of each model
    (penalizing the more complicated model). It uses this information to
    compute the F ratio, from which it calculates a P value. If the simpler
    model (fewer parameters) is "better", the P value is greater than 0.05
    (5%); otherwise, if the more complicated model is "better", the P value
    will be less than 0.05.

    Parameters
    ----------
    Ernull : array_like
             residuals from the simpler model (null hypothesis).
    Eralt : array_like
             residuals from the more complicated model (alternative hypothesis).

    Returns
    -------
    fRatio : float
             F test from the 2 groups.
    pValue : float
             P value from F dist.


    References
    ----------

    .. [1] H.J. Motulsky and A. Christopoulos, Fitting Models to Biological
           Data Using Linear and Nonlinear Regression: A Practical Guide to
           Curve Fitting. 2003, GraphPad Software Inc., San Diego CA,
           www.graphpad.com

    .. [2]  http://vassarstats.net/textbook/ch14pt1.html

    """
    sAnull = sum(Ernull)
    Nnull = len(Ernull)
    Nalt = len(Eralt)
    sAalt = sum(Eralt)
    s2Anull = sum(Ernull**2)
    s2Aalt = sum(Eralt**2)
    Mnull = mean(Ernull)  # Mean of group 1.
    Malt = mean(Eralt)  # Mean of group 2.
    SSnull = s2Anull - (sAnull**2) / Nnull
    SSalt = s2Aalt - (sAalt**2) / Nalt
    Mt = mean([Mnull, Malt])
    Nt = Nnull + Nalt
    SSwg = SSnull + SSalt  # Variability that exists inside 2 groups.
    SSbg = Nnull * (Mnull - Mt)**2 + Nalt * (Malt - Mt)**2  # measure of the
    # aggregate differences among the means of the 2 groups.
    dfbg = 1
    dfwg = (Nnull - 1) + (Nalt - 1)  # Degree of freedom of the 2 groups.
    dft = Nt - 2  # Number of degrees of freedom for the entire data.
    MSbg = SSbg / dfbg
    MSwg = SSwg / dfwg
    fRatio = MSbg / MSwg  # F Ratio.
    pValue = fprob(dfbg, dfwg, fRatio)  # degrees of freedom, not sums of squares
    return fRatio, pValue
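
A made-up usage sketch with residual arrays from two nested fits (numpy's mean and old SciPy's fprob are imported above):

import numpy as np

rng = np.random.default_rng(0)
Ernull = rng.normal(0.0, 1.2, size=30)  # residuals of the simpler (null) model
Eralt = rng.normal(0.0, 1.0, size=30)   # residuals of the richer model
fRatio, pValue = ftest(Ernull, Eralt)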
Example #18
 def err(p, fp_ref):
     # objective for numerically inverting fprob: the absolute difference
     # between the target tail probability fp_ref and fprob at F = p[0]
     #print p,fp_ref
     return abs(fp_ref - fprob(df1, df2, p[0]))
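
This objective presumably serves to invert fprob numerically: find the F value whose upper-tail probability equals fp_ref. A hedged sketch (df1, df2, and the target probability are assumptions, and fprob is the stand-in from the note after Example #1):

from scipy.optimize import fmin
from scipy.stats import f as f_dist

df1, df2 = 3, 40   # hypothetical degrees of freedom
fp_ref = 0.05      # hypothetical target upper-tail probability

def fprob(dfn, dfd, F):
    return f_dist.sf(F, dfn, dfd)

def err(p, fp_ref):
    return abs(fp_ref - fprob(df1, df2, p[0]))

F_crit = fmin(err, x0=[1.0], args=(fp_ref,), disp=False)[0]
print(F_crit, f_dist.isf(fp_ref, df1, df2))  # the two values should agree closely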