Example #1
import numpy as np
from numpy import random  # numpy's random module: randint/permutation used below
from scipy.stats import genextreme

def estat(x, y, nboot=1000, replace=False, method='log', fitting=False):
    '''
    Energy distance statistics test.

    References
    ----------
    Aslan, B, Zech, G (2005) Statistical energy as a tool for binning-free
      multivariate goodness-of-fit tests, two-sample comparison and unfolding.
      Nuc Instr and Meth in Phys Res A 537: 626-636
    Szekely, G, Rizzo, M (2014) Energy statistics: A class of statistics
      based on distances. J Stat Planning & Infer 143: 1249-1272
    Brian Lau, multdist, https://github.com/brian-lau/multdist
    '''
    n, N = len(x), len(x) + len(y)
    stack = np.vstack([x, y])
    stack = (stack - stack.mean(0)) / stack.std(0)
    if replace:
        # bootstrap: draw indices with replacement
        rand = lambda n: random.randint(n, size=n)
    else:
        # permutation: shuffle indices without replacement
        rand = random.permutation

    # `energy` is a helper defined alongside this function (not shown here)
    en = energy(stack[:n], stack[n:], method)
    en_boot = np.zeros(nboot, 'f')
    for i in range(nboot):
        idx = rand(N)
        en_boot[i] = energy(stack[idx[:n]], stack[idx[n:]], method)

    if fitting:
        param = genextreme.fit(en_boot)
        p = genextreme.sf(en, *param)
        return p, en, param
    else:
        p = (en_boot >= en).sum() / nboot
        return p, en, en_boot
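
A minimal usage sketch (not part of the snippet's source). The real `energy` helper ships with the module; the stand-in below is a hypothetical -log(r) pairwise statistic in the spirit of Aslan & Zech, just enough to make the call runnable:

import numpy as np
from scipy.spatial.distance import pdist, cdist

def energy(x, y, method='log'):
    # hypothetical stand-in for the module's `energy` helper:
    # Aslan & Zech energy with -log(r) weighting over unordered pairs
    assert method == 'log', "stand-in only implements the -log weighting"
    phi = lambda r: -np.log(r)
    n, m = len(x), len(y)
    return (phi(pdist(x)).sum() / n**2
            + phi(pdist(y)).sum() / m**2
            - phi(cdist(x, y)).sum() / (n * m))

x = np.random.randn(100, 3)        # sample 1: standard normal, d = 3
y = np.random.randn(80, 3) + 0.5   # sample 2: mean shifted by 0.5
p, en, en_boot = estat(x, y, nboot=500)
print("permutation p-value:", p)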
Example #2
def _compare_resamples(self, tvalues, null_max_tvalues, null_min_tvalues):
    # Fit one GEV to the null maxima and another to the negated null minima,
    # then score each observed t-value against the matching tail.
    # (genextreme comes from scipy.stats.)
    pvalues = []
    maxparams = genextreme.fit(null_max_tvalues)
    minparams = genextreme.fit([-x for x in null_min_tvalues])
    for tvalue in tvalues:
        pvalue = (genextreme.sf(tvalue, *maxparams) if tvalue >= 0
                  else genextreme.sf(-tvalue, *minparams))
        pvalues.append(pvalue)
    return pvalues
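
A self-contained sketch of the same idea outside the class, with simulated null extremes (all data here are illustrative):

import numpy as np
from scipy.stats import genextreme

rng = np.random.default_rng(0)
# max t-statistic per null resample: 1000 resamples of 50 tests each
null_max = rng.standard_normal((1000, 50)).max(axis=1)
params = genextreme.fit(null_max)
print("GEV tail p-value for t = 3.8:", genextreme.sf(3.8, *params))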
Example #3
import numpy as np
from scipy.stats import norm, genextreme

def extreme_values(weighted_residuals, confidence_interval):
    '''
    This function uses extreme value theory to calculate the number of
    standard deviations away from the mean at which we should expect to bracket
    *all* of our n data points at a certain confidence level.

    It then uses that value to identify which (if any) of the data points
    lie outside that region, and calculates the corresponding probabilities
    of finding a data point at least that many standard deviations away.


    Parameters
    ----------

    weighted_residuals : array of floats
        Array of residuals weighted by the square root of their
        variances wr_i = r_i/sqrt(var_i)

    confidence_interval : float
        Probability at which all the weighted residuals lie
        within the confidence bounds

    Returns
    -------
    confidence_bound : float
        Number of standard deviations at which we should expect to encompass
        all data at the user-defined confidence interval.

    indices : array of ints
        Indices of the weighted residuals exceeding the confidence_bound
        implied by the user-defined confidence_interval

    probabilities : array of floats
        The probabilities that the extreme data point of the distribution lies
        further from the mean than the observed position wr_i for each i in
        the "indices" output array.
    '''

    n = len(weighted_residuals)
    mean = norm.isf(1./n)
    # good approximation for > 10 data points
    scale = 0.8/np.power(np.log(n), 1./2.)
    # good approximation for > 10 data points
    c = 0.33/np.power(np.log(n), 3./4.)

    # We now need a 1-tailed probability from the given confidence_interval
    # p_total = 1. - confidence_interval = p_upper + p_lower - p_upper*p_lower
    # p_total = 1. - confidence_interval = 2p - p^2, therefore:
    p = 1. - np.sqrt(confidence_interval)
    confidence_bound = genextreme.isf(p, c, loc=mean, scale=scale)

    indices = [i for i, r in enumerate(weighted_residuals)
               if np.abs(r) > confidence_bound]
    # Convert back to 2-tailed probabilities
    probabilities = (1.
                     - np.power(genextreme.sf(np.abs(weighted_residuals[indices]),
                                              c, loc=mean, scale=scale) - 1., 2.))

    return confidence_bound, indices, probabilities
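
A quick usage sketch with synthetic residuals (values are illustrative):

import numpy as np

np.random.seed(1)
wr = np.random.randn(200)   # well-behaved standard-normal residuals
wr[0] = 4.5                 # plant one outlier
bound, idx, probs = extreme_values(wr, 0.95)
print("bracket all 200 points within +/-%.2f sigma" % bound)
print("outliers at:", idx, "tail probabilities:", probs)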
Example #4
import time as t

import numpy as np
from scipy.stats import genextreme

def estat(x,
          y,
          nboot=1000,
          maxt=60.,
          replace=False,
          method='log',
          fitting=False):
    """
    Energy distance statistics test.

    References
    ----------

    * Aslan, B, Zech, G (2005) Statistical energy as a tool for binning-free
      multivariate goodness-of-fit tests, two-sample comparison and unfolding.
      Nuc Instr and Meth in Phys Res A 537: 626-636

    * Szekely, G, Rizzo, M (2014) Energy statistics: A class of statistics
      based on distances. J Stat Planning & Infer 143: 1249-1272

    * Brian Lau, multdist, https://github.com/brian-lau/multdist

    """

    n, N = len(x), len(x) + len(y)
    stack = np.vstack([x, y])
    stack = (stack - np.nanmean(stack, 0)) / np.nanstd(stack, 0)
    if replace:
        def rand(x):
            return np.random.randint(x, size=x)
    else:
        rand = np.random.permutation

    en = energy(stack[:n], stack[n:], method)
    en_boot = np.zeros(nboot, 'f')
    s = t.time()
    for i in range(nboot):
        idx = rand(N)
        en_boot[i] = energy(stack[idx[:n]], stack[idx[n:]], method)
        if t.time() - s > maxt:
            # keep the i + 1 permutation statistics computed so far
            print("Time consumed, exit bootstrap (N={})".format(i + 1))
            en_boot, nboot = en_boot[:i + 1], i + 1
            break

    if fitting:
        param = genextreme.fit(en_boot)
        p = genextreme.sf(en, *param)
        return p, en, param
    else:
        p = (en_boot >= en).sum() / nboot
        return p, en, en_boot
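
The maxt guard matters when nboot is large. Reusing x and y from the sketch under Example #1, a hypothetical call capping the loop at five seconds:

p, en, en_boot = estat(x, y, nboot=100000, maxt=5.0)
print(len(en_boot), "permutations completed before the time cap")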
Example #5
from scipy.stats import genextreme
from statsmodels.stats.multitest import multipletests

def calculate_adjusted_p_values_genextreme(in_master_table, c, loc, scale,
                                           in_alpha):
    raw_p_values = list()
    for i in range(len(in_master_table)):
        # score the statistic stored at column index 7 against the GEV null
        tmp_p_val = genextreme.sf(in_master_table[i][7], c, loc, scale)
        raw_p_values.append(tmp_p_val)
        in_master_table[i].append(tmp_p_val)
    # adjust p-values
    if len(raw_p_values) >= 2:
        adjusted_p_values = multipletests(raw_p_values,
                                          alpha=in_alpha,
                                          method='fdr_bh',
                                          is_sorted=False)
        for i in range(len(adjusted_p_values[1])):
            in_master_table[i].append(adjusted_p_values[1][i])
    else:
        for i in range(len(in_master_table)):
            in_master_table[i].append("na")
    return in_master_table
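
A hypothetical usage: each row's index-7 field holds a score already calibrated against a GEV null, with c, loc and scale coming from a prior genextreme.fit on null scores. All values below are made up:

table = [["gene%d" % i, 0, 0, 0, 0, 0, 0, s]
         for i, s in enumerate([2.1, 5.7, 3.3])]
out = calculate_adjusted_p_values_genextreme(table, c=-0.1, loc=1.0,
                                             scale=0.5, in_alpha=0.05)
for row in out:
    print(row[0], "raw p:", row[-2], "BH-adjusted p:", row[-1])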
Example #6
import numpy as np
from scipy.stats import genextreme

def estat(x, y, nboot=1000, replace=False, method='log', fitting=False):
    """
    Energy distance statistics test.

    Compares d-dimensional data from two samples using a measure based on
    statistical energy. The test is non-parametric, does not require binning
    and easily scales to arbitrary dimensions.

    The analytic distribution of the statistic is unknown, and p-values
    are estimated using a permutation procedure, which works well
    according to simulations by Aslan & Zech.

    INPUTS
    x     - [n1 x d] matrix
    y     - [n2 x d] matrix

    OPTIONAL
    method  - energy variant passed to the `energy` helper (default = 'log');
              the original MATLAB code exposed 'sr' (Szekely & Rizzo) and
              'az' (Aslan & Zech)
    nboot   - # of bootstrap resamples (default = 1000)
    replace - boolean for sampling with replacement (default = False)
    fitting - if True, fit a generalized extreme value distribution to the
              permutation statistics and take the p-value from its survival
              function (default = False)

    OUTPUTS
    p       - p-value (by permutation, or from the GEV fit)
    en      - energy statistic
    en_boot - permutation samples (the GEV fit parameters when fitting=True)

    References
    ----------

    * Aslan, B, Zech, G (2005) Statistical energy as a tool for binning-free
      multivariate goodness-of-fit tests, two-sample comparison and unfolding.
      Nuc Instr and Meth in Phys Res A 537: 626-636

    * Szekely, G, Rizzo, M (2014) Energy statistics: A class of statistics
      based on distances. J Stat Planning & Infer 143: 1249-1272

    * Brian Lau, multdist, https://github.com/brian-lau/multdist

    """

    n, N = len(x), len(x) + len(y)
    stack = np.vstack([x, y])
    stack = (stack - stack.mean(0)) / stack.std(0)
    if replace:
        def rand(x):
            return np.random.randint(x, size=x)
    else:
        rand = np.random.permutation

    en = energy(stack[:n], stack[n:], method)
    en_boot = np.zeros(nboot, 'f')
    for i in range(nboot):
        idx = rand(N)
        en_boot[i] = energy(stack[idx[:n]], stack[idx[n:]], method)

    if fitting:
        param = genextreme.fit(en_boot)
        p = genextreme.sf(en, *param)
        return p, en, param
    else:
        p = (en_boot >= en).sum() / nboot
        return p, en, en_boot
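
A note on fitting=True: a permutation p-value cannot resolve much below 1/nboot, whereas the GEV fit extrapolates into the tail. A sketch with simulated permutation energies (Gumbel is a GEV special case, so the fit is well posed; all values here are made up):

import numpy as np
from scipy.stats import genextreme

en_boot = np.random.gumbel(loc=1.0, scale=0.2, size=1000)  # fake null energies
en = 2.8                                                   # fake observed value
print("permutation:", (en_boot >= en).sum() / 1000)        # likely 0.0
param = genextreme.fit(en_boot)
print("GEV tail:   ", genextreme.sf(en, *param))           # small but nonzero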