Ejemplo n.º 1
0
    def test_init_var_indicators(self):
        """
        Check the arrays returned by utils.init_var_indicators in both the
        forward and the reverse direction.
        """
        # (forward, var1_index, var2_index, expected extrema_p, expected extrema_r)
        cases = [
            (True, 1, 2, 0, 1),   # forward
            (False, 4, 3, 1, 0),  # reverse
        ]

        for forward, var1_index, var2_index, start_p, start_r in cases:
            # every row is [0, 1, 2, 3, 4], so column j holds the constant j
            samp_var1 = np.array([list(range(5)) for _ in range(5)])
            samp_var2 = np.array([list(range(5)) for _ in range(5)])

            exceeds, reverse, extrema_p, extrema_r, var1, var2 = \
                utils.init_var_indicators(
                    var1_index, var2_index, samp_var1, samp_var2, forward)

            # counters start empty
            assert np.all(exceeds == 0)
            assert np.all(reverse == 0)
            # extrema start at the value the case table lists for this direction;
            # both checks reduce over the whole array (no axis argument) so the
            # assert always receives a single boolean
            assert np.all(extrema_p == start_p)
            assert np.all(extrema_r == start_r)
            # the returned variables are the selected (constant) columns
            assert np.all(var1 == var1_index)
            assert np.all(var2 == var2_index)
Ejemplo n.º 2
0
    def test_update_rev_extrema_rp(self):
        """
        Drop every combination of resample_k samples in turn and compare the
        output of statistics.update_rev_extrema_rp with the expected values
        stored in self.update_results (keyed by the string form of the
        dropped indices).
        """
        # the exceeds indicator is not needed by this test
        _, reverse, extrema_p, extrema_r, var1, var2 = utils.init_var_indicators(
            self.var1, self.var2, self.samp_var1, self.samp_var2, self.forward)

        for dropped in itertools.combinations(range(self.n_samp), self.resample_k):
            indices = list(dropped)

            # invert=True gives the mask of the samples that are kept
            kept_var1 = var1[np.in1d(range(len(var1)), indices, invert=True)]
            kept_var2 = var2[np.in1d(range(len(var2)), indices, invert=True)]

            p_value, r_value = statistics.compute_pc(kept_var1, kept_var2)

            expected = self.update_results[str(indices)]
            observed = statistics.update_rev_extrema_rp(
                self.sign, r_value, p_value, indices, reverse, extrema_p,
                extrema_r, self.forward)
            assert_almost_equal(expected, observed, decimal=5)
Ejemplo n.º 3
0
def _resample_crosses_cutoff(p_value, r_value, var1_index, var2_index, pvalues,
                             corrs, threshold, forward, fold, fold_value,
                             param):
    """
    Decide whether one resampled (p_value, r_value) pair counts towards the
    `exceeds` tally of the dropped samples.
    ----------------------------------------------------------------------------
    INPUTS
    p_value, r_value  - Floats. Statistics recomputed after dropping samples.
    (remaining arguments are passed through unchanged from resamplek_cutie)
    OUTPUTS
    Boolean. True if the cutoff for the given direction / param is crossed.
    """
    # pvalues / corrs are only indexed when the fold constraint needs them, so
    # the short-circuit order of the comparisons below matters.
    if forward is True:
        if param == 'p':
            if fold:
                # fold change p-value restraint
                return bool(
                    (p_value > threshold and
                     p_value > pvalues[var1_index][var2_index] * fold_value)
                    or np.isnan(p_value))
            return bool(p_value > threshold or np.isnan(p_value))
        if param == 'r':
            if fold:
                # fold change r-value restraint
                return bool(
                    (np.abs(r_value) < threshold and
                     np.abs(r_value) <
                     np.abs(corrs[var1_index][var2_index]) * fold_value)
                    or np.isnan(r_value))
            return bool(np.abs(r_value) < threshold or np.isnan(r_value))
    elif forward is False:
        if param == 'p':
            if fold:
                # fold change p-value restraint (NaN never counts here)
                return bool(
                    p_value < threshold and
                    p_value < pvalues[var1_index][var2_index] / fold_value)
            return bool(p_value < threshold)
        if param == 'r':
            if fold:
                # fold change r-value restraint
                return bool(
                    (np.abs(r_value) > threshold and
                     np.abs(r_value) >
                     np.abs(corrs[var1_index][var2_index]) * fold_value)
                    or np.isnan(r_value))
            return bool(np.abs(r_value) > threshold or np.isnan(r_value))
    # any other forward / param value leaves `exceeds` untouched
    return False


def resamplek_cutie(var1_index, var2_index, n_samp, samp_var1, samp_var2,
                    pvalues, corrs, threshold, resample_k, sign, forward,
                    statistic, fold, fold_value, param):
    """
    Perform CUTIE resampling on a given pair of variables and test CUTIE status.
    ----------------------------------------------------------------------------
    INPUTS
    var1_index        - Integer. Index of variable in file 1.
    var2_index        - Integer. Index of variable in file 2.
    n_samp            - Integer. Number of samples.
    samp_var1         - 2D array. Each value in row i col j is the level of
                        variable j corresponding to sample i in the order that
                        the samples are presented in samp_ids when parsed.
    samp_var2         - 2D array. Same as samp_var1 but for file 2.
    pvalues           - 2D array. Entry row i, col j represents p value of
                        correlation between i-th var1 and j-th var2.
    corrs             - 2D array. Contains values of correlation strength
                        between var i and var j.
    threshold         - Float. Level of significance testing (after adjusting
                        for multiple comparisons)
    resample_k        - Integer. Number of samples dropped together in each
                        resampling iteration.
    sign              - Integer. -1 or 1, depending on original sign of
                        correlation to check against following re-evaluation.
    forward           - Boolean. True if CUTIE is run in the forward direction,
                        False if reverse.
    statistic         - String. Describes analysis being performed. One of
                        'pearson', 'rpearson', 'spearman', 'rspearman',
                        'kendall', 'rkendall'.
    fold              - Boolean. Determines whether you require the new P value
                        to be a certain fold greater to be classified as a CUTIE.
    fold_value        - Float. Determines fold difference constraint imposed on
                        the resampled p-value needed for a correlation to be
                        classified as a CUTIE.
    param             - String. Either 'r' or 'p' depending on whether r value or p
                        value will be used to filter correlations.
    OUTPUTS
    reverse           - 1D array. Index i is 1 if the correlation changes sign
                        upon removing sample i.
    exceeds           - 1D array. Index i is 1 if removing that sample causes
                        the correlation to become insignificant in at least 1
                        different pairwise correlations
    extrema_p         - 1D array. Length n_samp, contains lowest or highest p
                        value observed thus far for a particular sample,
                        depending if reverse or forward CUTIE was run
                        respectively across all i in {1,...,k} iterations of
                        CUTIE_k.
    extrema_r         - 1D array. Same as extrema_p but for R / correlation
                        strength values.
    RAISES
    ValueError        - If statistic is not one of the supported names.
    """
    # initialize indicators and variables
    exceeds, reverse, extrema_p, extrema_r, var1, var2 = utils.init_var_indicators(
        var1_index, var2_index, samp_var1, samp_var2, forward)

    # iteratively delete k samples and recompute statistics; the combinations
    # are consumed lazily rather than materialised as one large list
    for combo in itertools.combinations(range(n_samp), resample_k):
        indices = list(combo)

        # keep every sample whose position is not among the dropped indices
        new_var1 = var1[np.isin(np.arange(len(var1)), indices, invert=True)]
        new_var2 = var2[np.isin(np.arange(len(var2)), indices, invert=True)]

        # remove NaNs
        new_var1, new_var2 = utils.remove_nans(new_var1, new_var2)

        # compute new p_value and r_value depending on statistic
        if statistic in ('pearson', 'rpearson'):
            p_value, r_value = compute_pc(new_var1, new_var2)
        elif statistic in ('spearman', 'rspearman'):
            p_value, r_value = compute_sc(new_var1, new_var2)
        elif statistic in ('kendall', 'rkendall'):
            p_value, r_value = compute_kc(new_var1, new_var2)
        else:
            # previously fell through and failed later on an unbound p_value
            raise ValueError('Unsupported statistic: %r' % (statistic,))

        # update reverse, maxp, and minr
        reverse, extrema_p, extrema_r = update_rev_extrema_rp(
            sign, r_value, p_value, indices, reverse, extrema_p, extrema_r,
            forward)

        # check sign reversal
        # NOTE(review): update_rev_extrema_rp also receives `sign` and
        # `reverse`; if it already tallies sign flips, this counts them a
        # second time - confirm this is intended.
        if np.sign(r_value) != sign:
            for i in indices:
                reverse[i] += 1

        # tally the dropped samples when the resampled statistic crosses the
        # cutoff for this direction / param
        if _resample_crosses_cutoff(p_value, r_value, var1_index, var2_index,
                                    pvalues, corrs, threshold, forward, fold,
                                    fold_value, param):
            for i in indices:
                exceeds[i] += 1

    return reverse, exceeds, extrema_p, extrema_r
Ejemplo n.º 4
0
def resample1_cutie_pc(var1_index, var2_index, samp_var1, samp_var2, **kwargs):
    """
    Takes a given var1 and var2 by indices and recomputes Pearson correlation
    by removing 1 out of n (sample_size) points from samp_ids. (UT in test_pointwise_metrics)
    ----------------------------------------------------------------------------
    INPUTS
    var1_index - Integer. Index for variable from file 1 in pairwise correlation.
    var2_index - Integer. Index for variable from file 2 in pairwise correlation.
    samp_var1  - 2D array. Each value in row i col j is the level of variable j
                 corresponding to sample i in the order that the samples are
                 presented in samp_ids.
    samp_var2  - 2D array. Same as samp_var1 but for file 2.
    **kwargs:
    threshold  - Float. Level of significance testing (after adjusting for
                 multiple comparisons).
    fold       - Boolean. Determines whether you require the new P value to be a
                 certain fold greater to be classified as a CUTIE.
    fold_value - Float. Determines fold difference constraint imposed on the
                 resampled p-value needed for a correlation to be classified as
                 a CUTIE.
    param      - String. Either 'r' or 'p' depending on whether r value or p
                 value will be used to filter correlations.

    OUTPUTS
    reverse    - 1D array. Index i is 1 if the correlation changes sign upon
                 removing sample i.
    exceeds    - 1D array. Index i is 1 if removing that sample causes the
                 correlation to become insignificant in at least 1 different
                 pairwise correlations.
    corrs      - 1D array. Contains values of correlation strength with sample i
                 removed.
    p_values   - 1D array. Contains values of pvalues with sample i removed.
    """
    n_samp = samp_var1.shape[0]

    exceeds, reverse, maxp, minr, var1, var2 = \
        utils.init_var_indicators(var1_index, var2_index, samp_var1, samp_var2, True)

    corrs = np.zeros(n_samp)
    p_values = np.zeros(n_samp)

    # statistics of the full data, used as the baseline for the fold constraint.
    # compute_pc is unpacked as (p_value, r_value), the same order used by
    # resamplek_cutie; the previous (r, p) unpacking swapped the two values.
    original_p, original_r = compute_pc(var1, var2)

    # iteratively delete one sample and recompute statistics
    positions = np.arange(n_samp)
    for s in range(n_samp):
        # boolean mask that keeps every sample except s
        keep = positions != s
        new_var1 = var1[keep]
        new_var2 = var2[keep]

        # compute new p_value and r_value
        p_value, r_value = compute_pc(new_var1, new_var2)

        # update reverse, maxp, and minr
        # sign is artificially 0 since we are not interested in that
        # Forward is True since we only apply Cook's D to TP/FP separation
        reverse, maxp, minr = update_rev_extrema_rp(0, r_value, p_value, [s],
                                                    reverse, maxp, minr, True)
        if kwargs['param'] == 'p':
            if kwargs['fold']:
                # fold change p-value restraint
                if (p_value > kwargs['threshold'] and
                        p_value > original_p * kwargs['fold_value']) or \
                        np.isnan(p_value):
                    exceeds[s] += 1
            elif p_value > kwargs['threshold'] or np.isnan(p_value):
                exceeds[s] += 1

        elif kwargs['param'] == 'r':
            if kwargs['fold']:
                # fold change r-value restraint
                if (np.abs(r_value) < kwargs['threshold'] and
                        np.abs(r_value) < np.abs(original_r) * kwargs['fold_value']) or \
                        np.isnan(r_value):
                    exceeds[s] += 1
            elif np.abs(r_value) < kwargs['threshold'] or np.isnan(r_value):
                exceeds[s] += 1

        corrs[s] = r_value
        p_values[s] = p_value

    return reverse, exceeds, corrs, p_values