def test_two_way_chi_sq(self):
    """Check ``stats.two_way`` on a single 2x2 table against a reference value."""
    # One 2x2 contingency table; the leading axis of length 1 is added below
    # so the input is a stack containing exactly one table.
    table = np.array([[42, 32], [60, 81]])
    result = stats.two_way(table[None, :, :])
    # Test value verified using several different packages
    assert_array_almost_equal(result, 0.04753082)
Exemple #2
0
def decode(df, selected, prior=None, u=0.05):
    """
    Test every column (term) of ``df`` for association with a set of rows.

    The rows listed in ``selected`` are contrasted with all remaining rows of
    ``df``. For each term a one-way chi-square test (consistency, "forward")
    and a two-way chi-square test (specificity, "reverse") are run, and the
    resulting p-values are corrected with Benjamini-Hochberg FDR.

    Can be used with BrainMap database as well.
    Taken from Neurosynth's meta-analysis method.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows are indexed by study ID, columns are terms. Column sums are used
        as the number of studies in which a term is present, so values are
        presumably binary (0/1) -- TODO confirm against callers.
    selected : list
        Index labels of ``df`` that make up the selected group. Every other
        row of ``df`` forms the unselected group.
    prior : float, optional
        Prior probability of a study belonging to the selected group, used
        for the ``probForward``/``probReverse`` columns. Defaults to the
        empirical proportion ``len(selected) / len(df)``.
    u : float, optional
        Alpha level passed to ``multipletests``. Default is 0.05.

    Returns
    -------
    pandas.DataFrame
        One row per term with the columns ``Term``, ``pForward``,
        ``zForward``, ``probForward``, ``pReverse``, ``zReverse`` and
        ``probReverse``. The ``p*`` columns hold the FDR-corrected p-values;
        the ``z*`` columns are derived from the uncorrected p-values.
    """
    pmids = df.index.tolist()
    terms = df.columns.values
    unselected = list(set(pmids) - set(selected))

    n_selected = len(selected)
    n_unselected = len(unselected)

    # Per-term counts of studies with / without the term in each group
    n_selected_active = np.sum(df.loc[selected].values, axis=0)
    n_unselected_active = np.sum(df.loc[unselected].values, axis=0)
    n_selected_inactive = n_selected - n_selected_active
    n_unselected_inactive = n_unselected - n_unselected_active

    # float() forces true division even where `/` on two ints truncates
    # (Python 2); otherwise the empirical prior would silently become 0.
    pF = float(n_selected) / (n_selected + n_unselected)
    pAgF = n_selected_active * 1.0 / n_selected  # p(term presence given in cluster)
    pAgU = n_unselected_active * 1.0 / n_unselected

    # Recompute conditions with empirically derived prior
    if prior is None:
        prior = pF

    pAgF_prior = prior * pAgF + (1 - prior) * pAgU
    # Conditional probability (in cluster given term presence) under `prior`
    pFgA_prior = pAgF * prior / pAgF_prior

    # One-way chi-square test for consistency of activation
    p_fi = stats.one_way(n_selected_active, n_selected)
    sign_fi = np.sign(n_selected_active - np.mean(n_selected_active)).ravel()
    z_fi = p_to_z(p_fi, sign_fi)

    # Two-way chi-square test for specificity of activation
    cells = np.array([[n_selected_active, n_unselected_active],
                      [n_selected_inactive, n_unselected_inactive]]).T
    p_ri = stats.two_way(cells)
    sign_ri = np.sign(pAgF - pAgU).ravel()
    z_ri = p_to_z(p_ri, sign_ri)

    # FDR: only the corrected p-values (second return value) are reported
    _, corr_fi, _, _ = multipletests(p_fi,
                                     alpha=u,
                                     method='fdr_bh',
                                     returnsorted=False)
    _, corr_ri, _, _ = multipletests(p_ri,
                                     alpha=u,
                                     method='fdr_bh',
                                     returnsorted=False)

    out_p = np.array([corr_fi, z_fi, pAgF_prior, corr_ri, z_ri, pFgA_prior]).T
    out_df = pd.DataFrame(columns=[
        'pForward', 'zForward', 'probForward', 'pReverse', 'zReverse',
        'probReverse'
    ],
                          data=out_p)
    # Put the term labels in the first column
    out_df.insert(0, 'Term', terms)
    return out_df
Exemple #3
0
    def __init__(self,
                 dataset,
                 ids,
                 ids2=None,
                 q=0.01,
                 prior=0.5,
                 min_studies=1):
        """ Initialize a new MetaAnalysis instance and run an analysis.
        Args:
            dataset: A Dataset instance.
            ids: A list of Mappable IDs to include in the meta-analysis.
            ids2: Optional second list of Mappable IDs. If passed, the set of
                studies will be restricted to the union of ids and ids2 before
                performing the meta-analysis. This is useful for meta-analytic
                contrasts, as the resulting images will in effect identify
                regions that are reported/activated more frequently in one
                list than in the other.
            q: The FDR threshold to use when correcting for multiple
                comparisons. Set to .01 by default.
            prior: The prior to use when calculating conditional probabilities.
                This is the prior probability of a feature being used in a
                study (i.e., p(F)). For example, if set to 0.25, the analysis
                will assume that 1/4 of studies load on the target feature, as
                opposed to the empirically estimated p(F), which is len(ids) /
                total number of studies in the dataset. If prior is not passed,
                defaults to 0.5, reflecting an effort to put all terms on level
                footing and avoid undue influence of base rates (because some
                terms are much more common than others). Note that modifying
                the prior will only affect the effect size/probability maps,
                and not the statistical inference (z-score) maps.
            min_studies: Integer or float indicating which voxels to mask out
                from results due to lack of stability. If an integer is passed,
                all voxels that activate in fewer than this number of studies
                will be ignored (i.e., a value of 0 will be assigned in all
                output images). If a float in the range of 0 - 1 is passed,
                this will be interpreted as a proportion to use as the cut-off
                (e.g., passing 0.03 will exclude all voxels active in fewer
                than 3% of the entire dataset). Defaults to 1, meaning all
                voxels that activate at least one study will be kept.

        The computed maps are stored in ``self.images``, keyed by name.
        """

        self.dataset = dataset
        mt = dataset.image_table
        self.selected_ids = list(set(mt.ids) & set(ids))
        self.selected_id_indices = np.in1d(mt.ids, ids)

        # If ids2 is provided, we only use mappables explicitly in either ids or ids2.
        # Otherwise, all mappables not in the ids list are used as the control
        # condition.
        # NOTE: identity test on purpose -- `ids2 == None` is evaluated
        # elementwise when ids2 is an array and then cannot be used as a
        # condition.
        if ids2 is None:
            unselected_id_indices = ~self.selected_id_indices
        else:
            unselected_id_indices = np.in1d(mt.ids, ids2)

        # Calculate different count variables
        logger.debug("Calculating counts...")
        n_selected = len(self.selected_ids)
        n_unselected = np.sum(unselected_id_indices)
        n_mappables = n_selected + n_unselected

        n_selected_active_voxels = mt.data.dot(self.selected_id_indices)
        n_unselected_active_voxels = mt.data.dot(unselected_id_indices)

        # Nomenclature for variables below: p = probability, F = feature present, g = given,
        # U = unselected, A = activation. So, e.g., pAgF = p(A|F) = probability of activation
        # in a voxel if we know that the feature is present in a study.
        pF = (n_selected * 1.0) / n_mappables
        # p(A) must be computed over the same studies that n_mappables counts.
        # When ids2 restricts the analysis, only the columns of the selected
        # and unselected studies are summed; otherwise every column is used.
        if ids2 is None:
            mappable_data = mt.data
        else:
            mappable_id_indices = (self.selected_id_indices |
                                   unselected_id_indices)
            mappable_data = mt.data[:, mappable_id_indices]
        pA = np.array(
            (mappable_data.sum(axis=1) * 1.0) / n_mappables).squeeze()

        # Conditional probabilities
        logger.debug("Calculating conditional probabilities...")
        pAgF = n_selected_active_voxels * 1.0 / n_selected
        pAgU = n_unselected_active_voxels * 1.0 / n_unselected
        pFgA = pAgF * pF / pA

        # Recompute conditionals with uniform prior
        logger.debug("Recomputing with uniform priors...")
        pAgF_prior = prior * pAgF + (1 - prior) * pAgU
        pFgA_prior = pAgF * prior / pAgF_prior

        def p_to_z(p, sign):
            """Turn p-values into z-scores carrying the given sign."""
            p = p / 2  # halve before the inverse-normal lookup (new array)
            # prevent underflow
            p[p < 1e-240] = 1e-240
            # Convert to z and assign tail
            z = np.abs(norm.ppf(p)) * sign
            # Set very large z's to max precision
            z[np.isinf(z)] = norm.ppf(1e-240) * -1
            return z

        # One-way chi-square test for consistency of activation
        p_vals = stats.one_way(np.squeeze(n_selected_active_voxels),
                               n_selected)
        p_vals[p_vals < 1e-240] = 1e-240
        z_sign = np.sign(n_selected_active_voxels -
                         np.mean(n_selected_active_voxels)).ravel()
        pAgF_z = p_to_z(p_vals, z_sign)
        fdr_thresh = stats.fdr(p_vals, q)
        pAgF_z_FDR = imageutils.threshold_img(pAgF_z,
                                              fdr_thresh,
                                              p_vals,
                                              mask_out='above')

        # Two-way chi-square for specificity of activation
        cells = np.squeeze(
            np.array([[n_selected_active_voxels, n_unselected_active_voxels],
                      [
                          n_selected - n_selected_active_voxels,
                          n_unselected - n_unselected_active_voxels
                      ]]).T)
        p_vals = stats.two_way(cells)
        p_vals[p_vals < 1e-240] = 1e-240
        z_sign = np.sign(pAgF - pAgU).ravel()
        pFgA_z = p_to_z(p_vals, z_sign)
        fdr_thresh = stats.fdr(p_vals, q)
        pFgA_z_FDR = imageutils.threshold_img(pFgA_z,
                                              fdr_thresh,
                                              p_vals,
                                              mask_out='above')

        # Retain any images we may want to save or access later
        self.images = {
            'pA': pA,
            'pAgF': pAgF,
            'pFgA': pFgA,
            ('pAgF_given_pF=%0.2f' % prior): pAgF_prior,
            ('pFgA_given_pF=%0.2f' % prior): pFgA_prior,
            'consistency_z': pAgF_z,
            'specificity_z': pFgA_z,
            ('pAgF_z_FDR_%s' % q): pAgF_z_FDR,
            ('pFgA_z_FDR_%s' % q): pFgA_z_FDR
        }

        # Mask out all voxels below num_studies threshold
        if min_studies > 0:
            if isinstance(min_studies, int):
                min_studies = float(
                    min_studies) / n_mappables  # Recalculate as proportion
            vox_to_exclude = np.where(pA < min_studies)[0]  # Create mask
            # Mask each image
            for k in self.images:
                self.images[k][vox_to_exclude] = 0
Exemple #4
0
 def test_two_way_chi_sq(self):
     """Compare ``stats.two_way`` for one 2x2 table with a known value."""
     # Single 2x2 contingency table, given a leading axis of length 1 so it
     # is passed as a stack of one table.
     counts = np.array([[42, 32], [60, 81]])
     stacked = counts[None, :, :]
     observed = stats.two_way(stacked)
     # Test value verified using several different packages
     assert_array_almost_equal(observed, 0.04753082)
Exemple #5
0
    def __init__(self, dataset, ids, ids2=None, q=0.01, prior=0.5,
                 min_studies=1):
        """ Set up a MetaAnalysis and immediately compute all result images.
        Args:
            dataset: A Dataset instance.
            ids: List of Mappable IDs that form the selected group.
            ids2: Optional second list of Mappable IDs. When given, only
                studies in the union of ids and ids2 take part in the
                analysis, which makes the result a meta-analytic contrast:
                the images highlight regions reported/activated more often
                in one list than in the other. When omitted, every study not
                in ids serves as the comparison group.
            q: FDR threshold used for the multiple-comparison correction.
                Defaults to .01.
            prior: Prior probability of the feature being used in a study,
                p(F), applied when computing the conditional probability
                maps. E.g. 0.25 makes the analysis assume 1/4 of studies load
                on the feature, instead of the empirical estimate len(ids) /
                total number of studies. The default of 0.5 puts all terms on
                level footing so that base rates (some terms being far more
                common than others) do not dominate. Changing the prior only
                alters the effect size/probability maps, never the
                statistical inference (z-score) maps.
            min_studies: Integer or float controlling which voxels are
                zeroed in every output image because they are too unstable.
                An integer excludes voxels active in fewer than that many
                studies; a float in the range 0 - 1 is treated as a
                proportion (e.g. 0.03 excludes voxels active in fewer than
                3% of the entire dataset). Defaults to 1, meaning all voxels
                that activate at least one study will be kept.
        """

        self.dataset = dataset
        table = dataset.image_table
        self.selected_ids = list(set(table.ids) & set(ids))
        self.selected_id_indices = np.in1d(table.ids, ids)
        selected_mask = self.selected_id_indices

        # With ids2 the comparison group is exactly the ids2 studies;
        # without it, the comparison group is everything outside ids.
        if ids2 is None:
            unselected_mask = ~selected_mask
        else:
            unselected_mask = np.in1d(table.ids, ids2)
        mappable_mask = selected_mask | unselected_mask

        # Study counts per group
        logger.debug("Calculating counts...")
        n_selected = len(self.selected_ids)
        n_unselected = np.sum(unselected_mask)
        n_mappables = n_selected + n_unselected

        # Per-voxel number of active studies in each group
        active_selected = table.data.dot(selected_mask)
        active_unselected = table.data.dot(unselected_mask)

        # Naming scheme: p = probability, F = feature present, g = given,
        # U = unselected, A = activation; pAgF therefore reads p(A|F), the
        # probability of activation in a voxel given the feature is present.
        pF = (n_selected * 1.0) / n_mappables
        if ids2 is None:
            mappable_data = table.data
        else:
            mappable_data = table.data[:, mappable_mask]
        pA = np.array(
            (mappable_data.sum(axis=1) * 1.0) / n_mappables).squeeze()

        # Conditional probabilities
        logger.debug("Calculating conditional probabilities...")
        pAgF = active_selected * 1.0 / n_selected
        pAgU = active_unselected * 1.0 / n_unselected
        pFgA = pAgF * pF / pA

        # Same conditionals, but under the caller-supplied prior
        logger.debug("Recomputing with uniform priors...")
        pA_prior = prior * pAgF + (1 - prior) * pAgU
        pFgA_prior = pAgF * prior / pA_prior

        p_floor = 1e-240  # smallest p-value kept, to avoid underflow

        def p_to_z(p, sign):
            """Map p-values to z-scores that carry the given sign."""
            halved = p / 2  # new array, so the caller's p is untouched
            halved[halved < p_floor] = p_floor
            z = np.abs(norm.ppf(halved)) * sign
            # Replace infinite z's with the largest representable magnitude
            z[np.isinf(z)] = norm.ppf(p_floor) * -1
            return z

        def z_and_fdr_maps(p_vals, sign):
            """Clamp p-values, then build the z map and its FDR-thresholded
            counterpart."""
            p_vals[p_vals < p_floor] = p_floor
            z_map = p_to_z(p_vals, sign)
            fdr_thresh = stats.fdr(p_vals, q)
            thresholded = imageutils.threshold_img(
                z_map, fdr_thresh, p_vals, mask_out='above')
            return z_map, thresholded

        # One-way chi-square test for consistency of activation
        one_way_p = stats.one_way(np.squeeze(active_selected), n_selected)
        one_way_sign = np.sign(
            active_selected - np.mean(active_selected)).ravel()
        pAgF_z, pAgF_z_FDR = z_and_fdr_maps(one_way_p, one_way_sign)

        # Two-way chi-square for specificity of activation
        inactive_selected = n_selected - active_selected
        inactive_unselected = n_unselected - active_unselected
        cells = np.squeeze(
            np.array([[active_selected, active_unselected],
                      [inactive_selected, inactive_unselected]]).T)
        two_way_p = stats.two_way(cells)
        two_way_sign = np.sign(pAgF - pAgU).ravel()
        pFgA_z, pFgA_z_FDR = z_and_fdr_maps(two_way_p, two_way_sign)

        # Keep every image that may be saved or inspected later
        self.images = {
            'pA': pA,
            'pAgF': pAgF,
            'pFgA': pFgA,
            ('pA_given_pF=%0.2f' % prior): pA_prior,
            ('pFgA_given_pF=%0.2f' % prior): pFgA_prior,
            'uniformity-test_z': pAgF_z,
            'association-test_z': pFgA_z,
            ('uniformity-test_z_FDR_%s' % q): pAgF_z_FDR,
            ('association-test_z_FDR_%s' % q): pFgA_z_FDR
        }

        # Zero out voxels that fall below the min_studies cut-off
        if min_studies > 0:
            if isinstance(min_studies, int):
                # Express an absolute study count as a proportion
                min_studies = float(min_studies) / n_mappables
            vox_to_exclude = np.where(pA < min_studies)[0]
            for image in self.images.values():
                image[vox_to_exclude] = 0