Example #1
def multipletests(pvals, alpha=0.05, method="hs", is_sorted=False):
    """
    Test results and p-value correction for multiple tests.

    Parameters
    ----------
    pvals : array_like
        Uncorrected p-values.
    alpha : float
        FWER, family-wise error rate, e.g. ``0.1``.
    method : string
        Method used for testing and adjustment of p-values. Can be either the
        full name or initial letters. Available methods are::

        `bonferroni` : one-step correction
        `sidak` : one-step correction
        `holm-sidak` : step down method using Sidak adjustments
        `holm` : step-down method using Bonferroni adjustments
        `simes-hochberg` : step-up method (independent)
        `hommel` : closed method based on Simes tests (non-negative)
        `fdr_bh` : Benjamini/Hochberg (non-negative)
        `fdr_by` : Benjamini/Yekutieli (negative)
        `fdr_tsbh` : two-stage FDR correction (non-negative)
        `fdr_tsbky` : two-stage FDR correction (non-negative)
    is_sorted : bool
        If ``False`` (default), the p-values will be sorted, but the corrected
        p-values are in the original order. If ``True``, it is assumed that
        the p-values are already sorted in ascending order.

    Returns
    -------
    reject : ndarray, boolean
        ``True`` for hypotheses that can be rejected for the given alpha.
    pvals_corrected : ndarray
        P-values corrected for multiple tests.
    alphacSidak : float
        Corrected alpha for Sidak method.
    alphacBonf : float
        Corrected alpha for Bonferroni method.

    Notes
    -----
    This is a wrapper around a function from the `statsmodels`_ package.

    .. _statsmodels: http://www.statsmodels.org
    """

    # multipletests lives at its stable public location, not the sandbox
    from statsmodels.stats.multitest import multipletests as mt

    return mt(pvals, alpha=alpha, method=method, is_sorted=is_sorted)
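
A minimal usage sketch of the wrapper above (the p-values are made up for illustration):

import numpy as np

pvals = np.array([0.001, 0.008, 0.039, 0.041, 0.042, 0.06, 0.074, 0.205])

# Unpack the four return values documented above.
reject, pvals_corrected, alphac_sidak, alphac_bonf = multipletests(
    pvals, alpha=0.05, method='fdr_bh')

print(reject)           # True where the hypothesis is rejected at the given alpha
print(pvals_corrected)  # Benjamini/Hochberg-adjusted p-values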
Example #2
    def filter_low_coverage(self, alpha=0.25):

        use_inds = np.where(self.data['status'] == 0)[0]
        cell = self.data['cell'][use_inds]
        position = self.positions[use_inds]
        rmt = self.data['rmt'][use_inds]
        genes = self.genes[use_inds]

        # A triplet is a (cell, position, rmt) triplet in each gene
        df = pd.DataFrame({
            'gene': genes,
            'cell': cell,
            'position': position,
            'rmt': rmt
        })
        grouped = df.groupby(['gene', 'position'])
        # This gives the gene followed by the number of triplets at each position
        # Summing across each gene will give the number of total triplets in gene
        # Dict-based renaming in .agg() was removed in pandas >= 1.0;
        # aggregate, then rename the resulting Series instead.
        num_per_position = (grouped['position']
                            .agg(np.count_nonzero)
                            .rename('Num Triplets at Pos')
                            .reset_index())

        # Total triplets in each gene
        trips_in_gene = (num_per_position.groupby('gene')['Num Triplets at Pos']
                         .sum()
                         .rename('Num Triplets at Gene')
                         .reset_index())

        num_per_position = num_per_position.merge(trips_in_gene, how='left')

        # for each (c,rmt) in df check in grouped2 if it is lonely
        # determine number of lonely triplets at each position
        grouped2 = df.groupby(['gene', 'cell', 'rmt'])
        # lonely_triplets = grouped2["position"].apply(lambda x: len(x.unique()))
        # This is a list of each gene, cell, rmt combo and the positions with that criteria
        lonely_triplets = grouped2['position'].apply(np.unique)
        lonely_triplets = pd.DataFrame(lonely_triplets)

        # if the length is one, this is a lonely triplet
        lonely_triplets_u = lonely_triplets['position'].apply(len)
        lonely_triplets_u = pd.DataFrame(lonely_triplets_u)

        lonely_triplets_u = lonely_triplets_u.reset_index()
        lonely_triplets = lonely_triplets.reset_index()

        # Rename the columns
        lonely_triplets = lonely_triplets.rename(
            columns=lambda x: x.replace('position', 'lonely position'))
        lonely_triplets_u = lonely_triplets_u.rename(
            columns=lambda x: x.replace('position', 'num'))

        # merge the column that is the length of the positions array
        # take the ones with length 1
        lonely_triplets = lonely_triplets.merge(lonely_triplets_u, how='left')
        lonely_triplets = lonely_triplets[lonely_triplets['num'] == 1]

        # This is the gene, cell, rmt combo and the position that is lonely
        # We need to convert the array to a scalar
        # np.asscalar was removed from NumPy; use ndarray.item() instead
        scalar = lonely_triplets['lonely position'].apply(lambda a: a.item())
        lonely_triplets['lonely position'] = scalar
        # Now if we group as such, we can determine how many (cell, rmt) pairs exist at each position
        # This would be the number of lonely pairs at a position
        grouped3 = lonely_triplets.groupby(["gene", "lonely position"])
        l_num_at_position = (grouped3["cell"].agg(['count'])).reset_index()
        l_num_at_position = l_num_at_position.rename(
            columns=lambda x: x.replace('count', 'lonely triplets at pos'))
        l_num_at_position = l_num_at_position.rename(
            columns=lambda x: x.replace('lonely position', 'position'))
        # lonely pairs in each gene
        l_num_at_gene = (lonely_triplets.groupby(
            ["gene"]))['lonely position'].agg(['count'])
        l_num_at_gene = l_num_at_gene.reset_index()
        l_num_at_gene = l_num_at_gene.rename(
            columns=lambda x: x.replace('count', 'lonely triplets at gene'))

        # aggregate
        total = l_num_at_position.merge(l_num_at_gene, how='left')
        total = total.merge(num_per_position, how='left')

        # scipy hypergeom
        p = total.apply(self._hypergeom_wrapper, axis=1)
        p = 1 - p

        from statsmodels.stats.multitest import multipletests as mt
        adj_p = mt(p, alpha=alpha, method='fdr_bh')

        # adj_p[0] is the boolean reject mask returned by multipletests
        total['remove'] = adj_p[0]

        remove = total[total['remove']]

        final = df.merge(remove, how="left")
        final = final[final["remove"] == True]

        # Indices to remove
        remove_inds = use_inds[final.index.values]

        self.data['status'][remove_inds] |= self.filter_codes['lonely_triplet']
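
The tail of the method above converts a per-row hypergeometric CDF (computed by self._hypergeom_wrapper, which is not shown) into an upper-tail p-value, applies Benjamini/Hochberg correction, and flags the significant rows for removal. Below is a self-contained sketch of that pattern, with a hypothetical hypergeom_cdf standing in for the class helper and made-up column values; the real wrapper's argument mapping is an assumption:

import numpy as np
import pandas as pd
from scipy.stats import hypergeom
from statsmodels.stats.multitest import multipletests

# Hypothetical stand-in for self._hypergeom_wrapper; the real helper's
# argument order is not visible in the snippet above.
def hypergeom_cdf(row):
    return hypergeom.cdf(row['lonely triplets at pos'],    # k: observed successes
                         row['Num Triplets at Gene'],      # M: population size
                         row['lonely triplets at gene'],   # n: successes in population
                         row['Num Triplets at Pos'])       # N: number of draws

total = pd.DataFrame({
    'lonely triplets at pos': [5, 1, 9],
    'lonely triplets at gene': [10, 10, 12],
    'Num Triplets at Pos': [6, 8, 9],
    'Num Triplets at Gene': [40, 40, 30],
})

p = 1 - total.apply(hypergeom_cdf, axis=1)  # upper-tail p-values
reject, adj_p, _, _ = multipletests(p, alpha=0.25, method='fdr_bh')
total['remove'] = reject  # True -> position flagged for filtering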
Example #3

        patients_start_age.append(
            np.min(df.loc[df.index[df.patient_id.str.contains(p)]].age))
    print("age:", np.mean(patients_start_age), "+/-",
          np.std(patients_start_age))

    groups = np.unique(df.crs_group)
    p_vals = dict()
    for key_id in range(8, len(df.columns)):
        key = df.columns[key_id]
        y = [df[key][df.crs_group == score] for score in groups]
        # print(stats.ttest_ind(y[0].dropna(), y[2].dropna()))
        s = stats.mannwhitneyu(y[1].dropna(), y[2].dropna())
        p_vals[key] = s[1]
        print(key, np.mean(y[1]), np.mean(y[2]), ": p-value =", s[1])

    from statsmodels.stats.multitest import multipletests as mt
    p_fdr = mt(list(p_vals.values()), alpha=0.05, method='fdr_bh')
    p_vals_fdr = dict()
    for k, key in enumerate(p_vals.keys()):
        p_vals_fdr[key] = p_fdr[1][k]

    # sorted_p = OrderedDict(sorted(p_vals_fdr.items(), key=lambda kv: (kv[1], kv[0])))
    sorted_p = sorted(p_vals_fdr.items(), key=operator.itemgetter(1))
    selected_p = [(param, '%.5f' % p_value)
                  for (param, p_value) in sorted_p]  # if p_value < 0.05]
    print(selected_p)
    param_names = [param for (param, p_value) in selected_p]
    # param_names = param_names[:7]
    param_names.append('crs_group')

    # df.replace('', np.nan, inplace=True)
    # df.drop(["frequency_mse_spindle"], axis=1, inplace=True)
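
The same collect-then-correct pattern as above, compressed into a runnable sketch on synthetic data (the column names, group labels, and random values are made up):

import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.stats.multitest import multipletests

rng = np.random.default_rng(0)
df = pd.DataFrame({
    'crs_group': np.repeat([0, 1, 2], 20),
    'feat_a': rng.normal(size=60),
    'feat_b': rng.normal(size=60),
})

groups = np.unique(df.crs_group)
p_vals = {}
for key in ['feat_a', 'feat_b']:
    y = [df[key][df.crs_group == g] for g in groups]
    p_vals[key] = stats.mannwhitneyu(y[1].dropna(), y[2].dropna())[1]

# One FDR pass over all collected p-values; zipping keys with the corrected
# values is safe because dicts preserve insertion order.
_, p_corr, _, _ = multipletests(list(p_vals.values()), alpha=0.05,
                                method='fdr_bh')
p_vals_fdr = dict(zip(p_vals.keys(), p_corr))
print(sorted(p_vals_fdr.items(), key=lambda kv: kv[1]))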