Python get_xlmhg_test_result Examples, xlmhg.get_xlmhg_test_result Python Examples

Example #1

0

Show file

File: test_advanced_api.py Project: flo-compbio/xlmhg

def test_params(my_N, my_ind, my_v):
    """Check that each optional keyword argument is accepted on its own.

    The test function is called once per keyword (``X``, ``L``,
    ``pval_thresh``, ``table``); every call must return an ``mHGResult``.
    """
    # Scratch array handed over via ``table``: (N+1) x (N+1), longdouble.
    dp_table = np.empty((my_N + 1, my_N + 1), np.longdouble)
    option_sets = (
        {'X': 1},
        {'L': my_N},
        {'pval_thresh': 0.05},
        {'table': dp_table},
    )
    for options in option_sets:
        outcome = get_xlmhg_test_result(my_N, my_ind, **options)
        assert isinstance(outcome, mHGResult)

Example #2

0

Show file

File: test_visualize.py Project: flo-compbio/xlmhg

def test_figure(tmpdir):
    """Render one XL-mHG result with three different figure configurations.

    Each figure is written to an HTML file below ``tmpdir``; the test then
    asserts that the file exists.
    """
    v = np.uint8([1, 0, 1, 1, 0, 1] + [0] * 12 + [1, 0])
    indices = np.uint16(np.nonzero(v)[0])
    result = xlmhg.get_xlmhg_test_result(v.size, indices, X=3, L=10)

    # (output file name, keyword arguments for ``get_result_figure``)
    variants = (
        ('plot1.html', {}),
        ('plot2.html', {'width': 500, 'height': 350}),
        ('plot3.html', {'show_title': True, 'show_inset': False}),
    )
    for file_name, figure_options in variants:
        fig = xlmhg.get_result_figure(result, **figure_options)
        output_file = text(tmpdir.join(file_name))
        plot(fig, filename=output_file, auto_open=False)
        assert os.path.isfile(output_file)

Example #3

0

Show file

File: test_advanced_api.py Project: flo-compbio/xlmhg

def test_significant2(my_N, my_ind):
    """Exact p-value is returned with ``exact_pval='if_significant'``.

    Uses ``pval_thresh=0.045``. The intent is that the exact p-value is
    reported on request for a significant test, even when it would not be
    needed to establish significance.
    """
    options = dict(pval_thresh=0.045, exact_pval='if_significant')
    outcome = get_xlmhg_test_result(my_N, my_ind, **options)
    assert outcome.pval == 0.0244453044375645

Example #4

0

Show file

File: test_advanced_api.py Project: flo-compbio/xlmhg

def test_limit_pval(my_incredible_pval_v):
    """Test differential accuracy of PVAL1 and PVAL2."""
    v = my_incredible_pval_v
    N = v.size
    ind = np.uint16(np.nonzero(v)[0])
    expected_stat = 1.5112233509292993e-216
    expected_cutoff = 200

    # Default algorithm (``use_alg1`` not given; presumably PVAL2 -- confirm
    # against the library default). It should handle this input without
    # problems: the p-value is expected to equal the test statistic.
    default_res = get_xlmhg_test_result(N, ind)
    assert default_res.stat == expected_stat
    assert default_res.cutoff == expected_cutoff
    assert default_res.pval == default_res.stat

    # ``use_alg1=True`` requests the PVAL1 algorithm. It is expected to
    # produce an invalid p-value here (either <= 0 or unrealistically large),
    # which the front-end should replace with the O(1)-bound. Hence the
    # reported p-value lies strictly above the statistic but below 1e-200.
    alg1_res = get_xlmhg_test_result(N, ind, use_alg1=True)
    assert alg1_res.stat == expected_stat
    assert alg1_res.cutoff == expected_cutoff
    assert alg1_res.stat < alg1_res.pval < 1e-200

Example #5

0

Show file

File: conftest.py Project: flo-compbio/genometools

def my_rank_based_result(my_v, my_ranked_genes, my_gene_set):
    """Build a ``RankBasedGSEResult`` from an XL-mHG test on ``my_v``.

    The test is run with ``X = 1`` and ``L = N``, where ``N`` is the length
    of ``my_v``. The non-zero positions of ``my_v`` are used as the indices.
    """
    N = my_v.size
    X, L = 1, N
    indices = np.uint16(np.nonzero(my_v)[0])
    ind_genes = [my_ranked_genes[pos] for pos in indices]
    # The genes at the non-zero positions must be exactly the gene set.
    assert set(ind_genes) == my_gene_set.genes

    res = get_xlmhg_test_result(N, indices, X, L)
    return RankBasedGSEResult(
        my_gene_set, N, indices, ind_genes, X, L,
        res.stat, res.cutoff, res.pval)

Example #6

0

Show file

File: conftest.py Project: wkmwj123/genometools

def my_rank_based_result(my_v, my_ranked_genes, my_gene_set):
    """Run the XL-mHG test on ``my_v`` and wrap it as a rank-based result.

    Uses ``X = 1`` and ``L = N`` (``N`` being the size of ``my_v``) and
    returns the corresponding ``RankBasedGSEResult``.
    """
    n_total = my_v.size
    x_param = 1
    l_param = n_total
    hit_positions = np.uint16(np.nonzero(my_v)[0])
    hit_genes = [my_ranked_genes[pos] for pos in hit_positions]
    # Guard: the genes found at the hit positions must equal the gene set.
    assert set(hit_genes) == my_gene_set.genes

    test_res = get_xlmhg_test_result(
        n_total, hit_positions, x_param, l_param)
    return RankBasedGSEResult(
        my_gene_set, n_total, hit_positions, hit_genes, x_param, l_param,
        test_res.stat, test_res.cutoff, test_res.pval)

Example #7

0

Show file

File: conftest.py Project: milescsmith/gopca

def my_rank_based_result(my_matrix, my_v):
    """Build a ``RankBasedGSEResult`` for a gene set derived from ``my_v``.

    The gene set consists of the genes of ``my_matrix`` at the non-zero
    positions of ``my_v``. The XL-mHG test is run with ``X = 1`` and
    ``L = N`` (the size of ``my_v``).
    """
    N = my_v.size
    X, L = 1, N

    indices = np.uint16(np.nonzero(my_v)[0])
    ind_genes = [my_matrix.genes[pos] for pos in indices]
    # The gene set gets its own copy of the gene list.
    gene_set = GeneSet(
        genes=list(ind_genes), id='Random1', name='Random gene Set 1')

    res = get_xlmhg_test_result(N, indices, X, L)
    return RankBasedGSEResult(
        gene_set, N, indices, ind_genes, X, L,
        res.stat, res.cutoff, res.pval)

Example #8

0

Show file

File: test_visualize.py Project: flo-compbio/xlmhg

def test_figure_double_axis(tmpdir):
    """Render a result figure with ``plot_fold_enrichment=True``.

    The figure is written to ``plot4.html`` below ``tmpdir`` and the test
    asserts that the file exists.
    """
    v = np.uint8([1, 0, 1, 1, 0, 1] + [0] * 12 + [1, 0])
    hit_positions = np.uint16(np.nonzero(v)[0])
    result = xlmhg.get_xlmhg_test_result(v.size, hit_positions, X=3, L=10)

    figure = xlmhg.get_result_figure(result, plot_fold_enrichment=True)
    target = text(tmpdir.join('plot4.html'))
    plot(figure, filename=target, auto_open=False)
    assert os.path.isfile(target)

Example #9

0

Show file

File: test_advanced_api.py Project: flo-compbio/xlmhg

def test_significant1(my_N, my_ind):
    """Exact p-value is returned for a significant test when requested.

    Uses ``pval_thresh=0.07`` together with ``exact_pval='if_significant'``.
    """
    options = dict(pval_thresh=0.07, exact_pval='if_significant')
    outcome = get_xlmhg_test_result(my_N, my_ind, **options)
    assert outcome.pval == 0.0244453044375645

Example #10

0

Show file

File: test_advanced_api.py Project: flo-compbio/xlmhg

def test_ONbound(my_N, my_ind):
    """O(N)-bound is returned instead of the exact p-value.

    This is expected when the bound is equal to or smaller than
    ``pval_thresh`` (0.045 here) and ``exact_pval='if_necessary'``.
    """
    options = dict(pval_thresh=0.045, exact_pval='if_necessary')
    outcome = get_xlmhg_test_result(my_N, my_ind, **options)
    assert outcome.pval == 0.04179566563467492

Example #11

0

Show file

File: test_advanced_api.py Project: flo-compbio/xlmhg

def test_O1bound(my_N, my_ind):
    """O(1)-bound is returned if it is "<=" ``pval_thresh`` (0.07 here)."""
    options = dict(pval_thresh=0.07, exact_pval='if_necessary')
    outcome = get_xlmhg_test_result(my_N, my_ind, **options)
    assert outcome.pval == 0.0696594427244582

Example #12

0

Show file

File: test_advanced_api.py Project: flo-compbio/xlmhg

def test_alg1(my_N, my_ind):
    """PVAL1 (``use_alg1=True``) can be used to calculate the p-value.

    Checks the statistic, the cutoff and the p-value of the result.
    """
    outcome = get_xlmhg_test_result(my_N, my_ind, use_alg1=True)
    observed = (outcome.stat, outcome.cutoff, outcome.pval)
    assert observed[0] == 0.01393188854489164
    assert observed[1] == 6
    assert observed[2] == 0.0244453044375645

Example #13

0

Show file

File: test_simple_api.py Project: flo-compbio/xlmhg

def test_result(my_ind, my_v):
    """A call with only ``N`` and the indices returns an ``mHGResult``."""
    outcome = get_xlmhg_test_result(my_v.size, my_ind)
    assert isinstance(outcome, mHGResult)

Example #14

0

Show file

File: test_advanced_api.py Project: flo-compbio/xlmhg

def test_lowerbound(my_N, my_ind):
    """O(1)-bound is returned when stat > pval_thresh (0.01 here)."""
    options = dict(pval_thresh=0.01, exact_pval='if_necessary')
    outcome = get_xlmhg_test_result(my_N, my_ind, **options)
    assert outcome.pval == 0.0696594427244582

Example #15

0

Show file

File: analysis.py Project: flo-compbio/genometools

    def get_rank_based_enrichment(
            self, ranked_genes, pval_thresh=0.05,
            X_frac=0.25, X_min=5, L=None,
            adjust_pval_thresh=True, escore_pval_thresh=None,
            exact_pval='always', gene_set_ids=None, table=None):
        """Test for gene set enrichment at the top of a ranked list of genes.

        This function uses the XL-mHG test to identify enriched gene sets.

        This function also calculates XL-mHG E-scores for the enriched gene
        sets, using ``escore_pval_thresh`` as the p-value threshold "psi".

        Genes in ``ranked_genes`` that are not present in the genome are
        dropped before testing; all ranks then refer to the remaining genes.

        Parameters
        ----------
        ranked_genes : list of str
            The ranked list of genes.
        pval_thresh : float, optional
            The p-value threshold used to determine significance.
            See also ``adjust_pval_thresh``. [0.05]
        X_frac : float, optional
            The min. fraction of genes from a gene set required for
            enrichment. [0.25]
        X_min : int, optional
            The min. no. of genes from a gene set required for enrichment. [5]
        L : int, optional
            The lowest cutoff to test for enrichment. If ``None``,
            int(0.25*(no. of genes)) will be used. [None]
        adjust_pval_thresh : bool, optional
            Whether to adjust the p-value threshold for multiple testing,
            using the Bonferroni method. [True]
        escore_pval_thresh : float or None, optional
            The "psi" p-value threshold used in calculating E-scores. If
            ``None``, or if smaller than the (possibly adjusted) p-value
            threshold, the p-value threshold is used instead. [None]
        exact_pval : str
            Choices are: "always", "if_significant", "if_necessary". Parameter
            will be passed to `xlmhg.get_xlmhg_test_result`. ["always"]
        gene_set_ids : list of str or None, optional
            A list of gene set IDs to specify which gene sets should be
            tested for enrichment. If ``None``, all gene sets will be
            tested. [None]
        table : 2-dim numpy.ndarray of type numpy.longdouble or None, optional
            The dynamic programming table used by the algorithm for
            calculating XL-mHG p-values. Passing this avoids memory
            re-allocation when calling this function repetitively. [None]

        Returns
        -------
        list of `RankBasedGSEResult`
            A list of all significantly enriched gene sets.

        Raises
        ------
        ValueError
            If ``table`` is supplied but is too small to hold the dynamic
            programming table.
        """
        if isinstance(X_frac, (int, np.integer)):
            X_frac = float(X_frac)

        # type checks
        # (``np.floating`` is used because the ``np.float`` alias no longer
        # exists in current NumPy releases)
        assert isinstance(ranked_genes, Iterable)
        assert isinstance(pval_thresh, (float, np.floating))
        assert isinstance(X_frac, (float, np.floating))
        assert isinstance(X_min, (int, np.integer))
        if L is not None:
            assert isinstance(L, (int, np.integer))
        assert isinstance(adjust_pval_thresh, bool)
        assert isinstance(exact_pval, (str, _oldstr))

        if escore_pval_thresh is not None:
            assert isinstance(escore_pval_thresh, (float, np.floating))
        if gene_set_ids is not None:
            assert isinstance(gene_set_ids, Iterable)
        if table is not None:
            assert isinstance(table, np.ndarray) and \
                   np.issubdtype(table.dtype, np.longdouble)

        if L is None:
            L = int(len(ranked_genes)/4.0)

        gene_set_coll = self._gene_set_coll
        gene_memberships = self._gene_memberships

        # Only warn here; the actual default is assigned further below, once
        # the final (possibly Bonferroni-corrected) threshold is known.
        if escore_pval_thresh is None:
            # if no separate E-score p-value threshold is specified, use the
            # p-value threshold (this results in conservative E-scores)
            logger.warning('No E-score p-value threshold supplied. '
                           'The E-score p-value threshold will be set to the '
                           'global significance threshold. This will result '
                           'in conservative E-scores.')

        # test only some terms?
        if gene_set_ids is not None:
            gs_indices = np.int64([self._gene_set_coll.index(id_)
                                   for id_ in gene_set_ids])
            gene_sets = [gene_set_coll[id_] for id_ in gene_set_ids]
            gene_set_coll = GeneSetCollection(gene_sets)
            gene_memberships = gene_memberships[:, gs_indices]  # not a view!

        # reorder rows in annotation matrix to match the given gene ranking
        # also exclude genes not in the ranking
        unknown = 0
        L_adj = L
        sel = []
        filtered_genes = []
        logger.debug('Looking up indices for %d genes...', len(ranked_genes))
        for i, g in enumerate(ranked_genes):
            assert isinstance(g, (str, _oldstr))
            try:
                idx = self._genome.index(g)
            except ValueError:
                unknown += 1
                # adjust L if the gene was above the original L cutoff
                if i < L:
                    L_adj -= 1
            else:
                sel.append(idx)
                filtered_genes.append(g)
        sel = np.int64(sel)
        logger.debug('Adjusted L: %d', L_adj)

        # the following also copies the data (not a view)
        gene_memberships = gene_memberships[sel, :]
        N, m = gene_memberships.shape
        if unknown > 0:
            # Some genes in the ranked list were unknown (i.e., not present in
            # the specified genome).
            logger.warning('%d / %d unknown genes (%.1f %%), will be ignored.',
                           unknown, len(ranked_genes),
                           100 * (unknown / float(len(ranked_genes))))

        # Determine the number of gene set genes above the L'th cutoff,
        # for all gene sets. This quantity is useful, because we don't need
        # to perform any more work for gene sets that have less than X genes
        # above the cutoff.
        k_above_L = np.sum(gene_memberships[:L_adj, :], axis=0, dtype=np.int64)

        # Determine the number of genes below the L'th cutoff, for all gene
        # sets.
        k_below_L = np.sum(gene_memberships[L_adj:, :], axis=0, dtype=np.int64)

        # Determine the total number K of genes in each gene set that are
        # present in the ranked list (this is equal to k_above_L + k_below_L)
        K_vec = k_above_L + k_below_L

        # Determine the largest K across all gene sets.
        K_max = np.amax(K_vec)

        # Determine X for all gene sets (only used to count the tests up
        # front; the per-set scalar X is recomputed in the loop below).
        X_vec = np.amax(
            np.c_[np.tile(X_min, m), np.int64(np.ceil(X_frac * K_vec))],
            axis=1)

        # Determine the number of tests (we do not conduct a test if the
        # total number of gene set genes in the ranked list is below X).
        num_tests = np.sum(K_vec - X_vec >= 0)
        logger.info('Conducting %d tests.', num_tests)

        # determine Bonferroni-corrected p-value, if desired
        final_pval_thresh = pval_thresh
        if adjust_pval_thresh and num_tests > 0:
            final_pval_thresh /= float(num_tests)
            logger.info('Using Bonferroni-corrected p-value threshold: %.1e',
                        final_pval_thresh)

        if escore_pval_thresh is None:
            escore_pval_thresh = final_pval_thresh

        elif escore_pval_thresh < final_pval_thresh:
            logger.warning('The E-score p-value threshold is smaller than '
                           'the p-value threshold. Setting E-score p-value '
                           'threshold to the p-value threshold.')
            escore_pval_thresh = final_pval_thresh

        # Prepare the matrix that holds the dynamic programming table for
        # the calculation of the XL-mHG p-value.
        if table is None:
            table = np.empty((K_max+1, N+1), dtype=np.longdouble)
        else:
            if table.shape[0] < K_max+1 or table.shape[1] < N+1:
                raise ValueError(
                    'The supplied array is too small (%d x %d) to hold the '
                    'entire dynamic programming table. The required size is '
                    '%d x %d (rows x columns).'
                    % (table.shape[0], table.shape[1], K_max+1, N+1))

        # find enriched GO terms
        # logger.info('Testing %d gene sets for enrichment...', m)
        logger.debug('(N=%d, X_frac=%.2f, X_min=%d, L=%d; K_max=%d)',
                     len(ranked_genes), X_frac, X_min, L, K_max)

        enriched = []
        num_tests = 0  # number of tests conducted
        for j in range(m):
            # determine gene set-specific value for X
            X = max(X_min, int(ceil(X_frac * float(K_vec[j]))))

            # Determine significance of gene set enrichment using the XL-mHG
            # test (only if there are at least X gene set genes in the list).
            if K_vec[j] >= X:
                num_tests += 1

                # We only need to perform the XL-mHG test if there are enough
                # gene set genes above the L'th cutoff (otherwise, pval = 1.0).
                if k_above_L[j] >= X:
                    # perform test

                    # Determine the ranks of the gene set genes in the
                    # (filtered) ranked list.
                    # NOTE(review): uint16 caps indices at 65535 -- confirm
                    # that N never exceeds this.
                    indices = np.uint16(np.nonzero(gene_memberships[:, j])[0])
                    # NOTE(review): the pre-filter above uses L_adj, while
                    # the test itself is called with the unadjusted L --
                    # confirm that this is intended.
                    res = xlmhg.get_xlmhg_test_result(
                        N, indices, X, L, pval_thresh=final_pval_thresh,
                        escore_pval_thresh=escore_pval_thresh,
                        exact_pval=exact_pval, table=table)

                    # check if gene set is significantly enriched
                    if res.pval <= final_pval_thresh:
                        # generate RankedGSEResult
                        # ``indices`` are row positions in the filtered
                        # membership matrix, so the gene names must be taken
                        # from ``filtered_genes``; ``ranked_genes`` may still
                        # contain unknown genes that shift the positions.
                        ind_genes = [filtered_genes[i] for i in indices]
                        gse_result = RankBasedGSEResult(
                            gene_set_coll[j], N, indices, ind_genes,
                            X, L, res.stat, res.cutoff, res.pval,
                            escore_pval_thresh=escore_pval_thresh
                        )
                        enriched.append(gse_result)

        # report results
        q = len(enriched)
        ignored = m - num_tests
        if ignored > 0:
            logger.debug('%d / %d gene sets (%.1f%%) had less than X genes '
                         'annotated with them and were ignored.',
                         ignored, m, 100 * (ignored / float(m)))

        logger.info('%d / %d gene sets were found to be significantly '
                    'enriched (p-value <= %.1e).', q, m, final_pval_thresh)

        return enriched

Example #16

0

Show file

def batch_xlmhg(marker_exp, c_list, coi, X=None, L=None):
    """Applies XL-mHG test to a gene expression matrix, gene by gene.

    Outputs a 4-column DataFrame representing statistical results of XL-mHG.

    :param marker_exp: A DataFrame whose rows are cell identifiers, columns are
        gene identifiers, and values are float values representing gene
        expression.
    :param c_list: A Series whose indices are cell identifiers, and whose
        values are the cluster which that cell is part of.
    :param coi: The cluster of interest.
    :param X: Fraction of the cluster size; the X argument of the XL-mHG test
        is ``int(X * cluster_size)``.  If ``None``, 0.15 is used.
    :param L: An integer to be used as the L argument of the XL-mHG test.  If
        ``None``, twice the cluster size is used, capped at the number of
        rows of ``marker_exp``.

    :returns: A matrix with arbitrary row indices, whose columns are the gene
              name, stat, cutoff, and pval outputs of the XL-mHG test.  Their
              names are 'gene_1', 'mHG_stat', 'mHG_cutoff', and 'mHG_pval'.

    :rtype: pandas.DataFrame
    """
    # * 1 converts the boolean membership mask to integers
    mem_list = (c_list == coi) * 1
    # Number of cells in the cluster of interest
    count_n = np.sum(mem_list)

    # Set X and L params.  The builtin ``int`` is used because the ``np.int``
    # alias no longer exists in current NumPy releases.
    if X is None:
        X = int(.15 * count_n)
    else:
        X = int(X * count_n)
    if L is None:
        # twice the cluster size, but never more than the number of rows
        L = int(min(2 * count_n, marker_exp.shape[0]))
    print('X = ' + str(X))
    print('L = ' + str(L))
    print('Cluster size ' + str(count_n))

    # For every gene (column): order the cells by decreasing expression and
    # run the XL-mHG test on the correspondingly reordered membership vector.
    # NOTE: an additional per-gene E-score computation used to run here, but
    # its result was never used; it has been dropped.
    test_results = marker_exp.apply(lambda col: hg.xlmhg_test(
        mem_list.reindex(col.sort_values(ascending=False).index).values,
        X=X,
        L=L))

    stat_columns = ['mHG_stat', 'mHG_cutoff', 'mHG_pval']
    output = pd.DataFrame()
    output['gene_1'] = test_results.index
    output[stat_columns] = pd.DataFrame(test_results.values.tolist(),
                                        columns=stat_columns)
    return output

Example #17

0

Show file

File: test_advanced_api.py Project: flo-compbio/xlmhg

def test_pval_necessary(my_N, my_ind):
    """The p-value is returned when computing it is necessary.

    Uses ``pval_thresh=0.04`` together with ``exact_pval='if_necessary'``.
    """
    options = dict(pval_thresh=0.04, exact_pval='if_necessary')
    outcome = get_xlmhg_test_result(my_N, my_ind, **options)
    assert outcome.pval == 0.0244453044375645

Example #18

0

Show file

File: test_advanced_api.py Project: flo-compbio/xlmhg

def test_non_contiguous(my_N, my_ind):
    """Passing a reversed (non-contiguous) index array raises ValueError."""
    with pytest.raises(ValueError):
        get_xlmhg_test_result(my_N, my_ind[::-1])

Example #19

0

Show file

File: test_advanced_api.py Project: flo-compbio/xlmhg

def test_table_too_small(my_N, my_ind):
    """Test if ValueError is raised when the supplied array is too small."""
    K = my_ind.size
    # Allocate the undersized table outside the ``raises`` block, so that
    # only the call under test can satisfy the expected ValueError.
    table = np.empty(((my_N-K), (my_N-K)), np.longdouble)
    with pytest.raises(ValueError):
        get_xlmhg_test_result(my_N, my_ind, table=table)

Example #20

0

Show file

    def get_rank_based_enrichment(
            self,
            ranked_genes: List[str],
            pval_thresh: float = 0.05,
            X_frac: float = 0.25,
            X_min: int = 5,
            L: int = None,
            adjust_pval_thresh: bool = True,
            escore_pval_thresh: float = None,
            exact_pval: str = 'always',
            gene_set_ids: List[str] = None,
            table: np.ndarray = None) -> List[RankBasedGSEResult]:
        """Test for gene set enrichment at the top of a ranked list of genes.

        This function uses the XL-mHG test to identify enriched gene sets.

        This function also calculates XL-mHG E-scores for the enriched gene
        sets, using ``escore_pval_thresh`` as the p-value threshold "psi".

        Genes in ``ranked_genes`` that have no entry in the gene index are
        dropped before testing; all ranks then refer to the remaining genes.

        Parameters
        ----------
        ranked_genes : list of str
            The ranked list of gene IDs.
        pval_thresh : float, optional
            The p-value threshold used to determine significance.
            See also ``adjust_pval_thresh``. [0.05]
        X_frac : float, optional
            The min. fraction of genes from a gene set required for
            enrichment. [0.25]
        X_min : int, optional
            The min. no. of genes from a gene set required for enrichment. [5]
        L : int, optional
            The lowest cutoff to test for enrichment. If ``None``,
            int(0.25*(no. of genes)) will be used. [None]
        adjust_pval_thresh : bool, optional
            Whether to adjust the p-value threshold for multiple testing,
            using the Bonferroni method. [True]
        escore_pval_thresh : float or None, optional
            The "psi" p-value threshold used in calculating E-scores. If
            ``None``, or if smaller than the (possibly adjusted) p-value
            threshold, the p-value threshold is used instead. [None]
        exact_pval : str
            Choices are: "always", "if_significant", "if_necessary". Parameter
            will be passed to `xlmhg.get_xlmhg_test_result`. ["always"]
        gene_set_ids : list of str or None, optional
            A list of gene set IDs to specify which gene sets should be
            tested for enrichment. If ``None``, all gene sets will be
            tested. [None]
        table : 2-dim numpy.ndarray of type numpy.longdouble or None, optional
            The dynamic programming table used by the algorithm for
            calculating XL-mHG p-values. Passing this avoids memory
            re-allocation when calling this function repetitively. [None]

        Returns
        -------
        list of `RankBasedGSEResult`
            A list of all significantly enriched gene sets.

        Raises
        ------
        TypeError
            If ``table`` is supplied but is not of type ``longdouble``.
        ValueError
            If ``table`` is supplied but is too small to hold the dynamic
            programming table.
        """

        # make sure X_frac is a float (e.g., if specified as 0)
        X_frac = float(X_frac)

        if table is not None:
            if not np.issubdtype(table.dtype, np.longdouble):
                raise TypeError('The provided array for storing the dynamic '
                                'programming table must be of type '
                                '"longdouble"!')

        if L is None:
            L = int(len(ranked_genes) / 4.0)

        gene_set_coll = self._gene_set_coll
        gene_memberships = self._gene_memberships

        # Only warn here; the actual default is assigned further below, once
        # the final (possibly Bonferroni-corrected) threshold is known.
        if escore_pval_thresh is None:
            # if no separate E-score p-value threshold is specified, use the
            # p-value threshold (this results in conservative E-scores)
            logger.warning('No E-score p-value threshold supplied. '
                           'The E-score p-value threshold will be set to the '
                           'global significance threshold. This will result '
                           'in conservative E-scores.')

        # test only some terms?
        if gene_set_ids is not None:
            gs_indices = np.int64(
                [self._gene_set_coll.index(id_) for id_ in gene_set_ids])
            gene_sets = [gene_set_coll[id_] for id_ in gene_set_ids]
            gene_set_coll = GeneSetCollection(gene_sets)
            gene_memberships = gene_memberships[:, gs_indices]  # not a view!

        # reorder rows in annotation matrix to match the given gene ranking
        # also exclude genes not in the ranking
        unknown = 0
        L_adj = L
        sel = []
        filtered_genes = []
        logger.debug('Looking up indices for %d genes...', len(ranked_genes))
        for i, g in enumerate(ranked_genes):
            try:
                idx = self._gene_indices[g]
            except KeyError:
                unknown += 1
                # adjust L if the gene was above the original L cutoff
                if i < L:
                    L_adj -= 1
            else:
                sel.append(idx)
                filtered_genes.append(g)
        sel = np.int64(sel)
        logger.debug('Adjusted L: %d', L_adj)

        # the following also copies the data (not a view)
        gene_memberships = gene_memberships[sel, :]
        N, m = gene_memberships.shape
        if unknown > 0:
            # Some genes in the ranked list were unknown (i.e., not present in
            # the specified genome).
            logger.warning('%d / %d unknown genes (%.1f %%), will be ignored.',
                           unknown, len(ranked_genes),
                           100 * (unknown / float(len(ranked_genes))))

        # Determine the number of gene set genes above the L'th cutoff,
        # for all gene sets. This quantity is useful, because we don't need
        # to perform any more work for gene sets that have less than X genes
        # above the cutoff.
        k_above_L = np.sum(gene_memberships[:L_adj, :], axis=0, dtype=np.int64)

        # Determine the number of genes below the L'th cutoff, for all gene
        # sets.
        k_below_L = np.sum(gene_memberships[L_adj:, :], axis=0, dtype=np.int64)

        # Determine the total number K of genes in each gene set that are
        # present in the ranked list (this is equal to k_above_L + k_below_L)
        K_vec = k_above_L + k_below_L

        # Determine the largest K across all gene sets.
        K_max = np.amax(K_vec)

        # Determine X for all gene sets (only used to count the tests up
        # front; the per-set scalar X is recomputed in the loop below).
        X_vec = np.amax(np.c_[np.tile(X_min, m),
                              np.int64(np.ceil(X_frac * K_vec))],
                        axis=1)

        # Determine the number of tests (we do not conduct a test if the
        # total number of gene set genes in the ranked list is below X).
        num_tests = np.sum(K_vec - X_vec >= 0)
        logger.info('Conducting %d tests.', num_tests)

        # determine Bonferroni-corrected p-value, if desired
        final_pval_thresh = pval_thresh
        if adjust_pval_thresh and num_tests > 0:
            final_pval_thresh /= float(num_tests)
            logger.info('Using Bonferroni-corrected p-value threshold: %.1e',
                        final_pval_thresh)

        if escore_pval_thresh is None:
            escore_pval_thresh = final_pval_thresh

        elif escore_pval_thresh < final_pval_thresh:
            logger.warning('The E-score p-value threshold is smaller than '
                           'the p-value threshold. Setting E-score p-value '
                           'threshold to the p-value threshold.')
            escore_pval_thresh = final_pval_thresh

        # Prepare the matrix that holds the dynamic programming table for
        # the calculation of the XL-mHG p-value.
        if table is None:
            table = np.empty((K_max + 1, N + 1), dtype=np.longdouble)
        else:
            if table.shape[0] < K_max + 1 or table.shape[1] < N + 1:
                raise ValueError(
                    'The supplied array is too small (%d x %d) to hold the '
                    'entire dynamic programming table. The required size is '
                    '%d x %d (rows x columns).' %
                    (table.shape[0], table.shape[1], K_max + 1, N + 1))

        # find enriched GO terms
        # logger.info('Testing %d gene sets for enrichment...', m)
        logger.debug('(N=%d, X_frac=%.2f, X_min=%d, L=%d; K_max=%d)',
                     len(ranked_genes), X_frac, X_min, L, K_max)

        enriched = []
        num_tests = 0  # number of tests conducted
        for j in range(m):
            # determine gene set-specific value for X
            X = max(X_min, int(ceil(X_frac * float(K_vec[j]))))

            # Determine significance of gene set enrichment using the XL-mHG
            # test (only if there are at least X gene set genes in the list).
            if K_vec[j] >= X:
                num_tests += 1

                # We only need to perform the XL-mHG test if there are enough
                # gene set genes above the L'th cutoff (otherwise, pval = 1.0).
                if k_above_L[j] >= X:
                    # perform test

                    # Determine the ranks of the gene set genes in the
                    # (filtered) ranked list.
                    # NOTE(review): uint16 caps indices at 65535 -- confirm
                    # that N never exceeds this.
                    indices = np.uint16(np.nonzero(gene_memberships[:, j])[0])
                    # NOTE(review): the pre-filter above uses L_adj, while
                    # the test itself is called with the unadjusted L --
                    # confirm that this is intended.
                    res = xlmhg.get_xlmhg_test_result(
                        N,
                        indices,
                        X,
                        L,
                        pval_thresh=final_pval_thresh,
                        escore_pval_thresh=escore_pval_thresh,
                        exact_pval=exact_pval,
                        table=table)

                    # check if gene set is significantly enriched
                    if res.pval <= final_pval_thresh:
                        # generate RankedGSEResult
                        # ``indices`` are row positions in the filtered
                        # membership matrix, so the gene names must be taken
                        # from ``filtered_genes``; ``ranked_genes`` may still
                        # contain unknown genes that shift the positions.
                        ind_genes = [filtered_genes[i] for i in indices]
                        gse_result = RankBasedGSEResult(
                            gene_set_coll[j],
                            N,
                            indices,
                            ind_genes,
                            X,
                            L,
                            res.stat,
                            res.cutoff,
                            res.pval,
                            escore_pval_thresh=escore_pval_thresh)
                        enriched.append(gse_result)

        # report results
        q = len(enriched)
        ignored = m - num_tests
        if ignored > 0:
            logger.debug(
                '%d / %d gene sets (%.1f%%) had less than X genes '
                'annotated with them and were ignored.', ignored, m,
                100 * (ignored / float(m)))

        logger.info(
            '%d / %d gene sets were found to be significantly '
            'enriched (p-value <= %.1e).', q, m, final_pval_thresh)

        return enriched