Example #1
0
def my_gene_sets():
    """Build a ``GeneSetCollection`` holding two small test gene sets.

    Both gene sets use the source ``'TestSource'`` and the collection
    ``'TestCollection'``; they differ in ID, name, genes and description.
    """
    first = GeneSet(
        'GeneSet1', 'First gene set', ['a', 'b', 'd'],
        source='TestSource',
        collection='TestCollection',
        description='The first test GeneSet.',
    )
    second = GeneSet(
        'GeneSet2', 'Second gene set', ['a', 'c', 'd'],
        source='TestSource',
        collection='TestCollection',
        description='The second test GeneSet.',
    )
    return GeneSetCollection([first, second])
Example #2
0
def my_gene_sets(my_gene_set):
    """Return a list of three ``GeneSet`` objects derived from ``my_gene_set``.

    Every entry reuses the name and genes of ``my_gene_set``; its ID is the
    original ID with ``1``, ``2`` or ``3`` appended.
    """
    return [
        GeneSet(my_gene_set.id + str(suffix), my_gene_set.name,
                my_gene_set.genes)
        for suffix in range(1, 4)
    ]
Example #3
0
def my_gene_set(my_genes):
    """Return a ``GeneSet`` over ``my_genes`` with every optional field set.

    The ID, name, source, collection and description are fixed test strings.
    """
    return GeneSet(
        'TestID', 'TestName', my_genes,
        source='TestSource',
        collection='TestCollection',
        description='Test GeneSet.',
    )
Example #4
0
def my_rank_based_result(my_matrix, my_v):
    """Build a ``RankBasedGSEResult`` from the non-zero positions of ``my_v``.

    The positions of the non-zero entries of ``my_v`` (converted to
    ``uint16``) select the genes from ``my_matrix.genes`` that make up the
    gene set.  The XL-mHG test is run with ``X = 1`` and ``L`` equal to the
    number of entries in ``my_v``.
    """
    # positions of the non-zero entries along the first axis of my_v
    hit_positions = np.uint16(np.nonzero(my_v)[0])
    hit_genes = [my_matrix.genes[pos] for pos in hit_positions]

    # the gene set receives its own copy of the selected genes
    gene_set = GeneSet(id='Random1', name='Random gene Set 1',
                       genes=list(hit_genes))

    total = my_v.size
    x_param = 1
    l_param = total
    test_res = get_xlmhg_test_result(total, hit_positions, x_param, l_param)

    return RankBasedGSEResult(
        gene_set, total, hit_positions, hit_genes, x_param, l_param,
        test_res.stat, test_res.cutoff, test_res.pval,
    )
Example #5
0
def my_uninteresting_gene_set(my_ranked_genes):
    """Return a ``GeneSet`` made of the last five entries of the ranking."""
    tail_genes = my_ranked_genes[-5:]
    return GeneSet('BoringID', 'boring gene set', tail_genes)
Example #6
0
def my_gene_set(my_ranked_genes, my_v):
    """Return a ``GeneSet`` of the ranked genes at the non-zero positions of ``my_v``."""
    selected = []
    for pos in np.nonzero(my_v)[0]:
        selected.append(my_ranked_genes[pos])
    return GeneSet('TestID', 'TestName', selected)
Example #7
0
    def get_gene_sets(self, min_genes=None, max_genes=None):
        """Return the set of annotated genes for each GO term.

        Terms without any associated genes are always skipped. When several
        selected terms share exactly the same set of genes, a term is dropped
        if any of the other terms with that gene set is among its
        descendants.

        Parameters
        ----------
        min_genes: int, optional
            Exclude GO terms with fewer than this number of genes.
        max_genes: int, optional
            Exclude GO terms with more than this number of genes.

        Returns
        -------
        GeneSetCollection
            A gene set "database" with one gene set for each GO term.

        Raises
        ------
        ValueError
            If ``self.terms`` or ``self.annotations`` is empty.
        """

        if not self.terms:
            raise ValueError('You need to first parse both an OBO file and '
                             'a gene association file!')

        if not self.annotations:
            raise ValueError('You need to first parse a gene association '
                             'file!')

        all_term_ids = sorted(self.terms.keys())

        # go over all GO terms and get associated genes
        logger.info('Obtaining GO term associations...')

        term_genes = OrderedDict()
        # maps each distinct gene set to the IDs of all selected terms that
        # have exactly that gene set (used to find redundant terms)
        geneset_terms = {}
        for id_ in all_term_ids:
            tg = self.get_goterm_genes(id_)
            # the gene set is used as a dictionary key, so it must be hashable
            assert isinstance(tg, frozenset)
            c = len(tg)

            if c == 0:
                continue

            if (min_genes is not None and c < min_genes) or \
                    (max_genes is not None and c > max_genes):
                # term doesn't meet min/max number of genes criteria
                continue

            geneset_terms.setdefault(tg, []).append(id_)
            term_genes[id_] = tg

        selected = len(term_genes)
        affected = 0
        excl = 0
        gene_sets = []
        for id_, tg in term_genes.items():
            term = self.terms[id_]
            same_genes = geneset_terms[tg]

            # check if there are redundant terms
            if len(same_genes) > 1:
                affected += 1
                # exclude this term if it is an ancestor of any other term
                # that has the same gene set
                if any(other_id != id_ and other_id in term.descendants
                       for other_id in same_genes):
                    excl += 1
                    continue

            # if the term is not redundant with any other term,
            # or if it isn't the ancestor of any redundant term,
            # add its gene set to the list
            gs = GeneSet(id_, term.name, tg, source='GO',
                         collection=term.domain_short,
                         description=term.definition)
            gene_sets.append(gs)

        D = GeneSetCollection(gene_sets)
        logger.info('# terms selected initially: %d', selected)
        logger.info('# terms with redundant gene sets: %d', affected)
        logger.info('# terms excluded due to redundancy: %d', excl)
        logger.info('# terms retained: %d', D.n)

        return D
Example #8
0
def test_list(my_gene_set):
    """Check that a gene set survives a ``to_list``/``from_list`` round trip."""
    as_list = my_gene_set.to_list()
    assert isinstance(as_list, list)
    assert len(as_list) == 6
    assert GeneSet.from_list(as_list) == my_gene_set
Example #9
0
def test_list(my_gene_set):
    """Verify the list form of a gene set and its reconstruction.

    The list form must be a ``list`` with six entries, and rebuilding a
    ``GeneSet`` from it must compare equal to the original.
    """
    fields = my_gene_set.to_list()
    assert isinstance(fields, list)
    assert len(fields) == 6

    rebuilt = GeneSet.from_list(fields)
    assert rebuilt == my_gene_set
Example #10
0
def my_gene_set2(my_genes):
    """Return a ``GeneSet`` over ``my_genes`` built from ID, name and genes only.

    No optional arguments are passed, so they keep whatever defaults
    ``GeneSet`` defines.
    """
    return GeneSet('TestID', 'TestName', my_genes)