Ejemplo n.º 1
0
def go_gene_sets(tax_id: str) -> None:
    """ Write Gene Ontology gene sets for one organism to GMT files.

    The ontology is loaded from ``gene_ontology.obo`` and the annotations
    from ``<tax_id>.tab``, both located under ``{data_path}/go``. Every term
    with at least one annotated gene becomes a gene set whose hierarchy is
    ``('GO', <term namespace>)``. The sets are split by hierarchy and each
    group is written under ``{data_path}/gene_sets``.

    :param tax_id: taxonomy id; selects the annotations file, is stored as
        the organism of every gene set and is passed to ``filename``.
    """
    domain = 'go'
    ontology = go.Ontology(filename=f'{data_path}/{domain}/gene_ontology.obo')
    annotations = go.Annotations(tax_id,
                                 filename=f'{data_path}/{domain}/{tax_id}.tab',
                                 ontology=ontology)

    collected = []
    for term in ontology.terms.values():
        annotated = annotations.get_genes_by_go_term(term.id)

        # terms without any annotated gene produce no gene set
        if len(annotated) == 0:
            continue

        collected.append(
            GeneSet(gs_id=term.id,
                    name=term.name,
                    genes=set(annotated),
                    hierarchy=('GO', term.namespace),
                    organism=tax_id,
                    link=f'http://amigo.geneontology.org/amigo/term/{term.id}'))

    # one output file per hierarchy
    for group in GeneSets(collected).split_by_hierarchy():
        group_hierarchy = group.common_hierarchy()
        target = f'{data_path}/gene_sets/{filename(group_hierarchy, tax_id)}'
        group.to_gmt_file_format(target)
Ejemplo n.º 2
0
def dicty_mutant_gene_sets(org):
    """ Return Dictybase mutant phenotype gene sets for organism '352472'.

    One gene set is created per phenotype. Its genes are the integer NCBI
    ids of the matched mutant genes. For any other ``org`` the function
    returns ``None``.
    """
    if org != '352472':
        return None

    gene_matcher = GeneMatcher('352472')
    collected = []

    for phenotype, mutants in dicty.phenotypes.phenotype_mutants().items():
        # only the first gene listed for each mutant is used
        symbols = [dicty.phenotypes.mutant_genes(mutant)[0]
                   for mutant in mutants]
        gene_matcher.genes = symbols
        gene_matcher.run_matcher()

        ncbi_ids = [int(matched.ncbi_id)
                    for matched in gene_matcher.genes
                    if matched.ncbi_id is not None]

        # print both counts when not every symbol yielded an NCBI id
        if len(symbols) != len(ncbi_ids):
            print(len(symbols), len(ncbi_ids))

        collected.append(
            GeneSet(gs_id=phenotype,
                    name=phenotype,
                    genes=ncbi_ids,
                    hierarchy=('Dictybase', 'Phenotypes'),
                    organism='352472',
                    link=''))

    return GeneSets(collected)
Ejemplo n.º 3
0
def cytoband_gene_sets(tax_id: str) -> None:
    """ Create human cytoband gene sets and write them to GMT files.

    Only tax id '9606' is handled; for any other value nothing is done.
    The source is a tab-separated GMT file hosted at statweb.stanford.edu:
    column one is used as the gene set id, column two as its name and the
    remaining columns as gene symbols, which are run through a
    ``GeneMatcher``. Output goes under ``{data_path}/gene_sets``.
    """
    if tax_id != '9606':
        return

    download_link = 'http://statweb.stanford.edu/~tibs/GSA/cytobands-stanford.gmt'
    gene_matcher = GeneMatcher('9606')

    with urlopen(download_link) as stream:
        raw_lines = stream.read().splitlines()
        genesets = []

        for raw_line in raw_lines:
            fields = raw_line.decode().split('\t')
            symbols = fields[2:]
            gene_matcher.genes = symbols

            # keep only the genes for which the matcher reports a gene id
            matched_ids = {
                matched.gene_id for matched in gene_matcher.genes
                if matched.gene_id is not None
            }

            genesets.append(
                GeneSet(gs_id=fields[0],
                        name=fields[1],
                        genes=matched_ids if symbols else set(),
                        hierarchy=('Cytobands', ),
                        organism='9606',
                        link=''))

    for group in GeneSets(genesets).split_by_hierarchy():
        group_hierarchy = group.common_hierarchy()
        target = f'{data_path}/gene_sets/{filename(group_hierarchy, tax_id)}'
        group.to_gmt_file_format(target)
Ejemplo n.º 4
0
def omim_gene_sets(org):
    """ Return gene sets built from OMIM (Online Mendelian Inheritance in
    Man) diseases.

    Only organism '9606' is handled; for any other ``org`` the function
    returns ``None``. Each disease becomes one gene set containing the
    integer NCBI ids of its matched genes.
    """
    if org != '9606':
        return None

    gene_matcher = GeneMatcher('9606')
    collected = []

    for disease in omim.diseases():
        gene_matcher.genes = omim.disease_genes(disease)
        gene_matcher.run_matcher()

        ncbi_ids = [int(matched.ncbi_id)
                    for matched in gene_matcher.genes
                    if matched.ncbi_id is not None]

        # a link is produced only when the disease has a truthy id
        if disease.id:
            disease_link = OMIM_LINK.format(disease.id)
        else:
            disease_link = None

        collected.append(
            GeneSet(gs_id=disease.id,
                    name=disease.name,
                    genes=ncbi_ids,
                    hierarchy=('OMIM', ),
                    organism='9606',
                    link=disease_link))

    return GeneSets(collected)
Ejemplo n.º 5
0
def kegg_gene_sets(org):
    """ Return gene sets built from the KEGG pathways of ``org``.

    The cache is cleared first. Pathways whose ``pathway_attributes()`` is
    falsy are skipped. KEGG gene names are translated to NCBI ids through
    the organism's mapper; names with no mapping are left out.
    """
    caching.clear_cache()
    kegg_org = kegg.KEGGOrganism(taxonomy.name(org))
    ncbi_id_mapper = kegg_org.kegg_to_ncbi_mapper()
    hierarchy = ('KEGG', 'pathways')
    collected = []

    for pathway_id in kegg_org.pathways():
        pathway = kegg.KEGGPathway(pathway_id)

        if not pathway.pathway_attributes():
            continue

        ncbi_ids = []
        for kegg_name in kegg_org.get_genes_by_pathway(pathway_id):
            try:
                ncbi_id = ncbi_id_mapper[kegg_name.upper()]
            except KeyError:
                # no NCBI id is known for this KEGG name; skip the gene
                continue
            ncbi_ids.append(ncbi_id)

        collected.append(
            GeneSet(gs_id=pathway_id,
                    name=pathway.title,
                    genes=ncbi_ids,
                    hierarchy=hierarchy,
                    organism=org,
                    link=pathway.link))

    return GeneSets(collected)
Ejemplo n.º 6
0
def cytoband_gene_sets(org):
    """ Return human cytoband gene sets read from CYTOBAND_DOWNLOAD_LINK.

    Only organism '9606' is handled; for any other ``org`` the function
    returns ``None``. Each tab-separated line provides the gene set id
    (column one), its name (column two) and gene symbols (remaining
    columns); matched symbols are stored as integer NCBI ids.
    """
    if org != '9606':
        return None

    gene_matcher = GeneMatcher('9606')

    with urlopen(CYTOBAND_DOWNLOAD_LINK) as stream:
        raw_lines = stream.read().splitlines()
        collected = []

        for raw_line in raw_lines:
            fields = raw_line.decode().split('\t')
            symbols = fields[2:]
            gene_matcher.genes = symbols
            gene_matcher.run_matcher()

            ncbi_ids = [int(matched.ncbi_id)
                        for matched in gene_matcher.genes
                        if matched.ncbi_id is not None]

            collected.append(
                GeneSet(gs_id=fields[0],
                        name=fields[1],
                        genes=ncbi_ids if symbols else [],
                        hierarchy=('Cytobands', ),
                        organism='9606',
                        link=''))

        return GeneSets(collected)
Ejemplo n.º 7
0
def reactome_gene_sets(org):
    """ Prepare human pathway gene sets from Reactome pathways.

    Only organism '9606' is handled; for any other ``org`` the function
    returns ``None``. The zip archive at REACTOME_DOWNLOAD_LINK is
    downloaded into memory and its member REACTOME_FILE_NAME is read as a
    tab-separated file: the first column is used for both the gene set id
    and its name, columns from the third onward are gene symbols. Matched
    symbols are stored as integer NCBI ids.

    :return: ``GeneSets`` with one gene set per line of the file
    """
    if org != '9606':
        return None

    gene_matcher = GeneMatcher('9606')

    with urlopen(REACTOME_DOWNLOAD_LINK) as url:
        memfile = io.BytesIO(url.read())

    with ZipFile(memfile, 'r') as myzip:
        # the member handle is closed explicitly instead of being leaked
        with myzip.open(REACTOME_FILE_NAME) as f:
            content = f.read().decode().splitlines()

    genesets = []

    for path in content:
        # split once; an out-of-range slice is already an empty list
        fields = path.split('\t')
        gene_matcher.genes = fields[2:]
        gene_matcher.run_matcher()

        genes = [int(gene.ncbi_id)
                 for gene in gene_matcher.genes
                 if gene.ncbi_id is not None]

        genesets.append(
            GeneSet(gs_id=fields[0],
                    name=fields[0],
                    genes=genes,
                    hierarchy=('Reactome', 'Pathways'),
                    organism='9606',
                    link=''))

    return GeneSets(genesets)
Ejemplo n.º 8
0
    def test_gene_sets(self):
        """ Check GeneSets behaviour on a collection of three gene sets.

        The collection combines a set built from the test fixture values
        with two sets sharing hierarchy ('Test', 'test') and organism
        '3702'. It is expected to have more than one hierarchy, to raise
        GeneSetException from ``common_org`` and ``common_hierarchy``, and
        to split into fewer groups than it has gene sets.
        """
        fixture_set = GeneSet(
            gs_id=self.test_gs_id,
            name=self.test_name,
            genes=self.test_genes,
            hierarchy=self.test_hierarchy,
            organism=self.test_organism,
            link='',
        )

        shared = dict(hierarchy=('Test', 'test'), organism='3702')
        second_set = GeneSet(gs_id='test2', name='test_name2', **shared)
        third_set = GeneSet(gs_id='test3', name='test_name3', **shared)

        sets = GeneSets([fixture_set, second_set, third_set])
        self.assertIsNotNone(sets)

        with self.assertRaises(GeneSetException):
            sets.common_org()
        with self.assertRaises(GeneSetException):
            sets.common_hierarchy()

        self.assertGreater(len(sets.hierarchies()), 1)

        groups = sets.split_by_hierarchy()
        self.assertLess(len(groups), len(sets))
Ejemplo n.º 9
0
    def test_gmt_file_format(self):
        """ Round-trip a single gene set through the GMT file format.

        The set is written to a temporary file, the first line is checked
        to contain at least one tab-separated column, and the file is read
        back to verify the hierarchy and organism survive the round trip.
        The temporary file is removed even when an assertion fails.
        """
        gs = GeneSet(
            gs_id=self.test_gs_id,
            name=self.test_name,
            genes=self.test_genes,
            hierarchy=self.test_hierarchy,
            organism=self.test_organism,
            link='',
        )

        fd, file_name = mkstemp()
        # only the path is used below, so release the descriptor right away
        os.close(fd)

        try:
            # write to file
            write_sets = GeneSets([gs])
            write_sets.to_gmt_file_format(file_name)

            with open(file_name, 'r') as temp_f:
                line = temp_f.readline()
                columns = line.strip().split('\t')
                self.assertGreater(len(columns), 0)

            # read from file
            read_sets = GeneSets.from_gmt_file_format(file_name)
            self.assertIsNotNone(read_sets)
            self.assertGreater(len(read_sets), 0)
            self.assertEqual(read_sets.common_hierarchy(), self.test_hierarchy)
            self.assertEqual(read_sets.common_org(), self.test_organism)
        finally:
            # clean-up runs on success and on failure alike
            os.remove(file_name)
Ejemplo n.º 10
0
def dicty_mutant_gene_sets(tax_id: str):
    """ Write Dictybase mutant phenotype gene sets to GMT files.

    Only tax id '44689' is handled; for any other value nothing is done.
    One gene set is created per phenotype, holding the matched gene ids as
    strings. Output goes under ``{data_path}/gene_sets``.
    """
    if tax_id != '44689':
        return

    gene_matcher = GeneMatcher('44689')
    collected = []

    for raw_phenotype, mutants in phenotypes.phenotype_mutants().items():
        # commas in the phenotype label are replaced with spaces
        label = raw_phenotype.replace(",", " ")

        # only the first gene listed for each mutant is used
        gene_matcher.genes = [
            phenotypes.mutant_genes(mutant)[0] for mutant in mutants
        ]

        matched_ids = {
            str(matched.gene_id) for matched in gene_matcher.genes
            if matched.gene_id is not None
        }

        collected.append(
            GeneSet(gs_id=label,
                    name=label,
                    genes=matched_ids,
                    hierarchy=('Dictybase', 'Phenotypes'),
                    organism=tax_id,
                    link=''))

    for group in GeneSets(collected).split_by_hierarchy():
        group_hierarchy = group.common_hierarchy()
        target = f'{data_path}/gene_sets/{filename(group_hierarchy, tax_id)}'
        group.to_gmt_file_format(target)
Ejemplo n.º 11
0
    def load_gene_sets(self, tax_id):
        # type: (str) -> None
        """ Reload all gene sets listed for ``tax_id`` into the widget.

        Replaces ``gs_object`` with a fresh ``GeneSets``, clears the widget,
        rebuilds the hierarchy tree model from the listing, loads every
        listed entry into ``gs_object`` and finally applies the selected
        hierarchies.
        """
        self.gs_object = GeneSets()
        self.clear()

        available = list_all(organism=tax_id)
        self.set_hierarchy_model(self.hierarchy_tree_widget,
                                 self.hierarchy_tree(available))

        for entry in available:
            # module-level load_gene_sets, not this method
            loaded = load_gene_sets(entry, tax_id)
            self.gs_object.update(list(loaded))

        self.set_selected_hierarchies()
Ejemplo n.º 12
0
    def __init__(self, box, parent, settings_var, **kwargs):
        # type: (Union[QGroupBox, QWidget], QWidget, str) -> None
        """ Set up the state and the hierarchy tree widget.

        :param box: container whose layout receives the tree widget
        :param parent: stored on the instance as ``parent``
        :param settings_var: stored on the instance as ``stored_selection``
        :param kwargs: forwarded unchanged to the base class initializer
        """
        super().__init__(**kwargs)

        self.parent = parent
        self.stored_selection = settings_var
        # gene sets object
        self.gs_object = GeneSets()  # type: GeneSets

        # tree widget: hidden header, editing disabled
        tree = QTreeWidget(self)
        tree.setHeaderHidden(True)
        tree.setEditTriggers(QTreeView.NoEditTriggers)
        self.hierarchy_tree_widget = tree
        box.layout().addWidget(tree)

        self.custom_set_hier = None
        self.default_selection = [
            ('GO', 'molecular_function'),
            ('GO', 'biological_process'),
            ('GO', 'cellular_component'),
        ]
Ejemplo n.º 13
0
def reactome_gene_sets(tax_id: str) -> None:
    """ Prepare human pathway gene sets from Reactome and write GMT files.

    Only tax id '9606' is handled; for any other value nothing is done.
    The Reactome GMT zip archive is downloaded into memory and its member
    ``ReactomePathways.gmt`` is read as a tab-separated file: column one is
    the pathway name, column two the pathway id, the remaining columns are
    gene symbols. Commas in name and id are replaced with spaces. Matched
    gene ids are stored as strings. Output goes under
    ``{data_path}/gene_sets``.

    :param tax_id: taxonomy id of the organism
    """
    if tax_id != '9606':
        return

    download_link = 'http://www.reactome.org/download/current/ReactomePathways.gmt.zip'
    file_name = 'ReactomePathways.gmt'
    detail_link = 'https://reactome.org/content/detail/{}'

    gene_matcher = GeneMatcher('9606')

    with urlopen(download_link) as url:
        memfile = io.BytesIO(url.read())

    with ZipFile(memfile, 'r') as myzip:
        # the member handle is closed explicitly instead of being leaked
        with myzip.open(file_name) as f:
            content = f.read().decode().splitlines()

    genesets = []

    for path in content:
        # split once; an out-of-range slice is already an empty list
        fields = path.split('\t')
        gene_matcher.genes = fields[2:]

        genes = {
            str(gene.gene_id) for gene in gene_matcher.genes
            if gene.gene_id is not None
        }

        pathway = fields[0].replace(',', ' ')
        pathway_id = fields[1].replace(',', ' ')

        genesets.append(
            GeneSet(gs_id=pathway_id,
                    name=pathway,
                    genes=genes,
                    hierarchy=('Reactome', 'pathways'),
                    organism='9606',
                    link=detail_link.format(pathway_id)))

    for gs_group in GeneSets(genesets).split_by_hierarchy():
        hierarchy = gs_group.common_hierarchy()
        gs_group.to_gmt_file_format(
            f'{data_path}/gene_sets/{filename(hierarchy, tax_id)}')
def go_gene_sets(org):
    """ Return gene sets built from Gene Ontology terms for ``org``.

    Every ontology term with at least one annotated gene becomes a gene
    set with hierarchy ``('GO', <term namespace>)`` and a link formatted
    from GO_TERM_LINK.
    """
    ontology = go.Ontology()
    annotations = go.Annotations(org, ontology=ontology)

    collected = []
    for term_id, term in ontology.terms.items():
        annotated = annotations.get_genes_by_go_term(term_id)
        hierarchy = ('GO', term.namespace)

        # terms without any annotated gene produce no gene set
        if len(annotated) == 0:
            continue

        collected.append(
            GeneSet(gs_id=term_id,
                    name=term.name,
                    genes=annotated,
                    hierarchy=hierarchy,
                    organism=org,
                    link=GO_TERM_LINK.format(term_id)))

    return GeneSets(collected)
Ejemplo n.º 15
0
def kegg_gene_sets(tax_id: str) -> None:
    """ Write gene sets built from KEGG pathways to GMT files.

    The cache is cleared first. Pathways whose ``pathway_attributes()`` is
    falsy are skipped. KEGG gene names are translated to NCBI ids through
    the organism's mapper; names with no mapping are left out. Output goes
    under ``{data_path}/gene_sets``.
    """
    caching.clear_cache()
    kegg_org = kegg.KEGGOrganism(taxonomy.name(tax_id))
    ncbi_id_mapper = kegg_org.kegg_to_ncbi_mapper()
    pathway_hierarchy = ('KEGG', 'Pathways')
    collected = []

    for pathway_id in kegg_org.pathways():
        pathway = kegg.KEGGPathway(pathway_id)

        if not pathway.pathway_attributes():
            continue

        ncbi_ids = set()
        for kegg_name in kegg_org.get_genes_by_pathway(pathway_id):
            try:
                ncbi_id = ncbi_id_mapper[kegg_name.upper()]
            except KeyError:
                # no NCBI id is known for this KEGG name; skip the gene
                continue
            ncbi_ids.add(ncbi_id)

        collected.append(
            GeneSet(gs_id=pathway_id,
                    name=pathway.title,
                    genes=ncbi_ids,
                    hierarchy=pathway_hierarchy,
                    organism=tax_id,
                    link=pathway.link))

    for group in GeneSets(collected).split_by_hierarchy():
        group_hierarchy = group.common_hierarchy()
        target = f'{data_path}/gene_sets/{filename(group_hierarchy, tax_id)}'
        group.to_gmt_file_format(target)
Ejemplo n.º 16
0
def gene_marker_sets():
    """ Write marker-gene gene sets from the Panglao and CellMarker tables.

    For each table under ``{data_path}/marker_genes`` the rows are grouped
    by organism ('Human' -> '9606', 'Mouse' -> '10090') and cell type.
    Every cell type becomes a gene set of its 'Entrez ID' values as
    strings, with '?' entries dropped. Output goes under
    ``{data_path}/gene_sets``.
    """
    # (table file name, second level of the gene set hierarchy)
    sources = (
        ('panglao_gene_markers.tab', 'Panglao'),
        ('cellMarker_gene_markers.tab', 'CellMarker'),
    )
    name_to_tax = {'Human': '9606', 'Mouse': '10090'}

    for file_name, source_name in sources:
        file_path = f'{data_path}/marker_genes/{file_name}'

        # tax id -> cell type -> list of gene ids, rebuilt for every table
        sets_by_org = {'9606': defaultdict(list), '10090': defaultdict(list)}

        for row in Table(file_path):
            organism = name_to_tax[row['Organism']]
            cell_type = row['Cell Type']
            gene_id = row['Entrez ID']
            sets_by_org[organism][cell_type].append(gene_id)

        for tax_id, cell_types in sets_by_org.items():
            collected = [
                GeneSet(gs_id=str(cell_type),
                        name=str(cell_type),
                        genes={str(gene) for gene in genes if gene != '?'},
                        hierarchy=('Marker Genes', source_name),
                        organism=tax_id,
                        link='')
                for cell_type, genes in cell_types.items()
            ]

            for group in GeneSets(collected).split_by_hierarchy():
                group_hierarchy = group.common_hierarchy()
                target = (f'{data_path}/gene_sets/'
                          f'{filename(group_hierarchy, tax_id)}')
                group.to_gmt_file_format(target)
Ejemplo n.º 17
0
 def clear_gene_sets(self):
     """ Replace ``gs_object`` with a new, empty ``GeneSets`` instance. """
     self.gs_object = GeneSets()