def go_gene_sets(tax_id: str) -> None:
    """Write GO gene sets for ``tax_id`` as GMT files, one per hierarchy.

    The ontology and the organism's annotations are read from files under
    ``data_path``. A gene set is created for every ontology term that has at
    least one annotated gene; the sets are then split by hierarchy and each
    group is written under ``{data_path}/gene_sets/``.
    """
    domain = 'go'
    ontology = go.Ontology(filename=f'{data_path}/{domain}/gene_ontology.obo')
    annotations = go.Annotations(
        tax_id,
        filename=f'{data_path}/{domain}/{tax_id}.tab',
        ontology=ontology,
    )

    term_sets = []
    for term in ontology.terms.values():
        term_genes = annotations.get_genes_by_go_term(term.id)
        if len(term_genes) == 0:
            # Terms without any annotated gene do not produce a gene set.
            continue
        term_sets.append(
            GeneSet(
                gs_id=term.id,
                name=term.name,
                genes=set(term_genes),
                hierarchy=('GO', term.namespace),
                organism=tax_id,
                link=f'http://amigo.geneontology.org/amigo/term/{term.id}',
            )
        )

    for group in GeneSets(term_sets).split_by_hierarchy():
        group_hierarchy = group.common_hierarchy()
        group.to_gmt_file_format(
            f'{data_path}/gene_sets/{filename(group_hierarchy, tax_id)}'
        )
def dicty_mutant_gene_sets(org):
    """Return Dictybase mutant-phenotype gene sets.

    Only organism '352472' is handled; for any other ``org`` nothing is
    built and ``None`` is returned. For each phenotype, the first gene of
    every mutant is matched with ``GeneMatcher`` and the NCBI ids that were
    found become the gene set.
    """
    if org != '352472':
        return None

    matcher = GeneMatcher('352472')
    phenotype_sets = []

    for phenotype, mutants in dicty.phenotypes.phenotype_mutants().items():
        symbols = [dicty.phenotypes.mutant_genes(mutant)[0] for mutant in mutants]
        matcher.genes = symbols
        matcher.run_matcher()

        ncbi_ids = [
            int(gene.ncbi_id)
            for gene in matcher.genes
            if gene.ncbi_id is not None
        ]

        # Report how many symbols went in and how many ids came out whenever
        # some symbols could not be matched.
        if len(symbols) != len(ncbi_ids):
            print(len(symbols), len(ncbi_ids))

        phenotype_sets.append(
            GeneSet(
                gs_id=phenotype,
                name=phenotype,
                genes=ncbi_ids,
                hierarchy=('Dictybase', 'Phenotypes'),
                organism='352472',
                link='',
            )
        )

    return GeneSets(phenotype_sets)
def cytoband_gene_sets(tax_id: str) -> None:
    """Download the Stanford cytoband GMT file and write cytoband gene sets.

    Only tax id '9606' is handled; any other value is a no-op. Each
    tab-separated line supplies the set id (column 0), the name (column 1)
    and gene symbols (remaining columns). Symbols are matched with
    ``GeneMatcher`` and the resulting gene ids are written as GMT files
    under ``{data_path}/gene_sets/``, one file per hierarchy.
    """
    if tax_id != '9606':
        return

    download_link = 'http://statweb.stanford.edu/~tibs/GSA/cytobands-stanford.gmt'
    matcher = GeneMatcher('9606')

    with urlopen(download_link) as stream:
        raw_lines = stream.read().splitlines()

    band_sets = []
    for raw_line in raw_lines:
        fields = raw_line.decode().split('\t')
        symbols = fields[2:]
        matcher.genes = symbols
        matched_ids = {
            gene.gene_id for gene in matcher.genes if gene.gene_id is not None
        }
        band_sets.append(
            GeneSet(
                gs_id=fields[0],
                name=fields[1],
                # A band without any listed symbol always gets an empty set.
                genes=matched_ids if symbols else set(),
                hierarchy=('Cytobands', ),
                organism='9606',
                link='',
            )
        )

    for group in GeneSets(band_sets).split_by_hierarchy():
        group_hierarchy = group.common_hierarchy()
        group.to_gmt_file_format(
            f'{data_path}/gene_sets/{filename(group_hierarchy, tax_id)}'
        )
def omim_gene_sets(org):
    """Return gene sets for OMIM (Online Mendelian Inheritance in Man) diseases.

    Only organism '9606' is handled; for any other ``org`` the result is
    ``None``. Each disease becomes one gene set holding the NCBI ids that
    ``GeneMatcher`` found for the disease's gene symbols.
    """
    if org != '9606':
        return None

    matcher = GeneMatcher('9606')
    disease_sets = []

    for disease in omim.diseases():
        matcher.genes = omim.disease_genes(disease)
        matcher.run_matcher()

        ncbi_ids = [
            int(gene.ncbi_id)
            for gene in matcher.genes
            if gene.ncbi_id is not None
        ]

        # A link is only produced when the disease carries a truthy id.
        if disease.id:
            disease_link = OMIM_LINK.format(disease.id)
        else:
            disease_link = None

        disease_sets.append(
            GeneSet(
                gs_id=disease.id,
                name=disease.name,
                genes=ncbi_ids,
                hierarchy=('OMIM', ),
                organism='9606',
                link=disease_link,
            )
        )

    return GeneSets(disease_sets)
def kegg_gene_sets(org):
    """Return gene sets built from the KEGG pathways of ``org``.

    The cache is cleared first. Every pathway for which
    ``pathway_attributes()`` is truthy becomes one gene set whose genes are
    the pathway's KEGG gene names translated through the KEGG-to-NCBI
    mapper.
    """
    caching.clear_cache()
    kegg_org = kegg.KEGGOrganism(taxonomy.name(org))
    ncbi_id_mapper = kegg_org.kegg_to_ncbi_mapper()
    pathway_hierarchy = ('KEGG', 'pathways')

    pathway_sets = []
    for pathway_id in kegg_org.pathways():
        pathway = kegg.KEGGPathway(pathway_id)
        if not pathway.pathway_attributes():
            continue

        ncbi_ids = []
        for kegg_name in kegg_org.get_genes_by_pathway(pathway_id):
            try:
                ncbi_ids.append(ncbi_id_mapper[kegg_name.upper()])
            except KeyError:
                # Some KEGG names have no entry in the mapper; those genes
                # are left out of the gene set.
                continue

        pathway_sets.append(
            GeneSet(
                gs_id=pathway_id,
                name=pathway.title,
                genes=ncbi_ids,
                hierarchy=pathway_hierarchy,
                organism=org,
                link=pathway.link,
            )
        )

    return GeneSets(pathway_sets)
def cytoband_gene_sets(org):
    """Return cytoband gene sets downloaded from ``CYTOBAND_DOWNLOAD_LINK``.

    Only organism '9606' is handled; for any other ``org`` the result is
    ``None``. Each tab-separated line supplies the set id (column 0), the
    name (column 1) and gene symbols (remaining columns); the symbols are
    matched with ``GeneMatcher`` and stored as integer NCBI ids.
    """
    if org != '9606':
        return None

    matcher = GeneMatcher('9606')

    with urlopen(CYTOBAND_DOWNLOAD_LINK) as stream:
        raw_lines = stream.read().splitlines()

    band_sets = []
    for raw_line in raw_lines:
        fields = raw_line.decode().split('\t')
        symbols = fields[2:]
        matcher.genes = symbols
        matcher.run_matcher()

        ncbi_ids = [
            int(gene.ncbi_id)
            for gene in matcher.genes
            if gene.ncbi_id is not None
        ]

        band_sets.append(
            GeneSet(
                gs_id=fields[0],
                name=fields[1],
                # A band without any listed symbol always gets an empty list.
                genes=ncbi_ids if symbols else [],
                hierarchy=('Cytobands', ),
                organism='9606',
                link='',
            )
        )

    return GeneSets(band_sets)
def reactome_gene_sets(org):
    """Prepare human pathway gene sets from Reactome pathways.

    Only organism '9606' is handled; for any other ``org`` the result is
    ``None``. The zipped GMT file is downloaded from
    ``REACTOME_DOWNLOAD_LINK``, the member ``REACTOME_FILE_NAME`` is read,
    and every tab-separated line becomes one gene set whose genes are the
    integer NCBI ids that ``GeneMatcher`` found for the symbols in columns
    2 and onward.

    Returns:
        GeneSets with one entry per line of the Reactome file, or ``None``
        when ``org`` is not '9606'.
    """
    if org != '9606':
        return None

    gene_matcher = GeneMatcher('9606')

    with urlopen(REACTOME_DOWNLOAD_LINK) as url:
        memfile = io.BytesIO(url.read())

    with ZipFile(memfile, 'r') as myzip:
        # Open the member as a context manager so its handle is closed
        # deterministically instead of being left to garbage collection.
        with myzip.open(REACTOME_FILE_NAME) as f:
            content = f.read().decode().splitlines()

    genesets = []
    for path in content:
        # Split each line once and reuse the columns.
        columns = path.split('\t')

        # The slice is already an empty list when the line has no symbols.
        gene_matcher.genes = columns[2:]
        gene_matcher.run_matcher()

        genes = [
            int(gene.ncbi_id)
            for gene in gene_matcher.genes
            if gene.ncbi_id is not None
        ]

        # NOTE(review): both gs_id and name come from the first column and
        # the second column is unused - confirm this is intended.
        genesets.append(
            GeneSet(
                gs_id=columns[0],
                name=columns[0],
                genes=genes,
                hierarchy=('Reactome', 'Pathways'),
                organism='9606',
                link='',
            )
        )

    return GeneSets(genesets)
def test_gene_sets(self):
    # Build a collection from the fixture gene set plus two sets that share
    # the ('Test', 'test') hierarchy and organism '3702'.
    fixture_set = GeneSet(
        gs_id=self.test_gs_id,
        name=self.test_name,
        genes=self.test_genes,
        hierarchy=self.test_hierarchy,
        organism=self.test_organism,
        link='',
    )
    shared_fields = {'hierarchy': ('Test', 'test'), 'organism': '3702'}
    second_set = GeneSet(gs_id='test2', name='test_name2', **shared_fields)
    third_set = GeneSet(gs_id='test3', name='test_name3', **shared_fields)

    sets = GeneSets([fixture_set, second_set, third_set])
    self.assertIsNotNone(sets)

    # Asking for a single common organism or hierarchy must fail here.
    with self.assertRaises(GeneSetException):
        sets.common_org()
    with self.assertRaises(GeneSetException):
        sets.common_hierarchy()

    self.assertGreater(len(sets.hierarchies()), 1)

    # Splitting by hierarchy must yield fewer groups than there are sets.
    groups = sets.split_by_hierarchy()
    self.assertLess(len(groups), len(sets))
def test_gmt_file_format(self):
    # Round-trip a single gene set through the GMT file format and check
    # that hierarchy and organism survive the write/read cycle.
    gs = GeneSet(
        gs_id=self.test_gs_id,
        name=self.test_name,
        genes=self.test_genes,
        hierarchy=self.test_hierarchy,
        organism=self.test_organism,
        link='',
    )

    fd, file_name = mkstemp()
    # Only the path is used below; close the descriptor right away so it
    # cannot leak when an assertion fails.
    os.close(fd)

    try:
        # write to file
        write_sets = GeneSets([gs])
        write_sets.to_gmt_file_format(file_name)

        with open(file_name, 'r') as temp_f:
            line = temp_f.readline()
        columns = line.strip().split('\t')
        self.assertGreater(len(columns), 0)

        # read from file
        read_sets = GeneSets.from_gmt_file_format(file_name)
        self.assertIsNotNone(read_sets)
        self.assertGreater(len(read_sets), 0)

        self.assertEqual(read_sets.common_hierarchy(), self.test_hierarchy)
        self.assertEqual(read_sets.common_org(), self.test_organism)
    finally:
        # clean-up runs even when one of the assertions above fails
        os.remove(file_name)
def dicty_mutant_gene_sets(tax_id: str):
    """Write Dictybase mutant-phenotype gene sets as GMT files.

    Only tax id '44689' is handled; any other value is a no-op. Commas in
    phenotype names are replaced by spaces. For each phenotype, the first
    gene of every mutant is matched with ``GeneMatcher`` and the gene ids
    found are stored as strings. The sets are written under
    ``{data_path}/gene_sets/``, one file per hierarchy.
    """
    if tax_id != '44689':
        return

    matcher = GeneMatcher('44689')
    phenotype_sets = []

    for raw_phenotype, mutants in phenotypes.phenotype_mutants().items():
        phenotype = raw_phenotype.replace(",", " ")
        matcher.genes = [phenotypes.mutant_genes(mutant)[0] for mutant in mutants]

        gene_ids = {
            str(gene.gene_id)
            for gene in matcher.genes
            if gene.gene_id is not None
        }

        phenotype_sets.append(
            GeneSet(
                gs_id=phenotype,
                name=phenotype,
                genes=gene_ids,
                hierarchy=('Dictybase', 'Phenotypes'),
                organism=tax_id,
                link='',
            )
        )

    for group in GeneSets(phenotype_sets).split_by_hierarchy():
        group_hierarchy = group.common_hierarchy()
        group.to_gmt_file_format(
            f'{data_path}/gene_sets/{filename(group_hierarchy, tax_id)}'
        )
def load_gene_sets(self, tax_id):
    # type: (str) -> None
    """Reload all gene sets available for ``tax_id`` into this widget.

    Resets the stored ``GeneSets`` object, clears the widget, rebuilds the
    hierarchy model from the listing returned by ``list_all`` and then
    loads every listed gene set into ``self.gs_object`` before restoring
    the selected hierarchies.
    """
    self.gs_object = GeneSets()
    self.clear()

    available = list_all(organism=tax_id)
    self.set_hierarchy_model(
        self.hierarchy_tree_widget,
        self.hierarchy_tree(available),
    )

    for identifier in available:
        # Calls the module-level ``load_gene_sets`` function, not this method.
        loaded = load_gene_sets(identifier, tax_id)
        self.gs_object.update(list(loaded))

    self.set_selected_hierarchies()
def __init__(self, box, parent, settings_var, **kwargs):
    # type: (Union[QGroupBox, QWidget], QWidget, str) -> None
    """Create the hierarchy tree and add it to the layout of ``box``.

    ``parent`` and ``settings_var`` are stored on the instance (the latter
    as ``stored_selection``); remaining keyword arguments are forwarded to
    the base class initialiser.
    """
    super().__init__(**kwargs)

    # NOTE(review): if this class derives from QWidget, this attribute
    # shadows the inherited ``parent()`` method - confirm this is intended.
    self.parent = parent
    self.stored_selection = settings_var

    # gene sets object
    self.gs_object = GeneSets()  # type: GeneSets

    # Tree showing the gene set hierarchies: no header, not editable.
    tree = QTreeWidget(self)
    tree.setHeaderHidden(True)
    tree.setEditTriggers(QTreeView.NoEditTriggers)
    self.hierarchy_tree_widget = tree
    box.layout().addWidget(tree)

    self.custom_set_hier = None
    self.default_selection = [
        ('GO', 'molecular_function'),
        ('GO', 'biological_process'),
        ('GO', 'cellular_component'),
    ]
def reactome_gene_sets(tax_id: str) -> None:
    """Prepare human pathway gene sets from Reactome and write GMT files.

    Only tax id '9606' is handled; any other value is a no-op. The zipped
    GMT file is downloaded, its single member is read, and every
    tab-separated line becomes one gene set: column 0 is the pathway name,
    column 1 the pathway id, and the remaining columns are gene symbols
    that are matched with ``GeneMatcher``. Commas in the name and id are
    replaced by spaces. The sets are written under
    ``{data_path}/gene_sets/``, one file per hierarchy.
    """
    if tax_id != '9606':
        return

    download_link = 'http://www.reactome.org/download/current/ReactomePathways.gmt.zip'
    file_name = 'ReactomePathways.gmt'
    detail_link = 'https://reactome.org/content/detail/{}'

    gene_matcher = GeneMatcher('9606')

    with urlopen(download_link) as url:
        memfile = io.BytesIO(url.read())

    with ZipFile(memfile, 'r') as myzip:
        # Open the member as a context manager so its handle is closed
        # deterministically instead of being left to garbage collection.
        with myzip.open(file_name) as f:
            content = f.read().decode().splitlines()

    genesets = []
    for path in content:
        # Split each line once and reuse the columns.
        columns = path.split('\t')

        # The slice is already an empty list when the line has no symbols.
        gene_matcher.genes = columns[2:]
        genes = {
            str(gene.gene_id)
            for gene in gene_matcher.genes
            if gene.gene_id is not None
        }

        pathway = columns[0].replace(',', ' ')
        pathway_id = columns[1].replace(',', ' ')

        genesets.append(
            GeneSet(
                gs_id=pathway_id,
                name=pathway,
                genes=genes,
                hierarchy=('Reactome', 'pathways'),
                organism='9606',
                link=detail_link.format(pathway_id),
            )
        )

    for gs_group in GeneSets(genesets).split_by_hierarchy():
        hierarchy = gs_group.common_hierarchy()
        gs_group.to_gmt_file_format(
            f'{data_path}/gene_sets/{filename(hierarchy, tax_id)}'
        )
def go_gene_sets(org):
    """Return gene sets built from GO terms annotated for ``org``.

    One gene set is created for every ontology term that has at least one
    annotated gene; its hierarchy is ``('GO', <term namespace>)``.
    """
    ontology = go.Ontology()
    annotations = go.Annotations(org, ontology=ontology)

    term_sets = []
    for term_id, term in ontology.terms.items():
        term_genes = annotations.get_genes_by_go_term(term_id)
        term_hierarchy = ('GO', term.namespace)
        if len(term_genes) == 0:
            # Terms without any annotated gene do not produce a gene set.
            continue
        term_sets.append(
            GeneSet(
                gs_id=term_id,
                name=term.name,
                genes=term_genes,
                hierarchy=term_hierarchy,
                organism=org,
                link=GO_TERM_LINK.format(term_id),
            )
        )

    return GeneSets(term_sets)
def kegg_gene_sets(tax_id: str) -> None:
    """Build gene sets from the KEGG pathways of ``tax_id`` and write GMT files.

    The cache is cleared first. Every pathway for which
    ``pathway_attributes()`` is truthy becomes one gene set whose genes are
    the pathway's KEGG gene names translated through the KEGG-to-NCBI
    mapper. The sets are written under ``{data_path}/gene_sets/``, one file
    per hierarchy.
    """
    caching.clear_cache()
    kegg_org = kegg.KEGGOrganism(taxonomy.name(tax_id))
    ncbi_id_mapper = kegg_org.kegg_to_ncbi_mapper()
    pathway_hierarchy = ('KEGG', 'Pathways')

    pathway_sets = []
    for pathway_id in kegg_org.pathways():
        pathway = kegg.KEGGPathway(pathway_id)
        if not pathway.pathway_attributes():
            continue

        ncbi_ids = set()
        for kegg_name in kegg_org.get_genes_by_pathway(pathway_id):
            try:
                ncbi_ids.add(ncbi_id_mapper[kegg_name.upper()])
            except KeyError:
                # Some KEGG names have no entry in the mapper; those genes
                # are left out of the gene set.
                continue

        pathway_sets.append(
            GeneSet(
                gs_id=pathway_id,
                name=pathway.title,
                genes=ncbi_ids,
                hierarchy=pathway_hierarchy,
                organism=tax_id,
                link=pathway.link,
            )
        )

    for group in GeneSets(pathway_sets).split_by_hierarchy():
        group_hierarchy = group.common_hierarchy()
        group.to_gmt_file_format(
            f'{data_path}/gene_sets/{filename(group_hierarchy, tax_id)}'
        )
def gene_marker_sets():
    """Write marker-gene gene sets from the Panglao and CellMarker tables.

    For each marker file under ``{data_path}/marker_genes/`` the rows are
    grouped by organism (Human -> '9606', Mouse -> '10090') and by cell
    type. Every cell type becomes one gene set of its Entrez ids, skipping
    the placeholder '?'. The sets are written under
    ``{data_path}/gene_sets/``, one file per hierarchy and organism.
    """
    source_by_file = {
        'panglao_gene_markers.tab': 'Panglao',
        'cellMarker_gene_markers.tab': 'CellMarker',
    }
    tax_by_organism = {'Human': '9606', 'Mouse': '10090'}

    for file_name, source in source_by_file.items():
        # tax id -> cell type -> list of gene ids, rebuilt for every file
        grouped = {'9606': defaultdict(list), '10090': defaultdict(list)}

        for row in Table(f'{data_path}/marker_genes/{file_name}'):
            row_tax_id = tax_by_organism[row['Organism']]
            grouped[row_tax_id][row['Cell Type']].append(row['Entrez ID'])

        for tax_id, cell_types in grouped.items():
            marker_sets = [
                GeneSet(
                    gs_id=str(cell_type),
                    name=str(cell_type),
                    genes={str(gene) for gene in genes if gene != '?'},
                    hierarchy=('Marker Genes', source),
                    organism=tax_id,
                    link='',
                )
                for cell_type, genes in cell_types.items()
            ]

            for group in GeneSets(marker_sets).split_by_hierarchy():
                group_hierarchy = group.common_hierarchy()
                group.to_gmt_file_format(
                    f'{data_path}/gene_sets/{filename(group_hierarchy, tax_id)}'
                )
def clear_gene_sets(self):
    """Discard the loaded gene sets by installing a fresh ``GeneSets``."""
    self.gs_object = GeneSets()