def download_from_NCBI(cls, file=None):
    """
    Download ``homologene.data`` from the NCBI FTP server.

    :param file: Destination of the download. If `None`, the data is
        written to the local "HomoloGene" server files directory (created
        if missing). If a string, it is used as the path of the output
        file. Anything else is used as an already open, writable
        file-like object.

    .. note:: The destination is always closed when the copy finishes,
        including a file object supplied by the caller.
    """
    data = urllib2.urlopen(
        "ftp://ftp.ncbi.nlm.nih.gov/pub/HomoloGene/current/homologene.data")
    try:
        if file is None:
            try:
                os.mkdir(orngServerFiles.localpath("HomoloGene"))
            except OSError:
                # Best effort: typically the directory exists already. Any
                # other problem surfaces from the open() call below.
                pass
            file = open(
                orngServerFiles.localpath("HomoloGene", "homologene.data"),
                "wb")
        elif isinstance(file, basestring):
            file = open(file, "wb")

        try:
            shutil.copyfileobj(data, file)
        finally:
            # Close the destination even when the transfer fails part way.
            file.close()
    finally:
        # The response object is not closed automatically.
        data.close()
def load(self):
    """
    Load the pickled gene info from the "EnsembleGeneInfo" server files
    domain, rebuilding it locally when that fails.

    The file named by ``self.filename()`` is fetched with
    ``orngServerFiles.localpath_download`` and unpickled. If anything in
    that step raises, the info is recreated with ``self.create_info()``
    and pickled to the local "EnsembleGeneInfo" directory instead.
    """
    import cPickle

    directory = orngServerFiles.localpath("EnsembleGeneInfo")
    if not os.path.exists(directory):
        os.makedirs(directory)

    try:
        filename = orngServerFiles.localpath_download(
            "EnsembleGeneInfo", self.filename())
        # NOTE(review): unpickling runs arbitrary code from the file; this
        # trusts whatever localpath_download fetched.
        with open(filename, "rb") as stream:
            info = cPickle.load(stream)
    except Exception:
        # Deliberate best-effort fallback: any download or unpickling
        # failure triggers a local rebuild.
        filename = orngServerFiles.localpath(
            "EnsembleGeneInfo", self.filename())
        info = self.create_info()
        with open(filename, "wb") as stream:
            cPickle.dump(info, stream)

    # NOTE(review): `info` is neither stored nor returned in the code
    # visible here -- confirm whether a trailing statement is missing.
def load(self):
    """
    Load the pickled gene info from the "EnsembleGeneInfo" server files
    domain, rebuilding it locally when that fails.

    The file named by ``self.filename()`` is fetched with
    ``orngServerFiles.localpath_download`` and unpickled. If anything in
    that step raises, the info is recreated with ``self.create_info()``
    and pickled to the local "EnsembleGeneInfo" directory instead.
    """
    import cPickle

    directory = orngServerFiles.localpath("EnsembleGeneInfo")
    if not os.path.exists(directory):
        os.makedirs(directory)

    try:
        filename = orngServerFiles.localpath_download(
            "EnsembleGeneInfo", self.filename())
        # NOTE(review): unpickling runs arbitrary code from the file; this
        # trusts whatever localpath_download fetched.
        with open(filename, "rb") as stream:
            info = cPickle.load(stream)
    except Exception:
        # Deliberate best-effort fallback: any download or unpickling
        # failure triggers a local rebuild.
        filename = orngServerFiles.localpath(
            "EnsembleGeneInfo", self.filename())
        info = self.create_info()
        with open(filename, "wb") as stream:
            cPickle.dump(info, stream)

    # NOTE(review): `info` is neither stored nor returned in the code
    # visible here -- confirm whether a trailing statement is missing.
def download_from_NCBI(cls, file=None):
    """
    Download ``homologene.data`` from the NCBI FTP server.

    :param file: Destination of the download. If `None`, the data is
        written to the local "HomoloGene" server files directory (created
        if missing). If a string, it is used as the path of the output
        file. Anything else is used as an already open, writable
        file-like object.

    .. note:: The destination is always closed when the copy finishes,
        including a file object supplied by the caller.
    """
    data = urllib2.urlopen(
        "ftp://ftp.ncbi.nlm.nih.gov/pub/HomoloGene/current/homologene.data")
    try:
        if file is None:
            try:
                os.mkdir(orngServerFiles.localpath("HomoloGene"))
            except OSError:
                # Best effort: typically the directory exists already. Any
                # other problem surfaces from the open() call below.
                pass
            file = open(
                orngServerFiles.localpath("HomoloGene", "homologene.data"),
                "wb")
        elif isinstance(file, basestring):
            file = open(file, "wb")

        try:
            shutil.copyfileobj(data, file)
        finally:
            # Close the destination even when the transfer fails part way.
            file.close()
    finally:
        # The response object is not closed automatically.
        data.close()
def download_data(cls, taxid=None, progress_callback=None):
    """
    Download the data for ``taxid`` from the GeneMANIA website
    and initialize the local database.

    For every taxonomy id two archives are fetched into the local "PPI"
    directory and unpacked there: ``networks/<name>.tgz`` and
    ``precombined/<name>.COMBINED.tgz``, where ``<name>`` is the organism
    name with spaces replaced by underscores. ``cls.init_db`` is then
    called for that id.

    :param taxid: A single taxonomy id or a list/tuple of them. If `None`,
        ``cls.common_taxids()`` is used.
    :param progress_callback: Passed to ``wget`` as its ``progress``
        argument; when `None`, ``True`` is passed instead.
    """
    import tarfile

    baseurl = "http://genemania.org/data/current/"
    directory = orngServerFiles.localpath("PPI")
    if not os.path.exists(directory):
        os.makedirs(directory)

    if taxid is None:
        taxid = cls.common_taxids()

    if isinstance(taxid, (list, tuple)):
        taxids = taxid
    else:
        taxids = [taxid]

    progress = True if progress_callback is None else progress_callback

    # (remote sub-directory, archive suffix), in download order.
    archives = (("networks/", ".tgz"),
                ("precombined/", ".COMBINED.tgz"))

    for tid in taxids:
        name = obiTaxonomy.name(tid).replace(" ", "_")

        for subdir, suffix in archives:
            filename = name + suffix
            wget(baseurl + subdir + filename,
                 directory=directory, progress=progress)

            # NOTE(review): extractall() trusts the member paths of an
            # archive fetched over plain HTTP; a crafted archive could
            # write outside `directory`.
            with tarfile.open(os.path.join(directory, filename)) as tgz:
                tgz.extractall(directory)

        cls.init_db([tid])
def list_serverfiles_from_flist(flist):
    """
    Describe every gene set file that is listed in ``flist`` or present in
    the local ``sfdomain`` directory.

    Each entry of the returned list is ``filename_parse(name)`` extended
    with one boolean: `True` when the file exists locally, `False`
    otherwise.
    """
    remote_names = set([fn for fn in flist if is_genesets_file(fn)])

    local_dir = orngServerFiles.localpath(sfdomain)
    local_names = set(
        [fn for fn in os.listdir(local_dir) if is_genesets_file(fn)])

    described = []
    for fn in remote_names | local_names:
        described.append(filename_parse(fn) + (fn in local_names,))
    return described
def init_db(cls, taxid=None):
    """
    Init the local data base.

    Rebuild the SQLite database for ``taxid`` from the text files found
    under the local "PPI" directory. The tables and indices created here
    are dropped first and then repopulated from ``networks.txt``,
    ``identifier_mappings.txt``, the individual network files listed in
    ``networks.txt`` and the precombined ``BP_COMBINING`` network.

    :param taxid: A taxonomy id string, or a list/tuple of ids (each one
        is initialized in turn). If `None`, ``cls.common_taxids()`` is
        used.
    :raises ValueError: If ``taxid`` is neither a string nor a
        list/tuple.
    """
    from contextlib import closing
    from functools import partial

    directory = orngServerFiles.localpath("PPI")
    pjoin = partial(os.path.join, directory)

    if taxid is None:
        taxid = cls.common_taxids()

    if isinstance(taxid, (list, tuple)):
        for tid in taxid:
            cls.init_db(tid)
        return

    if not isinstance(taxid, basestring):
        raise ValueError("wrong taxid")

    name = obiTaxonomy.name(taxid).replace(" ", "_")

    with open(pjoin(name, "networks.txt")) as networks_file:
        networks = csv.reader(networks_file, delimiter="\t")
        next(networks)  # Header
        networks = list(networks)

    database = pjoin(cls.SERVER_FILE.format(taxid=taxid))

    # The connection's own context manager only commits or rolls back the
    # transaction; `closing` is what actually closes the connection.
    with closing(sqlite3.connect(database)) as con, con:
        con.execute("""DROP TABLE IF EXISTS genes""")
        con.execute("""DROP TABLE IF EXISTS synonyms""")
        con.execute("""DROP TABLE IF EXISTS source""")
        con.execute("""DROP TABLE IF EXISTS links""")
        con.execute("""DROP TABLE IF EXISTS networks""")

        con.execute("""DROP INDEX IF EXISTS genes_index""")
        con.execute("""DROP INDEX IF EXISTS links_index_a""")
        con.execute("""DROP INDEX IF EXISTS links_index_b""")

        con.execute("""\
            CREATE TABLE networks
                (network_id INTEGER PRIMARY KEY ASC AUTOINCREMENT,
                 network_name TEXT,
                 network_group TEXT,
                 source TEXT,
                 pubmed_id TEXT
                )""")

        # The table stores name before group; columns 2 and 1 of each
        # networks.txt row are swapped to match.
        con.executemany("""\
            INSERT INTO networks
            VALUES (?, ?, ?, ?, ?)""",
            [(i, r[2], r[1], r[3], r[4])
             for i, r in enumerate(networks)])

        con.execute("""\
            CREATE TABLE genes
                (internal_id INTEGER PRIMARY KEY ASC AUTOINCREMENT,
                 gene_name TEXT
                )""")

        with open(pjoin(name, "identifier_mappings.txt"), "rb") as id_file:
            identifiers = csv.reader(id_file, delimiter="\t")
            next(identifiers)  # skip header
            identifiers = list(identifiers)

        genes = sorted(set(r[0] for r in identifiers))
        sources = sorted(set(r[2] for r in identifiers))

        con.executemany("""\
            INSERT INTO genes
            VALUES (?, ?)""", enumerate(genes))

        con.execute("""\
            CREATE TABLE source
                (source_id INTEGER PRIMARY KEY ASC AUTOINCREMENT,
                 source_name TEXT
                )""")

        con.executemany("""\
            INSERT INTO source
            VALUES (?, ?)""", enumerate(sources))

        con.execute("""\
            CREATE TABLE synonyms
                (internal_id INTEGER REFERENCES genes (internal_id),
                 synonym TEXT,
                 source_id INT REFERENCES source (source_id)
                )""")

        # Row ids are the positions in the sorted lists inserted above.
        gene_to_id = dict((g, i) for i, g in enumerate(genes))
        source_to_id = dict((s, i) for i, s in enumerate(sources))

        con.executemany("""\
            INSERT INTO synonyms
            VALUES (?, ?, ?)""",
            [(gene_to_id[r[0]], r[1], source_to_id[r[2]])
             for r in identifiers])

        con.execute("""\
            CREATE TABLE links
                (gene_a INTEGER REFERENCES genes (internal_id),
                 gene_b INTEGER REFERENCES genes (internal_id),
                 network_id INTEGER REFERENCES networks (network_id),
                 weight REAL
                 -- , PRIMARY KEY (gene_a, gene_b, network_id)
                )""")

        # Each networks.txt row must have exactly five columns; the first
        # one names the file holding that network's interactions.
        for i, (filename, _, _, _, _) in enumerate(networks):
            with open(pjoin(name, filename), "rb") as nf:
                interactions = csv.reader(nf, delimiter="\t")
                next(interactions)  # skip header
                # Stream the rows rather than building a list per network.
                con.executemany("""\
                    INSERT INTO links
                    VALUES (?, ?, ?, ?)""",
                    ((gene_to_id[r[0]], gene_to_id[r[1]], i, float(r[2]))
                     for r in interactions))

        # Add special combined network entry
        combined_id = len(networks)
        con.execute("""\
            INSERT INTO networks
            VALUES (?, ?, ?, ?, ?)""",
            (combined_id, "BP_COMBINING", "COMBINED", "GeneMANIA", ""))

        # Add the combined network links.
        combined_path = pjoin(
            name + ".COMBINED",
            "COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt")
        with open(combined_path, "rb") as combined_file:
            combined = csv.reader(combined_file, delimiter="\t")
            next(combined)  # skip header
            con.executemany("""\
                INSERT INTO links
                VALUES (?, ?, ?, ?)""",
                ((gene_to_id[r[0]], gene_to_id[r[1]],
                  combined_id, float(r[2]))
                 for r in combined))

        con.execute("""
            CREATE VIEW IF NOT EXISTS links_annotated
            AS SELECT genes1.gene_name AS gene_name_a,
                      genes2.gene_name AS gene_name_b,
                      links.weight,
                      networks.network_name,
                      networks.network_group,
                      networks.source,
                      networks.pubmed_id
               FROM  genes AS genes1
                     JOIN links
                           ON genes1.internal_id=links.gene_a
                     JOIN genes AS genes2
                           ON links.gene_b=genes2.internal_id
                     JOIN networks
                           ON links.network_id=networks.network_id
            """)

        con.execute("""\
            CREATE INDEX IF NOT EXISTS genes_index ON genes (gene_name)
            """)
        con.execute("""\
            CREATE INDEX IF NOT EXISTS links_index_a ON links (gene_a)
            """)
        con.execute("""\
            CREATE INDEX IF NOT EXISTS links_index_b ON links (gene_b)
            """)
from __future__ import absolute_import

import os, time

from Orange.orng import orngServerFiles

from .. import taxonomy as obiTaxonomy
from .. import kegg as obiKEGG
from .. import dicty as obiDicty
from .. import biomart as obiBioMart

from . import homology

# Value of orngServerFiles.localpath for the "NCBI_geneinfo" domain;
# presumably the default local directory for the gene info files -- confirm.
default_database_path = orngServerFiles.localpath("NCBI_geneinfo")


class GeneInfo(object):
    """ An object representing the NCBI information for a gene.
    """

    # Names of the per-gene fields. NOTE(review): presumably listed in the
    # column order of an NCBI gene_info record -- confirm against the parser.
    NCBI_GENEINFO_TAGS = ("tax_id", "gene_id", "symbol", "locus_tag",
                          "synonyms", "dbXrefs", "chromosome",
                          "map_location", "description", "type",
                          "symbol_from_nomenclature_authority",
                          "full_name_from_nomenclature_authority",
                          "nomenclature_status", "other_designations",
                          "modification_date")
    # Subset of the tags above. NOTE(review): presumably the fields that can
    # hold several values per gene -- confirm where they are split.
    NCBI_MULTIPLE_CARDINALITY_TAGS = ("synonyms", "dbXrefs",
                                      "other_designations")
def init_db(cls, taxid=None):
    """
    Init the local data base.

    Rebuild the SQLite database for ``taxid`` from the text files found
    under the local "PPI" directory. The tables and indices created here
    are dropped first and then repopulated from ``networks.txt``,
    ``identifier_mappings.txt``, the individual network files listed in
    ``networks.txt`` and the precombined ``BP_COMBINING`` network.

    :param taxid: A taxonomy id string, or a list/tuple of ids (each one
        is initialized in turn). If `None`, ``cls.common_taxids()`` is
        used.
    :raises ValueError: If ``taxid`` is neither a string nor a
        list/tuple.
    """
    from contextlib import closing
    from functools import partial

    directory = orngServerFiles.localpath("PPI")
    pjoin = partial(os.path.join, directory)

    if taxid is None:
        taxid = cls.common_taxids()

    if isinstance(taxid, (list, tuple)):
        for tid in taxid:
            cls.init_db(tid)
        return

    if not isinstance(taxid, basestring):
        raise ValueError("wrong taxid")

    name = obiTaxonomy.name(taxid).replace(" ", "_")

    with open(pjoin(name, "networks.txt")) as networks_file:
        networks = csv.reader(networks_file, delimiter="\t")
        next(networks)  # Header
        networks = list(networks)

    database = pjoin(cls.SERVER_FILE.format(taxid=taxid))

    # The connection's own context manager only commits or rolls back the
    # transaction; `closing` is what actually closes the connection.
    with closing(sqlite3.connect(database)) as con, con:
        con.execute("""DROP TABLE IF EXISTS genes""")
        con.execute("""DROP TABLE IF EXISTS synonyms""")
        con.execute("""DROP TABLE IF EXISTS source""")
        con.execute("""DROP TABLE IF EXISTS links""")
        con.execute("""DROP TABLE IF EXISTS networks""")

        con.execute("""DROP INDEX IF EXISTS genes_index""")
        con.execute("""DROP INDEX IF EXISTS links_index_a""")
        con.execute("""DROP INDEX IF EXISTS links_index_b""")

        con.execute("""\
            CREATE TABLE networks
                (network_id INTEGER PRIMARY KEY ASC AUTOINCREMENT,
                 network_name TEXT,
                 network_group TEXT,
                 source TEXT,
                 pubmed_id TEXT
                )""")

        # The table stores name before group; columns 2 and 1 of each
        # networks.txt row are swapped to match.
        con.executemany("""\
            INSERT INTO networks
            VALUES (?, ?, ?, ?, ?)""",
            [(i, r[2], r[1], r[3], r[4])
             for i, r in enumerate(networks)])

        con.execute("""\
            CREATE TABLE genes
                (internal_id INTEGER PRIMARY KEY ASC AUTOINCREMENT,
                 gene_name TEXT
                )""")

        with open(pjoin(name, "identifier_mappings.txt"), "rb") as id_file:
            identifiers = csv.reader(id_file, delimiter="\t")
            next(identifiers)  # skip header
            identifiers = list(identifiers)

        genes = sorted(set(r[0] for r in identifiers))
        sources = sorted(set(r[2] for r in identifiers))

        con.executemany("""\
            INSERT INTO genes
            VALUES (?, ?)""", enumerate(genes))

        con.execute("""\
            CREATE TABLE source
                (source_id INTEGER PRIMARY KEY ASC AUTOINCREMENT,
                 source_name TEXT
                )""")

        con.executemany("""\
            INSERT INTO source
            VALUES (?, ?)""", enumerate(sources))

        con.execute("""\
            CREATE TABLE synonyms
                (internal_id INTEGER REFERENCES genes (internal_id),
                 synonym TEXT,
                 source_id INT REFERENCES source (source_id)
                )""")

        # Row ids are the positions in the sorted lists inserted above.
        gene_to_id = dict((g, i) for i, g in enumerate(genes))
        source_to_id = dict((s, i) for i, s in enumerate(sources))

        con.executemany("""\
            INSERT INTO synonyms
            VALUES (?, ?, ?)""",
            [(gene_to_id[r[0]], r[1], source_to_id[r[2]])
             for r in identifiers])

        con.execute("""\
            CREATE TABLE links
                (gene_a INTEGER REFERENCES genes (internal_id),
                 gene_b INTEGER REFERENCES genes (internal_id),
                 network_id INTEGER REFERENCES networks (network_id),
                 weight REAL
                 -- , PRIMARY KEY (gene_a, gene_b, network_id)
                )""")

        # Each networks.txt row must have exactly five columns; the first
        # one names the file holding that network's interactions.
        for i, (filename, _, _, _, _) in enumerate(networks):
            with open(pjoin(name, filename), "rb") as nf:
                interactions = csv.reader(nf, delimiter="\t")
                next(interactions)  # skip header
                # Stream the rows rather than building a list per network.
                con.executemany("""\
                    INSERT INTO links
                    VALUES (?, ?, ?, ?)""",
                    ((gene_to_id[r[0]], gene_to_id[r[1]], i, float(r[2]))
                     for r in interactions))

        # Add special combined network entry
        combined_id = len(networks)
        con.execute("""\
            INSERT INTO networks
            VALUES (?, ?, ?, ?, ?)""",
            (combined_id, "BP_COMBINING", "COMBINED", "GeneMANIA", ""))

        # Add the combined network links.
        combined_path = pjoin(
            name + ".COMBINED",
            "COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt")
        with open(combined_path, "rb") as combined_file:
            combined = csv.reader(combined_file, delimiter="\t")
            next(combined)  # skip header
            con.executemany("""\
                INSERT INTO links
                VALUES (?, ?, ?, ?)""",
                ((gene_to_id[r[0]], gene_to_id[r[1]],
                  combined_id, float(r[2]))
                 for r in combined))

        con.execute("""
            CREATE VIEW IF NOT EXISTS links_annotated
            AS SELECT genes1.gene_name AS gene_name_a,
                      genes2.gene_name AS gene_name_b,
                      links.weight,
                      networks.network_name,
                      networks.network_group,
                      networks.source,
                      networks.pubmed_id
               FROM  genes AS genes1
                     JOIN links
                           ON genes1.internal_id=links.gene_a
                     JOIN genes AS genes2
                           ON links.gene_b=genes2.internal_id
                     JOIN networks
                           ON links.network_id=networks.network_id
            """)

        con.execute("""\
            CREATE INDEX IF NOT EXISTS genes_index ON genes (gene_name)
            """)
        con.execute("""\
            CREATE INDEX IF NOT EXISTS links_index_a ON links (gene_a)
            """)
        con.execute("""\
            CREATE INDEX IF NOT EXISTS links_index_b ON links (gene_b)
            """)
class DictyMutants(object):
    """
    A collection of Dictybase mutants as a dictionary of
    :obj:`DictyMutant` objects.

    :param local_database_path: A path for storing D. dictyostelium
        mutants objects. If `None` then a default database path is used.
    """

    VERSION = 1
    # use a default local folder for storing the genesets
    DEFAULT_DATABASE_PATH = orngServerFiles.localpath("DictyMutants")

    def __init__(self, local_database_path=None):
        if local_database_path is None:
            local_database_path = self.DEFAULT_DATABASE_PATH
        self.local_database_path = local_database_path

        if not os.path.exists(self.local_database_path):
            os.mkdir(self.local_database_path)

        # NOTE(review): the pickle is located through the module-level
        # `domain` and `pickle_file` names, not `local_database_path`.
        # Unpickling runs arbitrary code from the downloaded file.
        with open(localpath_download(domain, pickle_file), "rb") as stream:
            self._mutants = pickle.load(stream)

    def update_file(self, name):
        """
        Download the Dictybase file ``name`` into the local database
        directory and return the path of the downloaded file.

        The data is first written to ``<name>_temp`` and renamed once the
        transfer has completed.
        """
        url = "http://dictybase.org/db/cgi-bin/dictyBase/download/download.pl?area=mutant_phenotypes&ID="
        filename = os.path.join(self.local_database_path, name)
        temp_file = os.path.join(self.local_database_path, name + "_temp")
        stream = urllib2.urlopen(url + name)
        try:
            with open(temp_file, "wb") as out:
                shutil.copyfileobj(stream, out)
        finally:
            # The response object is not closed automatically.
            stream.close()

        os.rename(temp_file, filename)
        return filename

    def load_mutants(self, file):
        """
        Return the lines of the file at path ``file`` as a list.
        """
        with open(file) as data:
            return data.read().splitlines()

    def download_mutants(self):
        """
        Download all mutant files and return a dictionary mapping every
        :obj:`DictyMutant` built from ``all-mutants.txt`` to itself.

        A mutant whose name also appears in one of the category files gets
        the matching attribute (``null``, ``overexp``, ``multiple``,
        ``develop``, ``other``) set to `True`.
        """
        # (attribute set on the mutant, file listing that category)
        categories = (
            ("null", "null-mutants.txt"),
            ("overexp", "overexpression-mutants.txt"),
            ("multiple", "multiple-mutants.txt"),
            ("develop", "developmental-mutants.txt"),
            ("other", "other-mutants.txt"),
        )

        # Fetch every file before parsing any of them.
        all_lines = self.load_mutants(self.update_file("all-mutants.txt"))
        category_lines = [
            (attr, self.load_mutants(self.update_file(filename)))
            for attr, filename in categories
        ]

        mutants = [DictyMutant(line) for line in all_lines]
        category_names = [
            (attr, set(DictyMutant(line).name for line in lines))
            for attr, lines in category_lines
        ]

        for mutant in mutants:
            for attr, names in category_names:
                if mutant.name in names:
                    setattr(mutant, attr, True)

        return {mutant: mutant for mutant in mutants}

    def pickle_data(self):
        """
        Return the freshly downloaded mutants pickled with the highest
        available protocol.
        """
        return pickle.dumps(self.download_mutants(), -1)

    @classmethod
    def get_instance(cls):
        """
        Return an instance whose ``__dict__`` is shared with every other
        instance obtained from this method; the data is loaded once, on
        the first call.
        """
        if not hasattr(cls, "_shared_dict"):
            dicty = cls()
            cls._shared_dict = dicty.__dict__
        instance = cls.__new__(cls)
        instance.__dict__ = cls._shared_dict
        return instance

    def mutants(self):
        """Return the keys of the loaded mutants dictionary."""
        return self._mutants.keys()

    def genes(self):
        """Return the sorted, unique genes over all mutants."""
        return sorted({gene
                       for mutant in self.mutants()
                       for gene in self.mutant_genes(mutant)})

    def phenotypes(self):
        """Return the sorted, unique phenotypes over all mutants."""
        return sorted({phenotype
                       for mutant in self.mutants()
                       for phenotype in self.mutant_phenotypes(mutant)})

    def mutant_genes(self, mutant):
        """Return the ``genes`` attribute of the entry for ``mutant``."""
        return self._mutants[mutant].genes

    def mutant_phenotypes(self, mutant):
        """Return the ``phenotypes`` attribute of the entry for ``mutant``."""
        return self._mutants[mutant].phenotypes

    def gene_mutants(self):
        """Return a ``defaultdict`` mapping each gene to its set of mutants."""
        dgm = defaultdict(set)
        for mutant in self.mutants():
            for gene in self.mutant_genes(mutant):
                dgm[gene].add(mutant)
        return dgm

    def phenotype_mutants(self):
        """
        Return a ``defaultdict`` mapping each phenotype to its set of
        mutants.
        """
        dpm = defaultdict(set)
        for mutant in self.mutants():
            for phenotype in self.mutant_phenotypes(mutant):
                dpm[phenotype].add(mutant)
        return dpm
class HomoloGene(_Homologs):
    """
    Homologs read from the NCBI HomoloGene data file.

    :param local_database_path: Stored on the instance; falls back to
        ``DEFAULT_DATABASE_PATH`` when empty or `None`.
    """

    DEFAULT_DATABASE_PATH = orngServerFiles.localpath("HomoloGene")
    VERSION = 1
    DOMAIN = "HomoloGene"
    FILENAME = "homologene.data"

    def __init__(self, local_database_path=None):
        self.local_database_path = (local_database_path
                                    if local_database_path
                                    else self.DEFAULT_DATABASE_PATH)
        self.load()

    @classmethod
    def download_from_NCBI(cls, file=None):
        """
        Download ``homologene.data`` from the NCBI FTP server.

        :param file: Destination of the download. If `None`, the data is
            written to the local "HomoloGene" server files directory
            (created if missing). If a string, it is used as the path of
            the output file. Anything else is used as an already open,
            writable file-like object.

        .. note:: The destination is always closed when the copy
            finishes, including a file object supplied by the caller.
        """
        data = urllib2.urlopen(
            "ftp://ftp.ncbi.nlm.nih.gov/pub/HomoloGene/current/homologene.data")
        try:
            if file is None:
                try:
                    os.mkdir(orngServerFiles.localpath("HomoloGene"))
                except OSError:
                    # Best effort: typically the directory exists already.
                    # Other problems surface from the open() call below.
                    pass
                file = open(
                    orngServerFiles.localpath("HomoloGene",
                                              "homologene.data"),
                    "wb")
            elif isinstance(file, basestring):
                file = open(file, "wb")

            try:
                shutil.copyfileobj(data, file)
            finally:
                # Close the destination even when the transfer fails.
                file.close()
        finally:
            # The response object is not closed automatically.
            data.close()

    @classmethod
    def get_instance(cls):
        """
        Return an instance whose ``__dict__`` is shared with every other
        instance obtained from this method; the data is loaded once, on
        the first call.
        """
        if not hasattr(cls, "_shared_dict"):
            h = cls()
            cls._shared_dict = h.__dict__
        h = cls.__new__(cls)
        h.__dict__ = cls._shared_dict
        return h

    def load(self):
        """
        Read the HomoloGene data file and build the lookup tables.

        ``self._homologs`` maps ``(taxonomy_id, gene_symbol)`` to the
        parsed homolog; ``self._homologs_by_group`` maps a group id to the
        list of homologs in that group.
        """
        # NOTE(review): the file is located through DOMAIN/FILENAME, not
        # through self.local_database_path.
        path = orngServerFiles.localpath_download(self.DOMAIN, self.FILENAME)
        with open(path, "rb") as stream:
            # NOTE(review): [:-1] drops the last line of the file --
            # confirm it is never a real record.
            lines = stream.read().splitlines()[:-1]

        parsed = (_homolog(line) for line in lines)
        self._homologs = dict(((h.taxonomy_id, h.gene_symbol), h)
                              for h in parsed)

        # Group the de-duplicated homologs, not the raw lines.
        by_group = defaultdict(list)
        for h in self._homologs.values():
            by_group[h.group_id].append(h)
        self._homologs_by_group = by_group

    def all_genes(self, taxid=None):
        """Return the gene symbols of all homologs with taxonomy ``taxid``."""
        return [homolog.gene_symbol
                for (tid, _), homolog in self._homologs.iteritems()
                if tid == taxid]

    def homologs(self, gene, taxid):
        """
        Return ``(taxonomy_id, gene_symbol)`` pairs for every homolog in
        the same group as ``gene`` of organism ``taxid``.
        """
        group = self._homologs.get((taxid, gene), _homolog("")).group_id
        # .get() keeps a miss from inserting a new key into the defaultdict.
        members = self._homologs_by_group.get(group, [])
        return [(h.taxonomy_id, h.gene_symbol) for h in members]

    def homolog(self, gene, taxid, homolotaxid):
        """
        Return the symbol of the homolog of ``gene`` (organism ``taxid``)
        in organism ``homolotaxid``, or `None` if there is none.
        """
        homologs = dict(self.homologs(gene, taxid))
        return homologs.get(homolotaxid, None)
from __future__ import absolute_import import os, time from Orange.orng import orngServerFiles from .. import taxonomy as obiTaxonomy from .. import kegg as obiKEGG from .. import dicty as obiDicty from .. import biomart as obiBioMart from . import homology default_database_path = orngServerFiles.localpath("NCBI_geneinfo") class GeneInfo(object): """ An object representing the NCBI information for a gene. """ NCBI_GENEINFO_TAGS = ("tax_id", "gene_id", "symbol", "locus_tag", "synonyms", "dbXrefs", "chromosome", "map_location", "description", "type", "symbol_from_nomenclature_authority", "full_name_from_nomenclature_authority", "nomenclature_status", "other_designations", "modification_date") NCBI_MULTIPLE_CARDINALITY_TAGS = ("synonyms", "dbXrefs", "other_designations") __slots__ = NCBI_GENEINFO_TAGS def __init__(self, line): """ Construct the GeneInfo object from a line in the NCBI gene_info file """ line = line.split("\t")