def _download_file(self, url, extract=True):
    """
    Download the file from `url` into a local ArrayExpress repository
    directory, optionally extracting the downloaded archive.
    """
    rest, basename = posixpath.split(url)
    dirname = posixpath.basename(rest)
    repo_dir = serverfiles.localpath("ArrayExpress", dirname)
    try:
        os.makedirs(repo_dir)
    except OSError:
        pass
    stream = urllib2.urlopen(url)
    local_filename = os.path.join(repo_dir, basename)
    # Close the destination file when done instead of leaking the handle.
    with open(local_filename, "wb") as f:
        shutil.copyfileobj(stream, f)

    if extract:
        _, extension = os.path.splitext(local_filename)
        if extension == ".zip":
            import zipfile
            zfile = zipfile.ZipFile(local_filename)
            zfile.extractall(repo_dir)
        elif extension == ".gz":
            import gzip
            # gzip.GzipFile has no `extractall`; decompress the single
            # member next to the downloaded archive instead.
            gzfile = gzip.open(local_filename)
            with open(os.path.splitext(local_filename)[0], "wb") as f:
                shutil.copyfileobj(gzfile, f)
        elif extension == ".tgz":
            import tarfile
            # Use tarfile.open (not the TarFile constructor) so the
            # gzip compression is handled transparently.
            tfile = tarfile.open(local_filename)
            tfile.extractall(repo_dir)
        elif extension == ".txt":
            pass
        else:
            raise ValueError("Unknown extension ('{0}').".format(basename))
def _local_filepath(self, url):
    """
    Return the local file path for `url`.
    """
    rest, basename = posixpath.split(url)
    dirname = posixpath.basename(rest)
    return serverfiles.localpath(
        "ArrayExpress", os.path.join(dirname, basename))
def _cache(name="AtlasGeneResult.shelve"):
    """
    Return an open cache instance (a shelve object).
    """
    if not os.path.exists(serverfiles.localpath("GeneAtlas")):
        try:
            os.makedirs(serverfiles.localpath("GeneAtlas"))
        except OSError:
            pass

    cache = shelve.open(serverfiles.localpath("GeneAtlas", name))
    if cache.get(name + "__CACHE_VERSION__", None) == CACHE_VERSION:
        return cache
    else:
        # Stale cache from an older version; recreate it and tag it
        # with the current version.
        cache.close()
        cache = shelve.open(serverfiles.localpath("GeneAtlas", name), "n")
        cache[name + "__CACHE_VERSION__"] = CACHE_VERSION
        return cache
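# A minimal usage sketch for `_cache`; `fetch_atlas_result` below is a
# hypothetical expensive query standing in for a real network call --
# only the shelve round-trip and `contextlib.closing` are real.
from contextlib import closing

def cached_atlas_result(gene):
    with closing(_cache()) as cache:
        if gene in cache:
            return cache[gene]
        result = fetch_atlas_result(gene)  # hypothetical network call
        cache[gene] = result
        return result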
def updateInfo(self):
    gds_info = self.gds_info
    text = ("%i datasets\n%i datasets cached\n" %
            (len(gds_info),
             len(glob.glob(serverfiles.localpath("GEO") + "/GDS*"))))
    filtered = self.treeWidget.model().rowCount()
    if len(self.gds) != filtered:
        text += ("%i after filtering") % filtered
    self.infoBox.setText(text)
def _updateToolTip(self):
    state_str = self.STATE_STRINGS[self.item.state]
    tooltip = ("State: %s\nTags: %s" %
               (state_str,
                ", ".join(tag for tag in self.item.tags
                          if not tag.startswith("#"))))
    if self.item.state in [CURRENT, OUTDATED, DEPRECATED]:
        tooltip += ("\nFile: %s" %
                    serverfiles.localpath(self.item.domain,
                                          self.item.filename))
    for i in range(1, 4):
        self.setToolTip(i, tooltip)
def download_data(cls, address):
    """
    Pass the address of the latest BIOGRID-ALL release (in tab2 format).
    """
    stream = urllib2.urlopen(address)
    stream = StringIO(stream.read())
    zfile = zipfile.ZipFile(stream)
    # Expecting only one file.
    filename = zfile.namelist()[0]
    filepath = serverfiles.localpath("PPI", "BIOGRID-ALL.tab2")
    mkdir_p(os.path.dirname(filepath))
    with open(filepath, "wb") as f:
        shutil.copyfileobj(zfile.open(filename, "r"), f)
    cls.init_db(filepath)
def __init__(self, taxid=None, database=None, detailed_database=None):
    STRING.__init__(self, taxid, database)
    if taxid is not None and detailed_database is not None:
        raise ValueError("taxid and detailed_database are exclusive")

    db_file = serverfiles.localpath(self.DOMAIN, self.FILENAME)
    if taxid is not None:
        detailed_database = serverfiles.localpath_download(
            self.DOMAIN,
            self.FILENAME_DETAILED.format(taxid=taxid)
        )
    elif detailed_database is None:
        # Backwards compatibility
        detailed_database = serverfiles.localpath_download(
            "PPI", "string-protein-detailed.sqlite")

    self.db_detailed = sqlite3.connect(detailed_database)
    self.db_detailed.execute("ATTACH DATABASE ? as string", (db_file,))
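# A minimal usage sketch; `STRINGDetailed` is an assumed name for the
# class defining this __init__ (a subclass of STRING) -- substitute
# the concrete class from this module.
ppidb = STRINGDetailed(taxid="9606")
rows = ppidb.db_detailed.execute(
    "SELECT * FROM evidence LIMIT 5").fetchall()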
def fgem_to_table(self):
    """
    Retrieve the processed matrix from the ArrayExpress FTP server
    and convert it to a :class:`Orange.data.Table`.
    """
    assert self.fgemdatafiles
    repo_dir = serverfiles.localpath("ArrayExpress", self.accession)
    # Find the file listing the data matrix files (it should be in the
    # sdrf, but sometimes it is only in the two-column file).
    sdrf = self._search_files("sdrf", "txt")
    if sdrf:
        sdrf = SampleDataRelationship(self._open(sdrf[0].get("url")))
        if "Derived Array Data Matrix File" not in sdrf.header:
            twocol = self._search_files("twocolumn", "txt")
            if twocol:
                sdrf = SampleDataRelationship(
                    self._open(twocol[0].get("url")))
    matrix_file = self._search_files("fgem")[0]
    self._open(matrix_file.get("url"))  # Download if not cached
    idf_file = self._search_files("idf", "txt")[0]
    self._open(idf_file.get("url"))  # Download if not cached
    return mage_tab_to_orange(os.path.join(repo_dir, idf_file.get("name")))
def _updateToolTip(self):
    state_str = self.STATE_STRINGS[self.item.state]
    try:
        diff_date = self.item.latest - self.item.local
    except Exception:
        # Either date may be missing (e.g. the item was never
        # downloaded), in which case the age is simply not shown.
        diff_date = None

    tooltip = ("State: %s\nTags: %s" %
               (state_str,
                ", ".join(tag for tag in self.item.tags
                          if not tag.startswith("#"))))
    if self.item.state in [CURRENT, OUTDATED, DEPRECATED]:
        tooltip += ("\nFile: %s" %
                    serverfiles.localpath(self.item.domain,
                                          self.item.filename))
    if self.item.state == OUTDATED and diff_date:
        tooltip += ("\nServer version: %s\nStatus: old (%d days)" %
                    (self.item.latest, diff_date.days))
    else:
        tooltip += ("\nServer version: %s" % self.item.latest)
    for i in range(1, 4):
        self.setToolTip(i, tooltip)
def get_gds_model(progress=lambda val: None):
    """
    Initialize and return a GDS datasets model.

    :param progress: A progress callback.
    :rtype: tuple of (QStandardItemModel, geo.GDSInfo, [geo.GDS])

    .. note::
        The returned QStandardItemModel's thread affinity is set to
        the GUI thread.

    """
    progress(1)
    info = geo.GDSInfo()
    search_keys = ["dataset_id", "title", "platform_organism",
                   "description"]
    cache_dir = serverfiles.localpath(geo.DOMAIN)
    gds_link = "http://www.ncbi.nlm.nih.gov/sites/GDSbrowser?acc={0}"
    pm_link = "http://www.ncbi.nlm.nih.gov/pubmed/{0}"
    gds_list = []

    def is_cached(gds):
        return os.path.exists(
            os.path.join(cache_dir, gds["dataset_id"]) + ".soft.gz")

    # The mutable default argument is safe here: it is only read.
    def item(displayvalue, item_values={}):
        item = QStandardItem()
        item.setData(displayvalue, Qt.DisplayRole)
        for role, value in item_values.iteritems():
            item.setData(value, role)
        return item

    def gds_to_row(gds):
        #: Text for easier full text search.
        search_text = unicode(
            " | ".join([gds.get(key, "").lower()
                        for key in search_keys]),
            errors="ignore"
        )
        row = [
            item(" " if is_cached(gds) else "",
                 {TextFilterRole: search_text}),
            item(gds["dataset_id"],
                 {LinkRole: gds_link.format(gds["dataset_id"])}),
            item(gds["title"]),
            item(gds["platform_organism"]),
            item(len(gds["samples"])),
            item(gds["feature_count"]),
            item(gds["gene_count"]),
            item(len(gds["subsets"])),
            item(gds.get("pubmed_id", ""),
                 {LinkRole: pm_link.format(gds["pubmed_id"])
                            if gds.get("pubmed_id") else QVariant()})
        ]
        return row

    model = QStandardItemModel()
    model.setHorizontalHeaderLabels(
        ["", "ID", "Title", "Organism", "Samples", "Features",
         "Genes", "Subsets", "PubMedID"]
    )
    progress(20)
    for gds in info.values():
        model.appendRow(gds_to_row(gds))
        gds_list.append(gds)
    progress(50)

    if QThread.currentThread() is not QCoreApplication.instance().thread():
        model.moveToThread(QCoreApplication.instance().thread())

    return model, info, gds_list
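# A minimal sketch of driving `get_gds_model` with a console progress
# callback (Python 2 print syntax, matching the module); the `report`
# printer is illustrative only.
def report(val):
    print "Initializing GDS model: %i%%" % val

model, info, gds_list = get_gds_model(progress=report)
print "%i GDS datasets loaded" % len(gds_list)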
def _extract(self):
    self._tmpfile.seek(0, 0)
    archive = tarfile.open(fileobj=self._tmpfile)
    target_dir = serverfiles.localpath()
    archive.extractall(target_dir)
def init_db(cls, version, taxid, cache_dir=None, dbfilename=None):
    if cache_dir is None:
        cache_dir = serverfiles.localpath(cls.DOMAIN)
    if dbfilename is None:
        dbfilename = serverfiles.localpath(
            cls.DOMAIN,
            "string-protein-detailed.{taxid}.sqlite".format(taxid=taxid)
        )

    pjoin = os.path.join
    base_url = "http://string-db.org/newstring_download/"
    filename = "{taxid}.protein.links.detailed.{version}.txt.gz"
    filename = filename.format(version=version, taxid=taxid)
    url = base_url + "protein.links.detailed.{version}/" + filename
    url = url.format(version=version)

    if not os.path.exists(pjoin(cache_dir, filename)):
        wget(url, cache_dir, progress=True)

    links_fileobj = open(pjoin(cache_dir, filename), "rb")
    links_file = gzip.GzipFile(fileobj=links_fileobj)

    con = sqlite3.connect(dbfilename)
    with con:
        con.execute("""
            DROP TABLE IF EXISTS evidence
        """)

        con.execute("""
            CREATE TABLE evidence(
                 protein_id1 TEXT,
                 protein_id2 TEXT,
                 neighborhood INTEGER,
                 fusion INTEGER,
                 cooccurence INTEGER,
                 coexpression INTEGER,
                 experimental INTEGER,
                 database INTEGER,
                 textmining INTEGER
            )
        """)

        links = csv.reader(links_file, delimiter=" ")
        links.next()  # Skip the header line.

        filesize = os.stat(pjoin(cache_dir, filename)).st_size
        progress = ConsoleProgressBar("Processing links file:")
        progress(1.0)

        def read_links(reader):
            # The last (combined score) column is not stored.
            for i, (p1, p2, n, f, c, cx, ex, db, t, _) in \
                    enumerate(reader):
                yield p1, p2, n, f, c, cx, ex, db, t
                if i % 10000 == 0:
                    progress(100.0 * links_fileobj.tell() / filesize)

        con.executemany("""
            INSERT INTO evidence
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, read_links(links))

        progress.finish()

        print "Indexing"
        con.execute("""
            CREATE INDEX IF NOT EXISTS index_evidence
                ON evidence (protein_id1, protein_id2)
        """)

        con.executescript("""
            DROP TABLE IF EXISTS version;

            CREATE TABLE version (
                string_version text,
                api_version text
            );
        """)

        con.execute("""
            INSERT INTO version
            VALUES (?, ?)
        """, (version, cls.VERSION))
def download(cls):
    src = urllib2.urlopen(
        "http://mips.helmholtz-muenchen.de/proj/ppi/data/mppi.gz")
    dest = serverfiles.localpath("PPI", "mppi.gz")
    # Close the destination file when done instead of leaking the handle.
    with open(dest, "wb") as f:
        shutil.copyfileobj(src, f)
def __init__(self, parent=None, signalManager=None, name="Image viewer"):
    OWWidget.__init__(self, parent, signalManager, name, wantGraph=True)

    self.inputs = [("Data", ExampleTable, self.setData)]
    self.outputs = [("Data", ExampleTable)]

    self.imageAttr = 0
    self.titleAttr = 0
    self.zoom = 25
    self.autoCommit = False
    self.selectionChangedFlag = False

    #
    # GUI
    #
    self.loadSettings()

    self.info = OWGUI.widgetLabel(
        OWGUI.widgetBox(self.controlArea, "Info"),
        "Waiting for input\n"
    )

    self.imageAttrCB = OWGUI.comboBox(
        self.controlArea, self, "imageAttr",
        box="Image Filename Attribute",
        tooltip="Attribute with image filenames",
        callback=[self.clearScene, self.setupScene],
        addSpace=True
    )

    self.titleAttrCB = OWGUI.comboBox(
        self.controlArea, self, "titleAttr",
        box="Title Attribute",
        tooltip="Attribute with image title",
        callback=self.updateTitles,
        addSpace=True
    )

    OWGUI.hSlider(
        self.controlArea, self, "zoom",
        box="Zoom", minValue=1, maxValue=100, step=1,
        callback=self.updateZoom,
        createLabel=False
    )

    OWGUI.separator(self.controlArea)

    box = OWGUI.widgetBox(self.controlArea, "Selection")
    b = OWGUI.button(box, self, "Commit", callback=self.commit)
    cb = OWGUI.checkBox(
        box, self, "autoCommit", "Commit on any change",
        tooltip="Send selections on any change",
        callback=self.commitIf
    )

    OWGUI.setStopper(self, b, cb, "selectionChangedFlag",
                     callback=self.commit)
    OWGUI.rubber(self.controlArea)

    self.scene = GraphicsScene()
    self.sceneView = QGraphicsView(self.scene, self)
    self.sceneView.setAlignment(Qt.AlignTop | Qt.AlignLeft)
    self.sceneView.setRenderHint(QPainter.Antialiasing, True)
    self.sceneView.setRenderHint(QPainter.TextAntialiasing, True)
    self.sceneView.setFocusPolicy(Qt.WheelFocus)
    self.sceneView.setVerticalScrollBarPolicy(Qt.ScrollBarAlwaysOn)
    self.sceneView.installEventFilter(self)
    self.mainArea.layout().addWidget(self.sceneView)

    self.scene.selectionChanged.connect(self.onSelectionChanged)
    self.scene.selectionRectPointChanged.connect(
        self.onSelectionRectPointChanged, Qt.QueuedConnection
    )
    self.graphButton.clicked.connect(self.saveScene)
    self.resize(800, 600)

    self.thumbnailWidget = None
    self.sceneLayout = None
    self.selectedExamples = []

    #: List of _ImageItems
    self.items = []

    self._errcount = 0
    self._successcount = 0

    self.loader = ImageLoader(self)

    # Add the "orange-sf" path prefix for locating files
    # distributed using `serverfiles`.
    sfdir = serverfiles.localpath()
    if sfdir not in [unicode(p) for p in QDir.searchPaths("orange-sf")]:
        QDir.addSearchPath("orange-sf", sfdir)
def default_db_filename(cls, taxid):
    return serverfiles.localpath(
        cls.DOMAIN, cls.FILENAME.format(taxid=taxid))
class ArrayExpressConnection(object):
    """
    Constructs and runs REST queries on ArrayExpress.

    :param address: Address of the ArrayExpress API.
    :param timeout: Timeout for the connection.

    """
    DEFAULT_ADDRESS = "http://www.ebi.ac.uk/arrayexpress/{format}/v2/"
    DEFAULT_FORMAT = "json"
    DEFAULT_CACHE = serverfiles.localpath(
        "ArrayExpress", "ArrayExpressCache.shelve")

    # Order of arguments in the query
    _ARGS_ORDER = ["keywords", "species", "array"]

    def __init__(self, address=None, timeout=30, cache=None,
                 username=None, password=None):
        self.address = address if address is not None else self.DEFAULT_ADDRESS
        self.timeout = timeout
        self.cache = cache if cache is not None else self.DEFAULT_CACHE
        self.username = username
        self.password = password

    def format_query(self, **kwargs):
        """Format the query arguments in `kwargs`.

        >>> conn.format_query(gxa=True, efcount=(1, 5))
        'efcount=[1 TO 5]&gxa=true'

        """
        # Formatters:
        def format_default(val):
            if isinstance(val, basestring):
                return val
            else:
                return "+".join(val)

        def format_species(val):
            return '"%s"' % val.lower()

        def format_gxa(val):
            if val:
                return "true"
            else:
                raise ValueError("gxa={0}".format(val))

        def format_expandefo(val):
            if val:
                return "on"
            else:
                raise ValueError("expandefo={0}".format(val))

        def format_true_false(val):
            return "true" if val else "false"

        def format_interval(val):
            if isinstance(val, tuple):
                return "[{0} TO {1}]".format(*val)
            else:
                raise ValueError("Must be an interval argument (min, max)!")

        def format_date(val):
            # TODO: check that val contains a datetime.date object;
            # assert the proper format.
            return format_interval(val)

        def format_wholewords(val):
            if val:
                return "on"
            else:
                raise ValueError("wholewords={0}".format(val))

        formatters = {
            "species": format_species,
            "gxa": format_gxa,
            "expandefo": format_expandefo,
            "directsub": format_true_false,
            "assaycount": format_interval,
            "efcount": format_interval,
            "samplecount": format_interval,
            "sacount": format_interval,
            "rawcount": format_interval,
            "fgemcount": format_interval,
            "miamescore": format_interval,
            "date": format_date,
            "wholewords": format_wholewords,
        }
        parts = []
        arg_items = kwargs.items()
        arg_items = sorted(
            arg_items,
            key=lambda arg: (self._ARGS_ORDER.index(arg[0])
                             if arg[0] in self._ARGS_ORDER else 100)
        )

        for key, value in arg_items:
            if key == "format":
                continue  # format is handled in query_url
            if key not in ARRAYEXPRESS_FIELDS:
                raise ValueError("Invalid argument name: '{0}'".format(key))
            if value is not None and value != []:
                fmt = formatters.get(key, format_default)
                value = fmt(value)
                parts.append("{0}={1}".format(key, value))

        return "&".join(parts)

    def query_url(self, what="experiments", **kwargs):
        """Return a formatted query URL for the query arguments.

        >>> conn.query_url(accession="E-MEXP-31")
        'http://www.ebi.ac.uk/arrayexpress/json/v2/experiments?accession=E-MEXP-31'

        """
        query = self.format_query(**kwargs)
        url = posixpath.join(self.address, what)
        url = url.format(format=kwargs.get("format", self.DEFAULT_FORMAT))
        url = url + ("?" + query if query else "")
        url = url.replace(" ", "%20")
        return url

    def query_url_experiments(self, **kwargs):
        """Return an experiments query URL for the query arguments.
        """
        return self.query_url("experiments", **kwargs)

    def query_url_files(self, **kwargs):
        """Return a files query URL for the query arguments.
        """
        return self.query_url("files", **kwargs)

    def query_experiment(self, **kwargs):
        """Return an open stream to the experiments query results.

        Takes the same arguments as the :obj:`query_experiments` function.

        """
        url = self.query_url_experiments(**kwargs)
        stream = self._cache_urlopen(url, timeout=self.timeout)
        return stream

    def query_files(self, **kwargs):
        """Return an open stream to the files query results.

        Takes the same arguments as the :obj:`query_files` function.

        """
        url = self.query_url_files(**kwargs)
        stream = self._cache_urlopen(url, timeout=self.timeout)
        return stream

    def open_file(self, accession, kind="raw", ext=None):
        """
        Return a file handle to experiment data.

        :param str accession:
        :param str kind: Experiment data type.

        Possible values for the parameter `kind`:
            - raw: return the raw data if available
            - processed: return the processed data if available
            - biosamples: a png or svg design image
            - idf: investigation description
            - adf: array design description
            - mageml: MAGE-ML file

        Example::

            >>> raw_file = conn.open_file("E-TABM-1087", kind="raw")
            >>> processed_file = conn.open_file("E-TABM-1087",
            ...                                 kind="processed")

        """
        stream = self.query_files(accession=accession, format="json")
        data = json.load(stream)
        try:
            files = data["files"]["experiment"]["file"]
        except KeyError:
            raise ValueError(accession)

        for file in files:
            filekind = file["kind"]
            fileext = file["extension"]
            if filekind == kind and (fileext == ext or ext is None):
                url = file["url"]
                return self._cache_urlopen(str(url), timeout=self.timeout)

        raise ValueError("%s does not have a file of kind: %r" %
                         (accession, kind))

    def _cache_urlopen(self, url, timeout=30):
        if self.cache is not None:
            with self.open_cache("r") as cache:
                if url in cache:
                    return StringIO(cache[url])

            stream = urllib2.urlopen(url, timeout=timeout)
            data = stream.read()
            with self.open_cache("w") as cache:
                cache[url] = data

            return StringIO(data)
        else:
            return urllib2.urlopen(url, timeout=timeout)

    def open_cache(self, flag="r"):
        if isinstance(self.cache, basestring):
            try:
                return closing(_open_shelve(self.cache, flag))
            except Exception:
                return _fake_closing({})
        elif hasattr(self.cache, "close"):
            return closing(self.cache)
        elif self.cache is None:
            return _fake_closing({})
        else:
            return _fake_closing(self.cache)
def init_db(cls, version, taxid, cache_dir=None, dbfilename=None):
    if cache_dir is None:
        cache_dir = serverfiles.localpath(cls.DOMAIN)

    if dbfilename is None:
        dbfilename = cls.default_db_filename(taxid)

    pjoin = os.path.join
    base_url = "http://string-db.org/newstring_download/"

    def paths(flatfile):
        url = "{flatfile}.{version}/{taxid}.{flatfile}.{version}.txt.gz"
        url = url.format(flatfile=flatfile, version=version, taxid=taxid)
        return posixpath.basename(url), base_url + url

    links_filename, links_url = paths("protein.links")
    actions_filename, actions_url = paths("protein.actions")
    aliases_filename, aliases_url = paths("protein.aliases")

    def download(filename, url):
        # Download to a temporary name and move into place only when
        # complete, so a partial download is never mistaken for a
        # cached file.
        with open(pjoin(cache_dir, filename + ".tmp"), "wb") as dest:
            wget(url, dst_obj=dest, progress=True)
        shutil.move(pjoin(cache_dir, filename + ".tmp"),
                    pjoin(cache_dir, filename))

    for fname, url in [(links_filename, links_url),
                       (actions_filename, actions_url),
                       (aliases_filename, aliases_url)]:
        if not os.path.exists(pjoin(cache_dir, fname)):
            download(fname, url)

    links_fileobj = open(pjoin(cache_dir, links_filename), "rb")
    actions_fileobj = open(pjoin(cache_dir, actions_filename), "rb")
    aliases_fileobj = open(pjoin(cache_dir, aliases_filename), "rb")

    links_file = gzip.GzipFile(fileobj=links_fileobj)
    actions_file = gzip.GzipFile(fileobj=actions_fileobj)
    aliases_file = gzip.GzipFile(fileobj=aliases_fileobj)

    progress = ConsoleProgressBar("Processing {}:".format(links_filename))
    progress(0.0)

    def st_size(filename):
        return os.stat(pjoin(cache_dir, filename)).st_size

    filesize = st_size(links_filename)

    con = sqlite3.connect(dbfilename)
    with con:
        cls.clear_db(con)

        links_file.readline()  # Skip the header line.
        reader = csv.reader(links_file, delimiter=" ")

        def read_links(reader, progress):
            for i, (p1, p2, score) in enumerate(reader):
                yield p1, p2, int(score)
                if i % 100000 == 0:
                    # Update the progress every 100000 lines
                    progress(100.0 * links_fileobj.tell() / filesize)

        con.executemany("INSERT INTO links VALUES (?, ?, ?)",
                        read_links(reader, progress))
        progress.finish()

        def part(string, sep, part):
            return string.split(sep)[part]

        con.create_function("part", 3, part)
        con.execute("""
            INSERT INTO proteins
            SELECT protein_id1, part(protein_id1, '.', 0)
            FROM (SELECT DISTINCT(protein_id1)
                  FROM links
                  ORDER BY protein_id1)
        """)

        filesize = st_size(actions_filename)

        actions_file.readline()  # Skip the header line.
        progress = ConsoleProgressBar("Processing actions:")
        reader = csv.reader(actions_file, delimiter="\t")

        def read_actions(reader):
            for i, (p1, p2, mode, action, a_is_acting, score) in \
                    enumerate(reader):
                yield p1, p2, mode, action, int(score)
                if i % 10000 == 0:
                    progress(100.0 * actions_fileobj.tell() / filesize)

        con.executemany("INSERT INTO actions VALUES (?, ?, ?, ?, ?)",
                        read_actions(reader))
        progress.finish()

        filesize = st_size(aliases_filename)

        aliases_file.readline()  # Skip the header line.
        progress = ConsoleProgressBar("Processing aliases:")
        reader = csv.reader(aliases_file, delimiter="\t")

        def read_aliases(reader, progress):
            for i, (taxid, name, alias, source) in enumerate(reader):
                yield (".".join([taxid, name]),
                       alias.decode("utf-8", errors="ignore"),
                       source.decode("utf-8", errors="ignore"))
                if i % 10000 == 0:
                    progress(100.0 * aliases_fileobj.tell() / filesize)

        con.executemany("INSERT INTO aliases VALUES (?, ?, ?)",
                        read_aliases(reader, progress))
        progress.finish()

        print "Indexing the database"
        cls.create_db_index(con)

        con.executescript("""
            DROP TABLE IF EXISTS version;

            CREATE TABLE version (
                string_version text,
                api_version text
            );
        """)

        con.execute("""
            INSERT INTO version
            VALUES (?, ?)
        """, (version, cls.VERSION))
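# A minimal sketch of building and probing the local database;
# `STRING` is an assumed name for the class defining `init_db` (its
# `default_db_filename` appears earlier in this section), and
# "9.1"/"9606" stand in for the STRING release and NCBI taxonomy id
# actually targeted.
STRING.init_db("9.1", "9606")
db = sqlite3.connect(STRING.default_db_filename("9606"))
print db.execute("SELECT COUNT(*) FROM links").fetchone()[0]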
class GeneExpressionAtlasConenction(object):
    """
    A connection to the Gene Expression Atlas database.

    :param address: Address of the GXA server (default:
        http://www-test.ebi.ac.uk/gxa/api/deprecated).
    :param timeout: Socket timeout (default 30).
    :param cache: A dict like object to use as a cache.

    """
    DEFAULT_ADDRESS = "http://www-test.ebi.ac.uk/gxa/api/deprecated"

    DEFAULT_CACHE = serverfiles.localpath(
        "GeneAtlas", "GeneAtlasConnectionCache.shelve")

    def __init__(self, address=None, timeout=30, cache=None):
        self.address = address if address is not None else self.DEFAULT_ADDRESS
        self.timeout = timeout
        self.cache = cache if cache is not None else self.DEFAULT_CACHE

    def query(self, condition, format="json", start=None, rows=None,
              indent=False):
        warnings.warn(
            "The Gene Expression Atlas REST api has been deprecated and "
            "will be removed in the future.",
            UserWarning
        )

        url = self.address + "?" + condition.rest()
        if start is not None and rows is not None:
            url += "&start={0}&rows={1}".format(start, rows)
        url += "&format={0}".format(format)
        if indent:
            url += "&indent"

        if self.cache is not None:
            return self._query_cached(url, format)
        else:
            return urllib2.urlopen(url)

    def _query_cached(self, url, format):
        if self.cache is not None:
            with self.open_cache("r") as cache:
                if url in cache:
                    return StringIO(cache[url])

            response = urllib2.urlopen(url)
            contents = response.read()
            # Test that the contents is a valid json or xml string
            # (sometimes the stream just stops in the middle), so we
            # don't cache an invalid response.
            # TODO: what about server errors (e.g. 'cannot handle the
            # query in a timely fashion')?
            if format == "json":
                parse_json(StringIO(contents))
            else:
                parse_xml(StringIO(contents))

            with self.open_cache("w") as cache:
                cache[url] = contents

            return StringIO(contents)
        else:
            return urllib2.urlopen(url)

    def open_cache(self, flag="r"):
        """
        Return a context manager for a dict like cache object.
        """
        if isinstance(self.cache, basestring):
            try:
                return closing(_open_shelve(self.cache, flag))
            except Exception:
                return fake_closing({})
        else:
            return fake_closing(self.cache)
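# A minimal usage sketch; `condition` stands for any query-condition
# object exposing a `rest()` method that renders its part of the URL
# query string (as used by `query` above) -- substitute a concrete
# condition class from this module.
connection = GeneExpressionAtlasConenction()
stream = connection.query(condition, format="json", start=0, rows=200)
results = parse_json(stream)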