def _local_filepath(self, url):
    """Return the path in the local repository corresponding to `url`."""
    parent, fname = posixpath.split(url)
    subdir = posixpath.basename(parent)
    return serverfiles.localpath("ArrayExpress", os.path.join(subdir, fname))
def fgem_to_table(self):
    """
    Retrieve the processed matrix from the Array Express FTP
    server and convert it to a :class:`Orange.data.Table`.
    """
    # A processed (FGEM) data matrix must be available for this experiment.
    assert(self.fgemdatafiles)
    repo_dir = serverfiles.localpath("ArrayExpress", self.accession)
    # Find the file listing the data matrix files
    # (should be in sdrf but sometimes it is in 2column file only, why?)
    sdrf = self._search_files("sdrf", "txt")
    if sdrf:
        sdrf = SampleDataRelationship(
            io.TextIOWrapper(self._open(sdrf[0].get("url")),
                             encoding="utf-8"),
        )
        if "Derived Array Data Matrix File" not in sdrf.header:
            # Fall back to the two-column file for the matrix listing.
            twocol = self._search_files("twocolumn", "txt")
            if twocol:
                sdrf = SampleDataRelationship(
                    io.TextIOWrapper(self._open(twocol[0].get("url")),
                                     encoding="utf-8")
                )
    matrix_file = self._search_files("fgem")[0]
    self._open(matrix_file.get("url"))  # To download if not cached
    idf_file = self._search_files("idf", "txt")[0]
    self._open(idf_file.get("url"))  # To download if not cached
    # The MAGE-TAB reader resolves the matrix files relative to the IDF.
    return mage_tab_to_orange(os.path.join(repo_dir, idf_file.get("name")))
def fgem_to_table(self):
    """
    Retrieve the processed matrix from the Array Express FTP
    server and convert it to a :class:`Orange.data.Table`.
    """
    # A processed (FGEM) data matrix must be available for this experiment.
    assert (self.fgemdatafiles)
    repo_dir = serverfiles.localpath("ArrayExpress", self.accession)
    # Find the file listing the data matrix files
    # (should be in sdrf but sometimes it is in 2column file only, why?)
    sdrf = self._search_files("sdrf", "txt")
    if sdrf:
        sdrf = SampleDataRelationship(
            io.TextIOWrapper(self._open(sdrf[0].get("url")),
                             encoding="utf-8"),
        )
        if "Derived Array Data Matrix File" not in sdrf.header:
            # Fall back to the two-column file for the matrix listing.
            twocol = self._search_files("twocolumn", "txt")
            if twocol:
                sdrf = SampleDataRelationship(
                    io.TextIOWrapper(self._open(twocol[0].get("url")),
                                     encoding="utf-8"))
    matrix_file = self._search_files("fgem")[0]
    self._open(matrix_file.get("url"))  # To download if not cached
    idf_file = self._search_files("idf", "txt")[0]
    self._open(idf_file.get("url"))  # To download if not cached
    # The MAGE-TAB reader resolves the matrix files relative to the IDF.
    return mage_tab_to_orange(os.path.join(repo_dir, idf_file.get("name")))
def _download_file(self, url, extract=True):
    """Download the `url` from the ArrayExpress into a local
    repository directory and (optionally) extract the archive.

    :param str url: Remote file URL.
    :param bool extract: If True, extract the downloaded archive
        (.zip, .gz, .tgz) into the repository directory.
    :raises ValueError: For an unrecognized file extension.
    """
    rest, basename = posixpath.split(url)
    dirname = posixpath.basename(rest)
    repo_dir = serverfiles.localpath("ArrayExpress", dirname)
    try:
        os.makedirs(repo_dir)
    except OSError:
        pass  # directory already exists
    stream = urlopen(url)
    local_filename = os.path.join(repo_dir, basename)
    # FIX: close the destination file (original leaked the handle).
    with open(local_filename, "wb") as dst:
        shutil.copyfileobj(stream, dst)
    if extract:
        _, extension = os.path.splitext(local_filename)
        if extension == ".zip":
            import zipfile
            with zipfile.ZipFile(local_filename) as zfile:
                zfile.extractall(repo_dir)
        elif extension == ".gz":
            import gzip
            # FIX: gzip file objects have no `extractall`; decompress the
            # stream into the same name with the ".gz" suffix stripped.
            with gzip.open(local_filename) as gzfile:
                with open(local_filename[:-len(".gz")], "wb") as out:
                    shutil.copyfileobj(gzfile, out)
        elif extension in [".tgz"]:
            import tarfile
            # FIX: tarfile.TarFile() expects an uncompressed archive;
            # tarfile.open with "r:gz" handles the gzip-compressed .tgz.
            with tarfile.open(local_filename, "r:gz") as tfile:
                tfile.extractall(repo_dir)
        elif extension == ".txt":
            pass  # plain text needs no extraction
        else:
            raise ValueError("Unknown extension ('{0}').".format(basename))
def _local_filepath(self, url):
    """ Return the local file path for url.
    """
    # Mirror the last directory component of the remote url under the
    # local "ArrayExpress" server-files repository.
    rest, basename = posixpath.split(url)
    dirname = posixpath.basename(rest)
    return serverfiles.localpath("ArrayExpress",
                                 os.path.join(dirname, basename))
def updateInfo(self):
    """Refresh the info box with dataset counts and the filter status."""
    all_datasets = self.gds_info
    cached = len(glob.glob(serverfiles.localpath("GEO") + "/GDS*"))
    summary = "%i datasets\n%i datasets cached\n" % (len(all_datasets), cached)
    shown = self.treeWidget.model().rowCount()
    if len(self.gds) != shown:
        summary += "%i after filtering" % shown
    self.infoBox.setText(summary)
def ParseTaxdumpFile(file=None, outputdir=None, callback=None):
    """Parse an NCBI ``taxdump.tar.gz`` archive into local TextDB databases.

    :param file: Path to a taxdump archive or an already opened
        :class:`tarfile.TarFile`; when None the archive is downloaded
        from the NCBI FTP site.
    :param outputdir: Output directory for the databases (defaults to
        the local "Taxonomy" server-files path).
    :param callback: Optional progress callback, called with the entry
        index at ~1% milestones.
    """
    import Orange.utils
    if file is None:  # FIX: identity test instead of `== None`
        so = StringIO()
        Orange.utils.wget("ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz",
                          dst_obj=so)
        file = tarfile.open(None, "r:gz", StringIO(so.getvalue()))
        so.close()
    elif isinstance(file, str):  # FIX: isinstance instead of `type() ==`
        file = tarfile.open(file)
    # NOTE(review): extractfile() returns bytes on Python 3 — the
    # rstrip/split below assume text; confirm the targeted Python version.
    names = file.extractfile("names.dmp").readlines()
    nodes = file.extractfile("nodes.dmp").readlines()
    namesDict = defaultdict(list)
    for line in names:
        if not line.strip():
            continue
        line = line.rstrip("\t\n|").split("\t|\t")
        id, name, unique_name, name_class = line
        if unique_name:
            namesDict[id].append((unique_name, name_class))
        else:
            namesDict[id].append((name, name_class))
    nodesDict = {}
    for line in nodes:
        if not line.strip():
            continue
        line = line.split("\t|\t")[:3]
        id, parent, rank = line
        nodesDict[id] = (parent, rank)
    if outputdir is None:  # FIX: identity test instead of `== None`
        outputdir = serverfiles.localpath("Taxonomy")
    text = TextDB().create(os.path.join(outputdir, "ncbi_taxonomy.db"))
    info = TextDB().create(os.path.join(outputdir, "ncbi_taxonomy_inf.db"))
    milestones = set(range(0, len(namesDict), max(int(len(namesDict) / 100), 1)))
    for i, (id, names) in enumerate(namesDict.items()):
        parent, rank = nodesDict[id]
        ## id, parent and rank go first
        entry = [id, parent, rank]
        ## all names and name class codes pairs follow ordered so
        ## scientific name is first
        names = sorted(names,
                       key=lambda x: (not x[1] == "scientific name",
                                      x[1], x[0]))
        # FIX: removed leftover debug `print(names)`.
        entry.extend([name for name, class_ in names])
        info_entry = [id] + [class_ for name, class_ in names]
        text(entry)
        info(info_entry)
        if callback and i in milestones:
            callback(i)
def _updateToolTip(self):
    """Rebuild the tooltip (state, tags, local file, server version)
    and apply it to columns 1-3 of this tree item.
    """
    state_str = self.STATE_STRINGS[self.item.state]
    try:
        diff_date = self.item.latest - self.item.local
    except Exception:
        # FIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt. A date may be unavailable (e.g. None) for
        # items without a local/server copy.
        diff_date = None
    # Hide internal tags (those starting with "#") from the tooltip.
    tooltip = ("State: %s\nTags: %s"
               % (state_str,
                  ", ".join(tag for tag in self.item.tags
                            if not tag.startswith("#"))))
    if self.item.state in [CURRENT, OUTDATED, DEPRECATED]:
        tooltip += ("\nFile: %s"
                    % serverfiles.localpath(self.item.domain,
                                            self.item.filename))
    if self.item.state == OUTDATED and diff_date:
        tooltip += ("\nServer version: %s\nStatus: old (%d days)"
                    % (self.item.latest, diff_date.days))
    else:
        tooltip += ("\nServer version: %s" % self.item.latest)
    for i in range(1, 4):
        self.setToolTip(i, tooltip)
def get_gds_model(progress=lambda val: None):
    """
    Initialize and return a GDS datasets model.

    :param progress: A progress callback.
    :rval tuple:
        A tuple of (QStandardItemModel, geo.GDSInfo, [geo.GDS])

    .. note::
        The returned QStandardItemModel's thread affinity is set to
        the GUI thread.

    """
    progress(1)
    info = geo.GDSInfo()
    # Fields concatenated into the hidden full-text-search column.
    search_keys = ["dataset_id", "title", "platform_organism", "description"]
    cache_dir = serverfiles.localpath(geo.DOMAIN)
    gds_link = "http://www.ncbi.nlm.nih.gov/sites/GDSbrowser?acc={0}"
    pm_link = "http://www.ncbi.nlm.nih.gov/pubmed/{0}"
    gds_list = []

    def is_cached(gds):
        # True when the dataset's .soft.gz file is already downloaded.
        return os.path.exists(
            os.path.join(cache_dir, gds["dataset_id"]) + ".soft.gz")

    def item(displayvalue, item_values={}):
        # NOTE: the mutable default is only read, never mutated.
        item = QStandardItem()
        item.setData(displayvalue, Qt.DisplayRole)
        for role, value in item_values.items():
            item.setData(value, role)
        return item

    def gds_to_row(gds):
        #: Text for easier full search.
        search_text = " | ".join(
            [gds.get(key, "").lower() for key in search_keys])
        row = [
            item(" " if is_cached(gds) else "",
                 {TextFilterRole: search_text}),
            item(gds["dataset_id"],
                 {LinkRole: gds_link.format(gds["dataset_id"])}),
            item(gds["title"]),
            item(gds["platform_organism"]),
            item(len(gds["samples"])),
            item(gds["feature_count"]),
            item(gds["gene_count"]),
            item(len(gds["subsets"])),
            item(
                gds.get("pubmed_id", ""),
                {
                    LinkRole: pm_link.format(gds["pubmed_id"])
                    if gds.get("pubmed_id") else None
                })
        ]
        return row

    model = QStandardItemModel()
    model.setHorizontalHeaderLabels([
        "", "ID", "Title", "Organism", "Samples", "Features", "Genes",
        "Subsets", "PubMedID"
    ])
    progress(20)
    for gds in info.values():
        model.appendRow(gds_to_row(gds))
        gds_list.append(gds)
    progress(50)

    # Move the model to the GUI thread so widgets can use it safely.
    if QThread.currentThread() is not QCoreApplication.instance().thread():
        model.moveToThread(QCoreApplication.instance().thread())
    return model, info, gds_list
"Subsets", "PubMedID" ]) progress(20) for gds in info.values(): model.appendRow(gds_to_row(gds)) gds_list.append(gds) progress(50) if QThread.currentThread() is not QCoreApplication.instance().thread(): model.moveToThread(QCoreApplication.instance().thread()) return model, info, gds_list GDS_CACHE_DIR = serverfiles.localpath(geo.DOMAIN) if sys.version_info >= (3, 4): _os_replace = os.replace else: if os.name != "posix": def _os_replace(src, dst): try: os.rename(src, dst) except FileExistsError: os.remove(dst) os.rename(src) else: _os_replace = os.rename
def get_gds_model(progress=lambda val: None):
    """
    Initialize and return a GDS datasets model.

    :param progress: A progress callback.
    :rval tuple:
        A tuple of (QStandardItemModel, geo.GDSInfo, [geo.GDS])

    .. note::
        The returned QStandardItemModel's thread affinity is set to
        the GUI thread.

    """
    progress(1)
    gds_info = geo.GDSInfo()
    # Fields concatenated into the hidden full-text-search column.
    searchable_keys = ["dataset_id", "title", "platform_organism",
                       "description"]
    soft_dir = serverfiles.localpath(geo.DOMAIN)
    browser_url = "http://www.ncbi.nlm.nih.gov/sites/GDSbrowser?acc={0}"
    pubmed_url = "http://www.ncbi.nlm.nih.gov/pubmed/{0}"
    datasets = []

    def has_local_copy(gds):
        # True when the dataset's .soft.gz file is already downloaded.
        path = os.path.join(soft_dir, gds["dataset_id"]) + ".soft.gz"
        return os.path.exists(path)

    def make_item(display, extra=None):
        # Build a QStandardItem with a display value plus optional
        # role -> value data entries.
        it = QStandardItem()
        it.setData(display, Qt.DisplayRole)
        if extra:
            for role, value in extra.items():
                it.setData(value, role)
        return it

    def as_row(gds):
        # Concatenated lower-cased fields for full text filtering.
        fulltext = " | ".join(gds.get(key, "").lower()
                              for key in searchable_keys)
        pubmed_id = gds.get("pubmed_id")
        return [
            make_item(" " if has_local_copy(gds) else "",
                      {TextFilterRole: fulltext}),
            make_item(gds["dataset_id"],
                      {LinkRole: browser_url.format(gds["dataset_id"])}),
            make_item(gds["title"]),
            make_item(gds["platform_organism"]),
            make_item(len(gds["samples"])),
            make_item(gds["feature_count"]),
            make_item(gds["gene_count"]),
            make_item(len(gds["subsets"])),
            make_item(gds.get("pubmed_id", ""),
                      {LinkRole: pubmed_url.format(pubmed_id)
                       if pubmed_id else None}),
        ]

    model = QStandardItemModel()
    headers = ["", "ID", "Title", "Organism", "Samples", "Features",
               "Genes", "Subsets", "PubMedID"]
    model.setHorizontalHeaderLabels(headers)
    progress(20)
    for gds in gds_info.values():
        model.appendRow(as_row(gds))
        datasets.append(gds)
    progress(50)

    # Move the model to the GUI thread so widgets can use it safely.
    gui_thread = QCoreApplication.instance().thread()
    if QThread.currentThread() is not gui_thread:
        model.moveToThread(gui_thread)
    return model, gds_info, datasets
"Genes", "Subsets", "PubMedID"] ) progress(20) for gds in info.values(): model.appendRow(gds_to_row(gds)) gds_list.append(gds) progress(50) if QThread.currentThread() is not QCoreApplication.instance().thread(): model.moveToThread(QCoreApplication.instance().thread()) return model, info, gds_list GDS_CACHE_DIR = serverfiles.localpath(geo.DOMAIN) if sys.version_info >= (3, 4): _os_replace = os.replace else: if os.name != "posix": def _os_replace(src, dst): try: os.rename(src, dst) except FileExistsError: os.remove(dst) os.rename(src) else: _os_replace = os.rename
class ArrayExpressConnection(object):
    """
    Constructs and runs REST query on ArrayExpress.

    :param address: Address of the ArrayExpress API.
    :param timeout: Timeout for the connection.

    """
    DEFAULT_ADDRESS = "http://www.ebi.ac.uk/arrayexpress/{format}/v2/"
    DEFAULT_FORMAT = "json"
    # Default location of the persistent url -> response shelve cache.
    DEFAULT_CACHE = serverfiles.localpath(
        "ArrayExpress", "ArrayExpressCache.shelve")
    # Order of arguments in the query
    _ARGS_ORDER = ["keywords", "species", "array"]

    def __init__(self, address=None, timeout=30, cache=None,
                 username=None, password=None):
        self.address = address if address is not None else self.DEFAULT_ADDRESS
        self.timeout = timeout
        self.cache = cache if cache is not None else self.DEFAULT_CACHE
        # NOTE(review): username/password are stored but not used by any
        # method visible here — confirm whether authentication is wired up.
        self.username = username
        self.password = password

    def format_query(self, **kwargs):
        """Format the query arguments in `kwargs`.

        >>> conn.format_query(gxa=True, efcount=(1, 5))
        'efcount=[1 TO 5]&gxa=true'

        """
        # Formaters: per-field value -> query-string fragment converters.
        def format_default(val):
            # Plain string, or a sequence joined with '+'.
            if isinstance(val, six.string_types):
                return val
            else:
                return "+".join(val)

        def format_species(val):
            # Species names are quoted and lower-cased.
            return '"%s"' % val.lower()

        def format_gxa(val):
            # Only gxa=true is a valid query; anything else is an error.
            if val:
                return "true"
            else:
                raise ValueError("gxa={0}".format(val))

        def format_expandefo(val):
            if val:
                return "on"
            else:
                raise ValueError("expandefo={0}".format(val))

        def format_true_false(val):
            return "true" if val else "false"

        def format_interval(val):
            # (min, max) tuple -> "[min TO max]" range syntax.
            if isinstance(val, tuple):
                return "[{0} TO {1}]".format(*val)
            else:
                raise ValueError("Must be an interval argument (min, max)!")

        def format_date(val):
            # TODO check if val contains a datetime.date object
            # assert proper format
            return format_interval(val)

        def format_wholewords(val):
            if val:
                return "on"
            else:
                raise ValueError("wholewords={0}".format(val))

        formaters = {
            "species": format_species,
            "gxa": format_gxa,
            "expandefo": format_expandefo,
            "directsub": format_true_false,
            "assaycount": format_interval,
            "efcount": format_interval,
            "samplecount": format_interval,
            "sacount": format_interval,
            "rawcount": format_interval,
            "fgemcount": format_interval,
            "miamescore": format_interval,
            "date": format_date,
            "wholewords": format_wholewords,
        }
        parts = []
        # Sort alphabetically first; the second (stable) sort then moves
        # the _ARGS_ORDER keys to the front, keeping the rest alphabetical.
        arg_items = sorted(kwargs.items())
        arg_items = sorted(
            arg_items,
            key=lambda arg: self._ARGS_ORDER.index(arg[0])
                            if arg[0] in self._ARGS_ORDER else 100)
        for key, value in arg_items:
            if key == "format":
                continue  # format is handled in query_url
            if key not in ARRAYEXPRESS_FIELDS:
                raise ValueError("Invalid argument name: '{0}'".format(key))
            if value is not None and value != []:
                fmt = formaters.get(key, format_default)
                value = fmt(value)
                parts.append("{0}={1}".format(key, value))
        return "&".join(parts)

    def query_url(self, what="experiments", **kwargs):
        """Return a formatted query URL for the query arguments.

        >>> conn.query_url(accession="E-MEXP-31")
        'http://www.ebi.ac.uk/arrayexpress/json/v2/experiments?accession=E-MEXP-31'

        """
        query = self.format_query(**kwargs)
        url = posixpath.join(self.address, what)
        url = url.format(format=kwargs.get("format", self.DEFAULT_FORMAT))
        url = url + ("?" + query if query else "")
        url = url.replace(" ", "%20")
        return url

    def query_url_experiments(self, **kwargs):
        """Return query URL of formatted experiments for the query arguments.
        """
        return self.query_url("experiments", **kwargs)

    def query_url_files(self, **kwargs):
        """ Return query URL of formatted experiments for the query arguments.
        """
        return self.query_url("files", **kwargs)

    def query_experiment(self, **kwargs):
        """Return an open stream to the experiments query results.

        Takes the same arguments as the :obj:`query_experiments` function.
        """
        url = self.query_url_experiments(**kwargs)
        stream = self._cache_urlopen(url, timeout=self.timeout)
        return stream

    def query_files(self, **kwargs):
        """Return an open stream to the files query results.

        Takes the same arguments as the :obj:`query_files` function.
        """
        url = self.query_url_files(**kwargs)
        stream = self._cache_urlopen(url, timeout=self.timeout)
        return stream

    def open_file(self, accession, kind="raw", ext=None):
        """
        Return a file handle to experiment data.

        :param str accession:
        :param str kind: Experiment data type.

        Possible values for the parameter `kind`:
            - raw: return the raw data if available
            - processed: return the processed data if available
            - biosamples: a png or svg design image
            - idf: investigation description
            - adf: array design description
            - mageml: MAGE-ML file

        Example::

            >>> raw_file = conn.open_file("E-TABM-1087", kind="raw") # doctest: +SKIP
            >>> processed_file = conn.open_file("E-TABM-1087", kind="processed") # doctest: +SKIP

        """
        stream = self.query_files(accession=accession, format="json")
        data = json.load(io.TextIOWrapper(stream, encoding="utf-8"))
        try:
            files = data["files"]["experiment"]["file"]
        except KeyError:
            raise ValueError(accession)
        # Return the first file matching the requested kind (and extension).
        for file in files:
            filekind = file["kind"]
            fileext = file["extension"]
            if (filekind == kind) and (fileext == ext or ext is None):
                url = file["url"]
                return self._cache_urlopen(str(url), timeout=self.timeout)
        raise ValueError("%s does not have a file of kind: %r"
                         % (accession, kind))

    def _cache_urlopen(self, url, timeout=30):
        # Serve from the persistent cache when possible; otherwise fetch,
        # store the raw bytes, and return an in-memory stream.
        if self.cache is not None:
            with self.open_cache("r") as cache:
                if url in cache:
                    return io.BytesIO(cache[url])
            stream = urlopen(url, timeout=timeout)
            data = stream.read()
            with self.open_cache("w") as cache:
                cache[url] = data
            return io.BytesIO(data)
        else:
            return urlopen(url, timeout=timeout)

    def open_cache(self, flag="r"):
        # Return a closing() context over the cache: a shelve when
        # self.cache is a path, the object itself when it is file-like,
        # or a throwaway dict when no usable cache is available.
        if isinstance(self.cache, six.string_types):
            try:
                return closing(_open_shelve(self.cache, flag))
            except Exception:
                # Cache is best-effort; fall back to a transient dict.
                return _fake_closing({})
        elif hasattr(self.cache, "close"):
            return closing(self.cache)
        elif self.cache is None:
            return _fake_closing({})
        else:
            return _fake_closing(self.cache)