def __stream_dataset_collection(self, trans, dataset_collection_instance):
    # Stream a plain tar when an upstream proxy is configured to gzip
    # responses; otherwise gzip on the fly.
    archive_type_string = 'w|gz'
    archive_ext = 'tgz'
    if self.app.config.upstream_gzip:
        archive_type_string = 'w|'
        archive_ext = 'tar'
    archive = StreamBall(mode=archive_type_string)
    names, hdas = get_hda_and_element_identifiers(dataset_collection_instance)
    for name, hda in zip(names, hdas):
        # Skip datasets that are not in the OK state.
        if hda.state != hda.states.OK:
            continue
        for file_path, relpath in hda.datatype.to_archive(trans=trans, dataset=hda, name=name):
            archive.add(file=file_path, relpath=relpath)
    archive_name = "%s: %s.%s" % (dataset_collection_instance.hid, dataset_collection_instance.name, archive_ext)
    trans.response.set_content_type("application/x-tar")
    trans.response.headers["Content-Disposition"] = 'attachment; filename="{}"'.format(archive_name)
    archive.wsgi_status = trans.response.wsgi_status()
    archive.wsgi_headeritems = trans.response.wsgi_headeritems()
    return archive.stream
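# --- Illustrative sketch (not part of the controller) -----------------------
# The 'w|gz' / 'w|' mode strings above are tarfile streaming modes, which is
# what StreamBall appears to wrap: 'w|gz' gzips on the fly, while 'w|' emits a
# plain tar so an upstream proxy that already gzips responses (the
# upstream_gzip case) does not compress twice. A minimal stand-alone version
# of that pattern using only the standard library; the names here are
# hypothetical, not Galaxy API:

import io
import tarfile

def stream_tarball(members, upstream_gzip=False):
    """Return tar bytes for an iterable of (file_path, archive_path) pairs."""
    mode = 'w|' if upstream_gzip else 'w|gz'  # plain tar when the proxy gzips
    buf = io.BytesIO()
    with tarfile.open(fileobj=buf, mode=mode) as tar:
        for file_path, archive_path in members:
            tar.add(file_path, arcname=archive_path)  # arcname = path inside the archive
    return buf.getvalue()
# -----------------------------------------------------------------------------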
def download(self, trans, format, **kwd):
    """
    GET /api/libraries/datasets/download/{format}
    POST /api/libraries/datasets/download/{format}

    Download requested datasets (identified by encoded IDs) in requested format.

    example: ``GET localhost:8080/api/libraries/datasets/download/tbz?ld_ids%255B%255D=a0d84b45643a2678&ld_ids%255B%255D=fe38c84dcd46c828``

    .. note:: supported format values are: 'zip', 'tgz', 'tbz', 'uncompressed'

    :param  format:        string representing requested archive format
    :type   format:        string
    :param  ld_ids[]:      an array of encoded dataset ids
    :type   ld_ids[]:      an array
    :param  folder_ids[]:  an array of encoded folder ids
    :type   folder_ids[]:  an array

    :returns: either archive with the requested datasets packed inside or a single uncompressed dataset
    :rtype:   file

    :raises: MessageException, ItemDeletionException, ItemAccessibilityException, HTTPBadRequest, OSError, IOError, ObjectNotFound
    """
    library_datasets = []
    datasets_to_download = kwd.get('ld_ids%5B%5D', None)
    if datasets_to_download is None:
        datasets_to_download = kwd.get('ld_ids', None)
    if datasets_to_download is not None:
        datasets_to_download = util.listify(datasets_to_download)
        for dataset_id in datasets_to_download:
            try:
                library_dataset = self.get_library_dataset(trans, id=dataset_id, check_ownership=False, check_accessible=True)
                library_datasets.append(library_dataset)
            except HTTPBadRequest:
                raise exceptions.RequestParameterInvalidException('Bad Request.')
            except HTTPInternalServerError:
                raise exceptions.InternalServerError('Internal error.')
            except Exception as e:
                raise exceptions.InternalServerError('Unknown error. ' + util.unicodify(e))

    folders_to_download = kwd.get('folder_ids%5B%5D', None)
    if folders_to_download is None:
        folders_to_download = kwd.get('folder_ids', None)
    if folders_to_download is not None:
        folders_to_download = util.listify(folders_to_download)
        current_user_roles = trans.get_current_user_roles()

        def traverse(folder):
            admin = trans.user_is_admin
            rval = []
            for subfolder in folder.active_folders:
                if not admin:
                    can_access, folder_ids = trans.app.security_agent.check_folder_contents(trans.user, current_user_roles, subfolder)
                if (admin or can_access) and not subfolder.deleted:
                    rval.extend(traverse(subfolder))
            for ld in folder.datasets:
                if not admin:
                    can_access = trans.app.security_agent.can_access_dataset(current_user_roles, ld.library_dataset_dataset_association.dataset)
                if (admin or can_access) and not ld.deleted:
                    rval.append(ld)
            return rval

        for encoded_folder_id in folders_to_download:
            folder_id = self.folder_manager.cut_and_decode(trans, encoded_folder_id)
            folder = self.folder_manager.get(trans, folder_id)
            library_datasets.extend(traverse(folder))

    if not library_datasets:
        raise exceptions.RequestParameterMissingException('Request has to contain a list of dataset ids or folder ids to download.')

    if format in ['zip', 'tgz', 'tbz']:
        # Replace punctuation and whitespace in member names with underscores.
        killme = string.punctuation + string.whitespace
        trantab = str.maketrans(killme, '_' * len(killme))
        try:
            outext = 'zip'
            if format == 'zip':
                # Can't use mkstemp - the file must not exist first
                tmpd = tempfile.mkdtemp()
                util.umask_fix_perms(tmpd, trans.app.config.umask, 0o777, self.app.config.gid)
                tmpf = os.path.join(tmpd, 'library_download.' + format)
                if trans.app.config.upstream_gzip:
                    archive = zipfile.ZipFile(tmpf, 'w', zipfile.ZIP_STORED, True)
                else:
                    archive = zipfile.ZipFile(tmpf, 'w', zipfile.ZIP_DEFLATED, True)
                # Alias ZipFile.write() so the zip and tar branches share an .add() interface.
                archive.add = lambda x, y: archive.write(x, y)
            elif format == 'tgz':
                if trans.app.config.upstream_gzip:
                    archive = StreamBall('w|')
                    outext = 'tar'
                else:
                    archive = StreamBall('w|gz')
                    outext = 'tgz'
            elif format == 'tbz':
                archive = StreamBall('w|bz2')
                outext = 'tbz2'
        except (OSError, zipfile.BadZipfile):
            log.exception("Unable to create archive for download")
            raise exceptions.InternalServerError("Unable to create archive for download.")
        except Exception:
            log.exception("Unexpected error in create archive for download")
            raise exceptions.InternalServerError("Unable to create archive for download.")
        composite_extensions = trans.app.datatypes_registry.get_composite_extensions()
        seen = []
        for ld in library_datasets:
            ldda = ld.library_dataset_dataset_association
            ext = ldda.extension
            is_composite = ext in composite_extensions
            path = ""
            parent_folder = ldda.library_dataset.folder
            while parent_folder is not None:
                # Exclude the now-hidden "root folder"
                if parent_folder.parent is None:
                    path = os.path.join(parent_folder.library_root[0].name, path)
                    break
                path = os.path.join(parent_folder.name, path)
                parent_folder = parent_folder.parent
            path += ldda.name
            while path in seen:
                path += '_'
            path = "{path}.{extension}".format(path=path, extension=ldda.extension)
            seen.append(path)
            zpath = os.path.split(path)[-1]  # comes as base_name/fname
            outfname, zpathext = os.path.splitext(zpath)
            if is_composite:
                # need to add all the components from the extra_files_path to the zip
                if zpathext == '':
                    zpath = '%s.html' % zpath  # fake the real nature of the html file
                try:
                    if format == 'zip':
                        archive.add(ldda.dataset.file_name, zpath)  # add the primary of a composite set
                    else:
                        archive.add(ldda.dataset.file_name, zpath, check_file=True)  # add the primary of a composite set
                except IOError:
                    log.exception("Unable to add composite parent %s to temporary library download archive", ldda.dataset.file_name)
                    raise exceptions.InternalServerError("Unable to create archive for download.")
                except ObjectNotFound:
                    log.exception("Requested dataset %s does not exist on the host.", ldda.dataset.file_name)
                    raise exceptions.ObjectNotFound("Requested dataset not found.")
                except Exception as e:
                    log.exception("Unable to add composite parent %s to temporary library download archive", ldda.dataset.file_name)
                    raise exceptions.InternalServerError("Unable to add composite parent to temporary library download archive. " + util.unicodify(e))
                flist = glob.glob(os.path.join(ldda.dataset.extra_files_path, '*.*'))  # glob returns full paths
                for fpath in flist:
                    efp, fname = os.path.split(fpath)
                    if fname:
                        fname = fname.translate(trantab)
                    try:
                        if format == 'zip':
                            archive.add(fpath, fname)
                        else:
                            archive.add(fpath, fname, check_file=True)
                    except IOError:
                        log.exception("Unable to add %s to temporary library download archive %s", fname, outfname)
                        raise exceptions.InternalServerError("Unable to create archive for download.")
                    except ObjectNotFound:
                        log.exception("Requested dataset %s does not exist on the host.", fpath)
                        raise exceptions.ObjectNotFound("Requested dataset not found.")
                    except Exception as e:
                        log.exception("Unable to add %s to temporary library download archive %s", fname, outfname)
                        raise exceptions.InternalServerError("Unable to add dataset to temporary library download archive. " + util.unicodify(e))
            else:
                try:
                    if format == 'zip':
                        archive.add(ldda.dataset.file_name, path)
                    else:
                        archive.add(ldda.dataset.file_name, path, check_file=True)
                except IOError:
                    log.exception("Unable to write %s to temporary library download archive", ldda.dataset.file_name)
                    raise exceptions.InternalServerError("Unable to create archive for download")
                except ObjectNotFound:
                    log.exception("Requested dataset %s does not exist on the host.", ldda.dataset.file_name)
                    raise exceptions.ObjectNotFound("Requested dataset not found.")
                except Exception as e:
                    log.exception("Unable to add %s to temporary library download archive %s", ldda.dataset.file_name, outfname)
                    raise exceptions.InternalServerError("Unknown error. " + util.unicodify(e))
        lname = 'selected_dataset'
        fname = lname.replace(' ', '_') + '_files'
        if format == 'zip':
            archive.close()
            trans.response.set_content_type("application/octet-stream")
            trans.response.headers["Content-Disposition"] = 'attachment; filename="%s.%s"' % (fname, outext)
            archive = util.streamball.ZipBall(tmpf, tmpd)
            archive.wsgi_status = trans.response.wsgi_status()
            archive.wsgi_headeritems = trans.response.wsgi_headeritems()
            return archive.stream
        else:
            trans.response.set_content_type("application/x-tar")
            trans.response.headers["Content-Disposition"] = 'attachment; filename="%s.%s"' % (fname, outext)
            archive.wsgi_status = trans.response.wsgi_status()
            archive.wsgi_headeritems = trans.response.wsgi_headeritems()
            return archive.stream
    elif format == 'uncompressed':
        if len(library_datasets) != 1:
            raise exceptions.RequestParameterInvalidException("You can download only one uncompressed file at once.")
        else:
            single_ld = library_datasets[0]
            ldda = single_ld.library_dataset_dataset_association
            dataset = ldda.dataset
            fStat = os.stat(dataset.file_name)
            trans.response.set_content_type(ldda.get_mime())
            trans.response.headers['Content-Length'] = int(fStat.st_size)
            fname = "{path}.{extension}".format(path=ldda.name, extension=ldda.extension)
            fname = ''.join(c if c in util.FILENAME_VALID_CHARS else '_' for c in fname)[0:150]
            trans.response.headers["Content-Disposition"] = 'attachment; filename="%s"' % fname
            try:
                return open(dataset.file_name, 'rb')
            except Exception:
                raise exceptions.InternalServerError("This dataset contains no content.")
    else:
        raise exceptions.RequestParameterInvalidException("Wrong format parameter specified")
def archive(self, trans, history_id, filename='', format='tgz', dry_run=True, **kwd):
    """
    archive( self, trans, history_id, filename='', format='tgz', dry_run=True, **kwd )
    * GET /api/histories/{history_id}/contents/archive/{id}
    * GET /api/histories/{history_id}/contents/archive/{filename}.{format}
        build and return a compressed archive of the selected history contents

    :type   filename:  string
    :param  filename:  (optional) archive name (defaults to history name)
    :type   dry_run:   boolean
    :param  dry_run:   (optional) if True, return the archive and file paths only
                       as json and not an archive file

    :returns:   archive file for download

    .. note:: this is a volatile endpoint and settings and behavior may change.
    """
    # roughly from: http://stackoverflow.com/a/31976060 (windows, linux)
    invalid_filename_char_regex = re.compile(r'[:<>|\\\/\?\* "]')
    # path format string - dot separator between id and name
    id_name_format = u'{}.{}'

    def name_to_filename(name, max_length=150, replace_with=u'_'):
        # TODO: seems like shortening unicode with [:] would cause unpredictable display strings
        return invalid_filename_char_regex.sub(replace_with, name)[0:max_length]

    # given a set of parents for a dataset (HDCAs, DC, DCEs, etc.) - build a directory structure that
    # (roughly) recreates the nesting in the contents using the parent names and ids
    def build_path_from_parents(parents):
        parent_names = []
        for parent in parents:
            # an HDCA
            if hasattr(parent, 'hid'):
                name = name_to_filename(parent.name)
                parent_names.append(id_name_format.format(parent.hid, name))
            # a DCE
            elif hasattr(parent, 'element_index'):
                name = name_to_filename(parent.element_identifier)
                parent_names.append(id_name_format.format(parent.element_index, name))
            # NOTE: DCs are skipped and use the wrapping DCE info instead
        return parent_names

    # get the history used for the contents query and check for accessibility
    history = self.history_manager.get_accessible(trans.security.decode_id(history_id), trans.user)
    archive_base_name = filename or name_to_filename(history.name)

    # this is the fn applied to each dataset contained in the query
    paths_and_files = []

    def build_archive_files_and_paths(content, *parents):
        archive_path = archive_base_name
        if not self.hda_manager.is_accessible(content, trans.user):
            # if the underlying dataset is not accessible, skip it silently
            return

        content_container_id = content.hid
        content_name = name_to_filename(content.name)
        if parents:
            if hasattr(parents[0], 'element_index'):
                # if content is directly wrapped in a DCE, strip it from parents (and the resulting path)
                # and instead replace the content id and name with the DCE index and identifier
                parent_dce, parents = parents[0], parents[1:]
                content_container_id = parent_dce.element_index
                content_name = name_to_filename(parent_dce.element_identifier)
            # reverse for path from parents: oldest parent first
            archive_path = os.path.join(archive_path, *build_path_from_parents(parents)[::-1])
            # TODO: this is brute force - building the path each time instead of re-using it
            # possibly cache

        # add the name as the last element in the archive path
        content_id_and_name = id_name_format.format(content_container_id, content_name)
        archive_path = os.path.join(archive_path, content_id_and_name)

        # ---- for composite files, we use id and name for a directory and, inside that, ...
        if self.hda_manager.is_composite(content):
            # ...save the 'main' composite file (gen. html)
            paths_and_files.append((content.file_name, os.path.join(archive_path, content.name + '.html')))
            for extra_file in self.hda_manager.extra_files(content):
                extra_file_basename = os.path.basename(extra_file)
                archive_extra_file_path = os.path.join(archive_path, extra_file_basename)
                # ...and one for each file in the composite
                paths_and_files.append((extra_file, archive_extra_file_path))

        # ---- for single files, we add the true extension to id and name and store that single filename
        else:
            # some dataset names can contain their original file extensions, don't repeat
            if not archive_path.endswith('.' + content.extension):
                archive_path += '.' + content.extension
            paths_and_files.append((content.file_name, archive_path))

    # filter the contents that contain datasets using any filters possible from index above and map the datasets
    filter_params = self.parse_filter_params(kwd)
    filters = self.history_contents_filters.parse_filters(filter_params)
    self.history_contents_manager.map_datasets(history, build_archive_files_and_paths, filters=filters)

    # if dry_run, return the structure as json for debugging
    if dry_run == 'True':
        trans.response.headers['Content-Type'] = 'application/json'
        return safe_dumps(paths_and_files)

    # create the archive, add the dataset files, then stream the archive as a download
    archive_type_string = 'w|gz'
    archive_ext = 'tgz'
    if self.app.config.upstream_gzip:
        archive_type_string = 'w|'
        archive_ext = 'tar'
    archive = StreamBall(archive_type_string)
    for file_path, archive_path in paths_and_files:
        archive.add(file_path, archive_path)
    archive_name = '.'.join([archive_base_name, archive_ext])
    trans.response.set_content_type("application/x-tar")
    trans.response.headers["Content-Disposition"] = 'attachment; filename="{}"'.format(archive_name)
    archive.wsgi_status = trans.response.wsgi_status()
    archive.wsgi_headeritems = trans.response.wsgi_headeritems()
    return archive.stream
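# --- Illustrative sketch (not part of the controller) -----------------------
# invalid_filename_char_regex drives every path component written into the
# history archive; a small standalone check of name_to_filename's
# substitute-then-truncate behavior (the sample names are made up):

import re

_invalid = re.compile(r'[:<>|\\\/\?\* "]')

def _name_to_filename(name, max_length=150, replace_with=u'_'):
    # same logic as name_to_filename() inside archive() above
    return _invalid.sub(replace_with, name)[0:max_length]

assert _name_to_filename('my data: v1/final?.txt') == 'my_data__v1_final_.txt'
assert len(_name_to_filename('x' * 300)) == 150  # truncated to max_length
# -----------------------------------------------------------------------------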