def download_chunk(
  meta, cache, cloudpath, mip, filename,
  fill_missing, enable_cache, compress_cache,
  secrets, background_color
):
  (file,) = CloudFiles(cloudpath, secrets=secrets).get([filename], raw=True)
  content = file['content']

  if enable_cache:
    cache_content = next(compression.transcode(file, compress_cache))['content']
    CloudFiles('file://' + cache.path).put(
      path=filename,
      content=(cache_content or b''),
      content_type=content_type(meta.encoding(mip)),
      compress=compress_cache,
      raw=bool(cache_content),
    )
    del cache_content

  if content is not None:
    content = compression.decompress(content, file['compress'])

  bbox = Bbox.from_filename(filename) # possible off by one error w/ exclusive bounds
  img3d = decode(
    meta, filename, content, fill_missing, mip,
    background_color=background_color
  )
  return img3d, bbox
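# A minimal usage sketch of the fetch-and-decompress pattern download_chunk wraps.
# The cloudpath and filename arguments are illustrative, not taken from the source;
# only CloudFiles.get and compression.decompress are assumed here.
def fetch_raw_chunk(cloudpath, filename):
  from cloudfiles import CloudFiles, compression
  # raw=True keeps the stored (possibly compressed) bytes so the caller can
  # cache them verbatim or decompress immediately.
  (file,) = CloudFiles(cloudpath).get([filename], raw=True)
  if file['content'] is None:
    return None # missing chunk; download_chunk handles this case via fill_missing
  return compression.decompress(file['content'], file['compress'])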
def upload(
  self, files, compress, cache_control,
  compress_level=None, content_type=None, progress=None
):
  files = list(files)
  progress = progress if progress is not None else self.config.progress

  cf = CloudFiles(
    self.meta.cloudpath,
    progress=progress,
    secrets=self.config.secrets
  )
  files = list(
    compression.transcode(files, encoding=compress, level=compress_level)
  )
  cf.puts(
    files,
    compress=compress,
    compression_level=compress_level,
    cache_control=cache_control,
    content_type=content_type,
    raw=True,
  )

  if self.enabled:
    self.put(files, compress=compress)
    cf_cache = CloudFiles(
      'file://' + self.path,
      progress=('to Cache' if progress else None)
    )
    cf_cache.puts(files, compress=compress, raw=True)
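# Hedged usage sketch (bucket path and file contents are illustrative): the
# simpler alternative to pre-transcoding is to hand CloudFiles.puts plain
# (path, content) pairs and let it compress at write time.
def example_put_metadata_files():
  from cloudfiles import CloudFiles
  cf = CloudFiles('gs://example-bucket/dataset') # hypothetical destination
  cf.puts(
    [('info', b'{"layer_type": "image"}'), ('provenance', b'{}')],
    compress='gzip',          # compressed by puts itself, so raw=True is not needed
    cache_control='no-cache',
  )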
def test_transcode(dest_encoding):
  from cloudfiles import CloudFiles, compression

  base_text = b'hello world'
  encodings = [None, "gzip", "br", "zstd"]

  varied_texts = []

  ans = compression.compress(base_text, dest_encoding)

  for i in range(200):
    src_encoding = encodings[i % len(encodings)]
    varied_texts.append({
      "path": str(i),
      "content": compression.compress(base_text, src_encoding),
      "raw": src_encoding is not None,
      "compress": src_encoding,
    })

  transcoded = (
    x['content'] for x in compression.transcode(varied_texts, dest_encoding)
  )
  for content in transcoded:
    assert content == ans
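# The test receives dest_encoding as a parameter; one plausible way to drive it
# across every supported encoding (an assumption about the test harness, not
# taken from the source) is a pytest parametrization wrapper:
import pytest

@pytest.mark.parametrize("dest_encoding", [None, "gzip", "br", "zstd"])
def test_transcode_all_encodings(dest_encoding):
  test_transcode(dest_encoding)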
def download(self, paths, compress=None, progress=None):
  """
  Download the provided paths, but grab them from cache first
  if they are present and the cache is enabled.

  Returns: { filename: content, ... }
  """
  if len(paths) == 0:
    return {}

  progress = nvl(progress, self.config.progress)
  compress = nvl(compress, self.compress, self.config.compress)

  locs = self.compute_data_locations(paths)
  locs['remote'] = [ str(x) for x in locs['remote'] ]

  fragments = {}
  if self.enabled:
    fragments = self.get(locs['local'], progress=progress)

    # fixes e.g. mesh\info -> mesh/info on Windows
    if self.meta.path.protocol != 'file' and os.path.sep == '\\':
      fragments = {
        "/".join(key.split('\\')): val
        for key, val in fragments.items()
      }

  cf = CloudFiles(
    self.meta.cloudpath,
    progress=progress,
    secrets=self.config.secrets,
    parallel=self.config.parallel,
  )
  remote_fragments = cf.get(locs['remote'], raw=True)

  for frag in remote_fragments:
    if frag['error'] is not None:
      raise frag['error']

  if self.enabled:
    cf_cache = CloudFiles(
      'file://' + self.path,
      progress=('to Cache' if progress else None)
    )
    cf_cache.puts(
      compression.transcode(
        (frag for frag in remote_fragments if frag['content'] is not None),
        encoding=compress, progress=progress, in_place=False
      ),
      compress=compress,
      raw=True,
    )

  remote_fragments_dict = {}
  while remote_fragments:
    res = remote_fragments.pop()
    remote_fragments_dict[res['path']] = compression.decompress(
      res['content'], res['compress']
    )

  fragments.update(remote_fragments_dict)
  return fragments
def download(self, paths, compress=None, progress=None):
  """
  Download the provided paths, but grab them from cache first
  if they are present and the cache is enabled.

  Returns: { filename: content, ... }
  """
  if len(paths) == 0:
    return {}

  progress = nvl(progress, self.config.progress)
  compress = nvl(compress, self.compress, self.config.compress)

  locs = self.compute_data_locations(paths)
  locs['remote'] = [ str(x) for x in locs['remote'] ]

  fragments = {}
  if self.enabled:
    fragments = self.get(locs['local'], progress=progress)

  cf = CloudFiles(
    self.meta.cloudpath,
    progress=progress,
    secrets=self.config.secrets
  )
  remote_fragments = cf.get(locs['remote'], raw=True)

  for frag in remote_fragments:
    if frag['error'] is not None:
      raise frag['error']

  if self.enabled:
    cf_cache = CloudFiles(
      'file://' + self.path,
      progress=('to Cache' if progress else None)
    )
    cf_cache.puts(
      compression.transcode(
        (frag for frag in remote_fragments if frag['content'] is not None),
        encoding=compress, progress=progress, in_place=False
      ),
      compress=compress,
      raw=True,
    )

  remote_fragments = {
    res['path']: compression.decompress(res['content'], res['compress'])
    for res in remote_fragments
  }

  fragments.update(remote_fragments)
  return fragments
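# Hedged usage sketch for either download() variant above. The service instance
# and paths are hypothetical; the return shape ({ filename: bytes }) follows the
# docstring, with content already decompressed whether it came from the local
# cache or the remote store.
def example_read_metadata(cache_service):
  fragments = cache_service.download(['info', 'provenance'])
  info_bytes = fragments['info']
  provenance_bytes = fragments['provenance']
  return info_bytes, provenance_bytes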
def _cp_single(ctx, source, destination, recursive, compression, progress, block_size):
  use_stdin = (source == '-')
  use_stdout = (destination == '-')

  if use_stdout:
    progress = False # can't have the progress bar interfering

  nsrc = normalize_path(source)
  ndest = normalize_path(destination)

  # For more information see:
  # https://cloud.google.com/storage/docs/gsutil/commands/cp#how-names-are-constructed
  # Try to follow cp rules. If the directory exists,
  # copy the base source directory into the dest directory
  # If the directory does not exist, then we copy into
  # the dest directory.
  # Both x* and x** should not copy the base directory
  if recursive and nsrc[-1] != "*":
    if CloudFiles(ndest).isdir():
      if nsrc[-1] == '/':
        nsrc = nsrc[:-1]
      ndest = cloudpathjoin(ndest, os.path.basename(nsrc))

  ctx.ensure_object(dict)
  parallel = int(ctx.obj.get("parallel", 1))

  issrcdir = ispathdir(source) and use_stdin == False
  isdestdir = ispathdir(destination)

  srcpath = nsrc if issrcdir else os.path.dirname(nsrc)
  many, flat, prefix = get_mfp(nsrc, recursive)

  if issrcdir and not many:
    print(f"cloudfiles: {source} is a directory (not copied).")
    return

  xferpaths = os.path.basename(nsrc)
  if use_stdin:
    xferpaths = sys.stdin.readlines()
    xferpaths = [ x.replace("\n", "") for x in xferpaths ]
    prefix = os.path.commonprefix(xferpaths)
    xferpaths = [ x.replace(prefix, "") for x in xferpaths ]
    srcpath = cloudpathjoin(srcpath, prefix)
  elif many:
    xferpaths = CloudFiles(srcpath, green=True).list(prefix=prefix, flat=flat)

  destpath = ndest
  if isinstance(xferpaths, str):
    destpath = ndest if isdestdir else os.path.dirname(ndest)
  elif not isdestdir:
    if os.path.exists(ndest.replace("file://", "")):
      print(f"cloudfiles: {ndest} is not a directory (not copied).")
      return

  if compression == "same":
    compression = None
  elif compression == "none":
    compression = False

  if not isinstance(xferpaths, str):
    if parallel == 1:
      _cp(srcpath, destpath, compression, progress, block_size, xferpaths)
      return

    total = None
    try:
      total = len(xferpaths)
    except TypeError:
      pass

    if use_stdout:
      fn = partial(_cp_stdout, srcpath)
    else:
      fn = partial(_cp, srcpath, destpath, compression, False, block_size)

    with tqdm(desc="Transferring", total=total, disable=(not progress)) as pbar:
      with pathos.pools.ProcessPool(parallel) as executor:
        for _ in executor.imap(fn, sip(xferpaths, block_size)):
          pbar.update(block_size)
  else:
    cfsrc = CloudFiles(srcpath, green=True, progress=progress)
    if not cfsrc.exists(xferpaths):
      print(
        f"cloudfiles: source path not found: {cfsrc.abspath(xferpaths).replace('file://','')}"
      )
      return

    if use_stdout:
      _cp_stdout(srcpath, xferpaths)
      return

    downloaded = cfsrc.get(xferpaths, raw=True)
    if compression is not None:
      downloaded = transcode(downloaded, compression, in_place=True)

    cfdest = CloudFiles(destpath, green=True, progress=progress)
    if isdestdir:
      cfdest.put(os.path.basename(nsrc), downloaded, raw=True)
    else:
      cfdest.put(os.path.basename(ndest), downloaded, raw=True)
def _cp_single(ctx, source, destination, recursive, compression, progress, block_size):
  use_stdin = (source == '-')

  nsrc = normalize_path(source)
  ndest = normalize_path(destination)

  ctx.ensure_object(dict)
  parallel = int(ctx.obj.get("parallel", 1))

  issrcdir = ispathdir(source) and use_stdin == False
  isdestdir = ispathdir(destination)

  srcpath = nsrc if issrcdir else os.path.dirname(nsrc)
  many, flat, prefix = get_mfp(nsrc, recursive)

  if issrcdir and not many:
    print(f"cloudfiles: {source} is a directory (not copied).")
    return

  xferpaths = os.path.basename(nsrc)
  if use_stdin:
    xferpaths = sys.stdin.readlines()
    xferpaths = [ x.replace("\n", "") for x in xferpaths ]
    prefix = os.path.commonprefix(xferpaths)
    xferpaths = [ x.replace(prefix, "") for x in xferpaths ]
    srcpath = cloudpathjoin(srcpath, prefix)
  elif many:
    xferpaths = CloudFiles(srcpath, green=True).list(prefix=prefix, flat=flat)

  destpath = ndest
  if isinstance(xferpaths, str):
    destpath = ndest if isdestdir else os.path.dirname(ndest)
  elif not isdestdir:
    if os.path.exists(ndest.replace("file://", "")):
      print(f"cloudfiles: {ndest} is not a directory (not copied).")
      return

  if compression == "same":
    compression = None
  elif compression == "none":
    compression = False

  if not isinstance(xferpaths, str):
    if parallel == 1:
      _cp(srcpath, destpath, compression, progress, block_size, xferpaths)
      return

    total = None
    try:
      total = len(xferpaths)
    except TypeError:
      pass

    fn = partial(_cp, srcpath, destpath, compression, False, block_size)
    with tqdm(desc="Transferring", total=total, disable=(not progress)) as pbar:
      with pathos.pools.ProcessPool(parallel) as executor:
        for _ in executor.imap(fn, sip(xferpaths, block_size)):
          pbar.update(block_size)
  else:
    cfsrc = CloudFiles(srcpath, green=True, progress=progress)
    if not cfsrc.exists(xferpaths):
      print(
        f"cloudfiles: source path not found: {cfsrc.abspath(xferpaths).replace('file://','')}"
      )
      return

    downloaded = cfsrc.get(xferpaths, raw=True)
    if compression is not None:
      downloaded = transcode(downloaded, compression, in_place=True)

    cfdest = CloudFiles(destpath, green=True, progress=progress)
    if isdestdir:
      cfdest.put(os.path.basename(nsrc), downloaded, raw=True)
    else:
      cfdest.put(os.path.basename(ndest), downloaded, raw=True)
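# Hedged sketch of what the "many files" branch above boils down to: list keys
# under a prefix and copy them in fixed-size batches. The bucket paths are
# illustrative, and the sip import path is an assumption about the package layout.
def example_batched_copy():
  from cloudfiles import CloudFiles
  from cloudfiles.lib import sip # batches an iterable into lists of size n (assumed location)

  cfsrc = CloudFiles('gs://example-bucket/skeletons', green=True)
  cfdest = CloudFiles('file:///tmp/skeletons')
  for batch in sip(cfsrc.list(flat=True), 128):
    files = cfsrc.get(batch) # decompressed dicts: { 'path', 'content', ... }
    cfdest.puts(
      [ (f['path'], f['content']) for f in files ],
      compress='gzip', # recompress on write at the destination
    )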
def transfer_to(
  self, cloudpath, bbox, mip,
  block_size=None, compress=True, compress_level=None
):
  """
  Transfer files from one storage location to another, bypassing
  volume painting. This enables using a single CloudVolume instance
  to transfer big volumes. In some cases, gsutil or aws s3 cli tools
  may be more appropriate. This method is provided for convenience. It
  may be optimized for better performance over time as demand requires.

  cloudpath (str): path to storage layer
  bbox (Bbox object): ROI to transfer
  mip (int): resolution level
  block_size (int): number of file chunks to transfer per I/O batch.
  compress (bool): Set to False to upload as uncompressed
  """
  from cloudvolume import CloudVolume

  if mip is None:
    mip = self.config.mip

  if self.is_sharded(mip):
    raise exceptions.UnsupportedFormatError(
      f"Sharded sources are not supported. got: {self.meta.cloudpath}"
    )

  bbox = Bbox.create(bbox, self.meta.bounds(mip))
  realized_bbox = bbox.expand_to_chunk_size(
    self.meta.chunk_size(mip), offset=self.meta.voxel_offset(mip)
  )
  realized_bbox = Bbox.clamp(realized_bbox, self.meta.bounds(mip))

  if bbox != realized_bbox:
    raise exceptions.AlignmentError(
      "Unable to transfer non-chunk aligned bounding boxes. Requested: {}, Realized: {}".format(
        bbox, realized_bbox
      )
    )

  default_block_size_MB = 50 # MB
  chunk_MB = (
    self.meta.chunk_size(mip).rectVolume()
    * np.dtype(self.meta.dtype).itemsize
    * self.meta.num_channels
  )
  if self.meta.layer_type == 'image':
    # kind of an average guess for some EM datasets, have seen up to 1.9x and as low as 1.1
    # affinities are also images, but have very different compression ratios. e.g. 3x for kempressed
    chunk_MB /= 1.3
  else: # segmentation
    chunk_MB /= 100.0 # compression ratios between 80 and 800....
  chunk_MB /= 1024.0 * 1024.0

  if block_size:
    step = block_size
  else:
    step = int(default_block_size_MB // chunk_MB) + 1

  try:
    destvol = CloudVolume(cloudpath, mip=mip)
  except exceptions.InfoUnavailableError:
    destvol = CloudVolume(
      cloudpath, mip=mip,
      info=self.meta.info,
      provenance=self.meta.provenance.serialize()
    )
    destvol.commit_info()
    destvol.commit_provenance()
  except exceptions.ScaleUnavailableError:
    destvol = CloudVolume(cloudpath)
    for i in range(len(destvol.scales) + 1, len(self.meta.scales)):
      destvol.scales.append(self.meta.scales[i])
    destvol.commit_info()
    destvol.commit_provenance()

  if destvol.image.is_sharded(mip):
    raise exceptions.UnsupportedFormatError(
      f"Sharded destinations are not supported. got: {destvol.cloudpath}"
    )

  num_blocks = np.ceil(
    self.meta.bounds(mip).volume() / self.meta.chunk_size(mip).rectVolume()
  ) / step
  num_blocks = int(np.ceil(num_blocks))

  cloudpaths = chunknames(
    bbox, self.meta.bounds(mip),
    self.meta.key(mip), self.meta.chunk_size(mip),
    protocol=self.meta.path.protocol
  )

  pbar = tqdm(
    desc='Transferring Blocks of {} Chunks'.format(step),
    unit='blocks',
    disable=(not self.config.progress),
    total=num_blocks,
  )

  cfsrc = CloudFiles(self.meta.cloudpath, secrets=self.config.secrets)
  cfdest = CloudFiles(cloudpath)

  def check(files):
    errors = [
      file for file in files
      if (file['content'] is None or file['error'] is not None)
    ]
    if errors:
      error_paths = [ f['path'] for f in errors ]
      raise exceptions.EmptyFileException(
        "{} were empty or had IO errors.".format(", ".join(error_paths))
      )
    return files

  with pbar:
    for srcpaths in sip(cloudpaths, step):
      files = check(cfsrc.get(srcpaths, raw=True))
      cfdest.puts(
        compression.transcode(
          files, encoding=compress, level=compress_level, in_place=True
        ),
        compress=compress,
        content_type=tx.content_type(destvol),
        raw=True,
      )
      pbar.update()
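# Hedged usage sketch: how transfer_to is typically reached from a user-facing
# CloudVolume instance. The layer paths are illustrative and the frontend is
# assumed to forward its arguments to the method above.
def example_transfer():
  from cloudvolume import CloudVolume
  vol = CloudVolume('gs://example-bucket/source-layer', mip=0, progress=True)
  # Using the full dataset bounds guarantees the chunk-alignment check passes,
  # since expanding and then clamping vol.bounds returns the same box.
  vol.transfer_to('gs://example-bucket/dest-layer', vol.bounds, mip=0)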