def get_mesh_filenames_subset(cloudpath: str, mesh_dir: str, prefix: str):
  prefix = f'{mesh_dir}/{prefix}'
  segids = defaultdict(list)

  cf = CloudFiles(cloudpath)
  meshexpr = re.compile(r'(\d+):(\d+):')
  for filename in cf.list(prefix=prefix):
    filename = os.path.basename(filename)
    # `re.match` anchors at the start of the string (^); `re.search` scans the whole string.
    matches = re.search(meshexpr, filename)
    if not matches:
      continue

    segid, lod = matches.groups()
    segid, lod = int(segid), int(lod)

    if lod != 0:
      continue

    segids[segid].append(filename)

  return segids
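# Usage sketch (not part of the library): the layer path and mesh directory
# below are hypothetical; substitute your own. Collects the level-0 fragment
# filenames for every segid whose name starts with "10".
segids = get_mesh_filenames_subset(
  "gs://my-bucket/my-dataset/seg",  # hypothetical layer path
  mesh_dir="mesh_mip_0_err_40",     # hypothetical mesh directory
  prefix="10",
)
for segid, fragments in segids.items():
  print(segid, len(fragments), "fragments")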
def get_all_skeleton_ids(self):
  cf = CloudFiles(self.src_path)
  skeleton_filenames = cf.list(flat=True)
  skeleton_ids = []
  for skeleton_filename in skeleton_filenames:
    if ":" in skeleton_filename:
      # Fragment, not a merged skeleton
      continue
    skeleton_ids.append(int(skeleton_filename))
  return skeleton_ids
class MeshManifestOperator(OperatorBase):
  """Create mesh manifest files for Neuroglancer visualization."""
  def __init__(self, volume_path: str,
               lod: int = 0,
               name: str = 'mesh-manifest'):
    """
    Parameters
    ------------
    volume_path:
        path to store mesh manifest files
    lod:
        level of detail. we always use 0!
    """
    super().__init__(name=name)
    self.lod = lod
    vol = CloudVolume(volume_path)
    info = vol.info
    assert 'mesh' in info
    self.mesh_path = os.path.join(volume_path, info['mesh'])
    self.storage = CloudFiles(self.mesh_path)

  def __call__(self, prefix: Union[int, str], digits: int) -> None:
    assert int(prefix) < 10**digits
    prefix = str(prefix).zfill(digits)

    id2filenames = defaultdict(list)
    for filename in tqdm(self.storage.list(prefix=prefix),
                         desc='list mesh files'):
      filename = os.path.basename(filename)
      # `re.match` anchors at the start of the string (^); `re.search` scans the whole string.
      matches = re.search(r'(\d+):(\d+):', filename)
      if not matches:
        continue

      seg_id, lod = matches.groups()
      seg_id, lod = int(seg_id), int(lod)
      # level of detail is not currently used and is always 0;
      # this code will need adjustment if other levels are introduced.
      assert lod == self.lod
      id2filenames[seg_id].append(filename)

    for seg_id, frags in tqdm(id2filenames.items(),
                              desc='upload aggregated manifest file'):
      logging.info(f'segment id: {seg_id}')
      logging.info(f'fragments: {frags}')
      self.storage.put_json(
        path=f'{seg_id}:{self.lod}',
        content={"fragments": frags},
      )
      # the last few hundred files will not be uploaded without sleeping!
      sleep(0.01)
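# Usage sketch (hypothetical layer path): one way to cover every segid with the
# operator above is to run it once per single-digit prefix, since every decimal
# segid begins with one of '0'..'9' when digits=1.
op = MeshManifestOperator("gs://my-bucket/my-dataset/seg")  # hypothetical path
for prefix in range(10):
  op(prefix, digits=1)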
def TransferMeshFilesTask(
  src: str, dest: str, prefix: str,
  mesh_dir: Optional[str] = None
):
  cv_src = CloudVolume(src)
  cv_dest = CloudVolume(dest, mesh_dir=mesh_dir)

  cf_src = CloudFiles(cv_src.mesh.meta.layerpath)
  cf_dest = CloudFiles(cv_dest.mesh.meta.layerpath)

  cf_src.transfer_to(cf_dest, paths=cf_src.list(prefix=prefix))
def test_get_generator(num_threads, green):
  from cloudfiles import CloudFiles, exceptions
  path = '/tmp/cloudfiles/gen'
  rmtree(path)
  url = 'file://' + path

  cf = CloudFiles(url, num_threads=num_threads, green=green)

  gen = ((str(i), b'hello world') for i in range(100))
  cf.puts(gen)

  files = cf.get((str(i) for i in range(100)), total=100)

  assert all([f['error'] is None for f in files])
  assert len(files) == 100
  assert all([f['content'] == b'hello world' for f in files])

  fnames = [str(i) for i in range(100)]
  assert sorted(list(cf.list())) == sorted(fnames)

  cf.delete((str(i) for i in range(100)))
  assert list(cf.list()) == []
def test_cli_rm_python(s3, protocol):
  from cloudfiles_cli.cloudfiles_cli import _rm
  from cloudfiles import CloudFiles, exceptions
  test_dir = compute_url(protocol, "cli_rm_python")
  cf = CloudFiles(test_dir)

  N = 100

  def mkfiles():
    cf.delete(cf.list())
    for i in range(N):
      cf[str(i)] = b"hello world"

  def run_rm(path, recursive=False):
    _rm(
      path, recursive=recursive, progress=False,
      parallel=1, block_size=128
    )

  mkfiles()
  run_rm(test_dir, recursive=True)
  assert list(cf) == []

  mkfiles()
  run_rm(test_dir, recursive=False)
  assert len(list(cf)) == N

  mkfiles()
  run_rm(test_dir + "/*")
  print(list(cf))
  assert list(cf) == []

  mkfiles()
  run_rm(test_dir + "/**")
  assert list(cf) == []

  mkfiles()
  run_rm(test_dir + "/0")
  assert set(list(cf)) == set([str(_) for _ in range(1, N)])

  mkfiles()
  run_rm(test_dir + "/1*")
  res = set([str(_) for _ in range(N)])
  res.remove("1")
  for x in range(10, 20):
    res.remove(str(x))
  assert set(list(cf)) == res

  cf.delete(cf.list())
def ls(shortpath, flat, expr, cloudpath):
  """Recursively lists the contents of a directory."""
  cloudpath = normalize_path(cloudpath)

  _, flt, prefix = get_mfp(cloudpath, True)
  epath = extract(cloudpath)
  if len(epath.path) > 0:
    if prefix == "" and flt == False:
      prefix = os.path.basename(cloudpath)
      cloudpath = os.path.dirname(cloudpath)

  flat = flat or flt

  cf = CloudFiles(cloudpath, green=True)
  iterables = []
  if expr:
    # TODO: make this a reality using a parser
    # match "[abc]{2}" or "[123]" meaning generate a 2 character cartesian
    # product of a,b, and c or a 1 character cartesian product of 1,2,3
    # e.g. aa, ab, ac, ba, bb, bc, ca, cb, cc
    #      1, 2, 3
    matches = re.findall(r'\[([a-zA-Z0-9]+)\]', prefix)

    if len(matches):
      iterables.extend([
        cf.list(prefix=pfx, flat=flat)
        for pfx in exprgen(prefix, matches)
      ])
    else:
      iterables.append(cf.list(flat=flat))
  else:
    iterables = [cf.list(prefix=prefix, flat=flat)]

  iterables = itertools.chain(*iterables)

  for pathset in sip(iterables, 1000):
    if not shortpath:
      pathset = [cloudpathjoin(cloudpath, pth) for pth in pathset]
    print("\n".join(pathset))
def load_images(p: str, extension: str = "tif") -> dict:
  """Assumes the directory contains only the images to be stored."""
  files = CloudFiles(p)
  names = []
  for f in sorted(files.list()):
    if extension in f:
      names.append(f)
  # note: this bulk fetch discards its return value; files[k] below
  # issues a fresh request per file.
  files.get(names, raw=True)
  files_bytes = [files[k] for k in names]
  imgs = []
  for f in files_bytes:
    imgs.append(_load_image(f))
  return {"seg": np.asarray(imgs).transpose(2, 1, 0)}
def du(paths, grand_total, summarize, human_readable):
  """Display disk usage statistics."""
  results = []
  for path in paths:
    npath = normalize_path(path)
    if ispathdir(path):
      cf = CloudFiles(npath, green=True)
      results.append(cf.size(cf.list()))
    else:
      cf = CloudFiles(os.path.dirname(npath), green=True)
      sz = cf.size(os.path.basename(npath))
      if sz is None:
        print(f"cloudfiles: du: {path} does not exist")
        return
      results.append({path: sz})

  def SI(val):
    if not human_readable:
      return val

    if val < 1024:
      return f"{val} Bytes"
    elif val < 2**20:
      return f"{(val / 2**10):.2f} KiB"
    elif val < 2**30:
      return f"{(val / 2**20):.2f} MiB"
    elif val < 2**40:
      return f"{(val / 2**30):.2f} GiB"
    elif val < 2**50:
      return f"{(val / 2**40):.2f} TiB"
    elif val < 2**60:
      return f"{(val / 2**50):.2f} PiB"
    else:
      return f"{(val / 2**60):.2f} EiB"

  summary = {}
  for path, res in zip(paths, results):
    summary[path] = sum(res.values())
    if summarize:
      print(f"{SI(summary[path])}\t{path}")

  if not summarize:
    for res in results:
      for pth, size in res.items():
        print(f"{SI(size)}\t{pth}")

  if grand_total:
    print(f"{SI(sum(summary.values()))}\ttotal")
def has_data(self, mip=None):
  """
  Returns whether the specified mip appears to have data
  by testing whether the "folder" exists.

  Returns: bool
    True if data appears to be present at the requested mip,
    False otherwise.
  """
  mip = mip if mip is not None else self.config.mip
  mip = self.meta.to_mip(mip)

  cf = CloudFiles(self.meta.cloudpath, secrets=self.config.secrets)
  key = self.meta.key(mip)
  return first(cf.list(prefix=key)) is not None
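# Usage sketch, assuming this method lives on a CloudVolume image source
# (the dataset path is hypothetical): report which mips actually contain chunks.
cv = CloudVolume("gs://my-bucket/my-dataset/image")  # hypothetical path
available = [mip for mip in range(len(cv.scales)) if cv.image.has_data(mip)]
print("mips with data:", available)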
def MeshManifestPrefixTask(
  layer_path: str,
  prefix: str,
  lod: int = 0,
  mesh_dir: Optional[str] = None
):
  """
  Finalize mesh generation by post-processing chunk fragment
  lists into mesh fragment manifests. These are necessary
  for neuroglancer to know which mesh fragments to download
  for a given segid.

  If we parallelize using single digit prefixes ['0','1',..,'9'],
  all meshes will be correctly processed. But if we use two digit
  prefixes ['10','11',..,'99'], meshes with segids in [0,9] won't
  get processed and need to be handled specifically by creating
  tasks that each process a single mesh: ['0:','1:',..,'9:'].
  """
  cf = CloudFiles(layer_path)
  info = cf.get_json('info')

  if mesh_dir is None and 'mesh' in info:
    mesh_dir = info['mesh']

  prefix = cf.join(mesh_dir, prefix)
  segids = defaultdict(list)

  regexp = re.compile(r'(\d+):(\d+):')
  for filename in cf.list(prefix=prefix):
    filename = os.path.basename(filename)
    # `re.match` anchors at the start of the string (^); `re.search` scans the whole string.
    matches = re.search(regexp, filename)
    if not matches:
      continue

    segid, mlod = matches.groups()
    segid, mlod = int(segid), int(mlod)

    if mlod != lod:
      continue

    segids[segid].append(filename)

  items = (
    (f"{mesh_dir}/{segid}:{lod}", { "fragments": frags })
    for segid, frags in segids.items()
  )
  cf.put_jsons(items)
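# Sketch of the two-digit parallelization scheme described in the docstring
# above (hypothetical driver code, hypothetical layer path): two-digit prefixes
# '10'..'99' plus the colon-terminated single-digit prefixes '0:'..'9:' so that
# segids 0-9 are not skipped.
layer_path = "gs://my-bucket/my-dataset/seg"  # hypothetical path
prefixes = [str(i) for i in range(10, 100)] + [f"{i}:" for i in range(10)]
tasks = (
  partial(MeshManifestPrefixTask, layer_path=layer_path, prefix=prefix)
  for prefix in prefixes
)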
def dsbounds(path, mip):
  """
  Detects the volume bounds and chunk size for
  an unsharded image volume. Useful when there
  is a corrupted info file.
  """
  path = cloudfiles.paths.normalize(path)
  cv = CloudVolume(path, mip=mip)
  cf = CloudFiles(path)

  bboxes = []
  for filename in tqdm(cf.list(prefix=cv.key), desc="Computing Bounds"):
    bboxes.append(Bbox.from_filename(filename))

  bounds = Bbox.expand(*bboxes)
  chunk_size = list(reduce(max2, map(lambda bbox: bbox.size3(), bboxes)))

  print(f"Bounds: {bounds}")
  print(f"Volume: {list(bounds.size3())}")
  print(f"Chunks: {chunk_size}")
class MergeSkeletonTask(scheduling.Task):
  def __init__(self, dst_path, mip, dust_threshold, tick_threshold, prefix=""):
    super().__init__(self)
    self.dst_path = dst_path
    self.cf = CloudFiles(self.dst_path)
    self.mip = mip
    self.dust_threshold = dust_threshold
    self.tick_threshold = tick_threshold
    self.prefix = prefix

  def execute(self):
    corgie_logger.info(f"Merging skeletons at {self.dst_path}")
    fragment_filenames = self.cf.list(prefix=self.prefix, flat=True)
    skeleton_files = self.cf.get(fragment_filenames)
    skeletons = defaultdict(list)
    for skeleton_file in skeleton_files:
      try:
        colon_index = skeleton_file["path"].index(":")
      except ValueError:
        # File is a full skeleton, not a fragment
        continue
      seg_id = skeleton_file["path"][0:colon_index]
      skeleton_fragment = pickle.loads(skeleton_file["content"])
      if not skeleton_fragment.empty():
        skeletons[seg_id].append(skeleton_fragment)

    for seg_id, skeleton_fragments in skeletons.items():
      skeleton = PrecomputedSkeleton.simple_merge(skeleton_fragments).consolidate()
      skeleton = kimimaro.postprocess(
        skeleton, self.dust_threshold, self.tick_threshold
      )
      skeleton.id = int(seg_id)
      self.cf.put(
        path=seg_id,
        content=skeleton.to_precomputed(),
        compress="gzip",
      )
      corgie_logger.info(f"Finished skeleton {seg_id}")
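# Usage sketch (all parameter values are hypothetical): the task is constructed
# with the destination skeleton path, the mip, and the thresholds handed to
# kimimaro.postprocess, then run via execute().
task = MergeSkeletonTask(
  dst_path="gs://my-bucket/my-dataset/seg/skeletons",  # hypothetical path
  mip=2,
  dust_threshold=1000,  # passed to kimimaro.postprocess
  tick_threshold=1300,  # passed to kimimaro.postprocess
  prefix="",            # merge fragments for all segids
)
task.execute()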
def head(paths):
  results = {}
  for path in paths:
    npath = normalize_path(path)
    npath = re.sub(r'\*+$', '', npath)  # strip trailing wildcards
    many, flat, prefix = get_mfp(path, False)

    if many:
      cf = CloudFiles(npath, green=True)
      res = cf.head(cf.list(prefix=prefix, flat=flat))
      results.update(res)
    else:
      cf = CloudFiles(os.path.dirname(npath), green=True)
      results[path] = cf.head(os.path.basename(npath))

  pp = pprint.PrettyPrinter(indent=2)

  if len(paths) == 1 and len(results) == 1:
    val = first(results.values())
    if val is not None:
      print(val)
    else:
      print("cloudfiles: head: File not found: {}".format(paths[0]))
  elif len(paths) > 0:
    pp.pprint(results)
def test_list(s3, protocol):
  from cloudfiles import CloudFiles, exceptions
  url = compute_url(protocol, "list")

  cf = CloudFiles(url, num_threads=5)
  content = b'some_string'
  cf.put('info1', content, compress=None)
  cf.put('info2', content, compress=None)
  cf.put('build/info3', content, compress=None)
  cf.put('level1/level2/info4', content, compress=None)
  cf.put('info5', content, compress='gzip')
  cf.put('info.txt', content, compress=None)

  # time.sleep(1)  # sometimes it takes a moment for google to update the list

  assert set(cf.list(prefix='')) == set([
    'build/info3', 'info1', 'info2', 'level1/level2/info4', 'info5', 'info.txt'
  ])
  assert set(list(cf)) == set(cf.list(prefix=''))

  assert set(cf.list(prefix='inf')) == set(['info1', 'info2', 'info5', 'info.txt'])
  assert set(cf.list(prefix='info1')) == set(['info1'])
  assert set(cf.list(prefix='build')) == set(['build/info3'])
  assert set(cf.list(prefix='build/')) == set(['build/info3'])
  assert set(cf.list(prefix='level1/')) == set(['level1/level2/info4'])
  assert set(cf.list(prefix='nofolder/')) == set([])

  # Tests (1)
  assert set(cf.list(prefix='', flat=True)) == set(['info1', 'info2', 'info5', 'info.txt'])
  assert set(cf.list(prefix='inf', flat=True)) == set(['info1', 'info2', 'info5', 'info.txt'])
  # Tests (2)
  assert set(cf.list(prefix='build', flat=True)) == set([])
  # Tests (3)
  assert set(cf.list(prefix='level1/', flat=True)) == set([])
  assert set(cf.list(prefix='build/', flat=True)) == set(['build/info3'])
  # Tests (4)
  assert set(cf.list(prefix='build/inf', flat=True)) == set(['build/info3'])

  for file_path in ('info1', 'info2', 'build/info3', 'level1/level2/info4', 'info5', 'info.txt'):
    cf.delete(file_path)

  if protocol == 'file':
    rmtree("/tmp/cloudfiles/list")
def create_sharded_multires_mesh_from_unsharded_tasks(
  src: str, dest: str,
  shard_index_bytes=2**13,
  minishard_index_bytes=2**15,
  min_shards: int = 1,
  num_lod: int = 1,
  draco_compression_level: int = 1,
  vertex_quantization_bits: int = 16,
  minishard_index_encoding="gzip",
  mesh_dir: Optional[str] = None,
) -> Iterator[MultiResShardedMeshMergeTask]:

  configure_multires_info(
    dest,
    vertex_quantization_bits,
    mesh_dir
  )

  cv_src = CloudVolume(src)
  cf = CloudFiles(cv_src.mesh.meta.layerpath)

  all_labels = []
  SEGID_RE = re.compile(r'(\d+):0(?:\.gz|\.br|\.zstd)?$')
  for path in cf.list():
    match = SEGID_RE.search(path)
    if match is None:
      continue
    (segid,) = match.groups()
    all_labels.append(int(segid))

  (shard_bits, minishard_bits, preshift_bits) = \
    compute_shard_params_for_hashed(
      num_labels=len(all_labels),
      shard_index_bytes=int(shard_index_bytes),
      minishard_index_bytes=int(minishard_index_bytes),
      min_shards=int(min_shards),
    )

  cv_dest = CloudVolume(dest, mesh_dir=mesh_dir)
  cv_dest.mesh.meta.info["mip"] = cv_src.mesh.meta.mip
  cv_dest.commit_info()

  spec = ShardingSpecification(
    type='neuroglancer_uint64_sharded_v1',
    preshift_bits=preshift_bits,
    hash='murmurhash3_x86_128',
    minishard_bits=minishard_bits,
    shard_bits=shard_bits,
    minishard_index_encoding=minishard_index_encoding,
    data_encoding="raw",  # draco encoded meshes
  )

  cv_dest.mesh.meta.info['sharding'] = spec.to_dict()
  cv_dest.mesh.meta.commit_info()

  cv_dest = CloudVolume(dest, mesh_dir=mesh_dir)

  # perf: ~66.5k hashes/sec on M1 ARM64
  shardfn = lambda lbl: cv_dest.mesh.reader.spec.compute_shard_location(lbl).shard_number

  shard_labels = defaultdict(list)
  for label in tqdm(all_labels, desc="Hashes"):
    shard_labels[shardfn(label)].append(label)
  del all_labels

  cf = CloudFiles(cv_dest.mesh.meta.layerpath, progress=True)
  files = (
    (str(shardno) + '.labels', labels)
    for shardno, labels in shard_labels.items()
  )
  cf.put_jsons(
    files, compress="gzip",
    cache_control="no-cache",
    total=len(shard_labels)
  )

  cv_dest.provenance.processing.append({
    'method': {
      'task': 'MultiResShardedFromUnshardedMeshMergeTask',
      'src': src,
      'dest': dest,
      'num_lod': num_lod,
      'vertex_quantization_bits': vertex_quantization_bits,
      'preshift_bits': preshift_bits,
      'minishard_bits': minishard_bits,
      'shard_bits': shard_bits,
      'mesh_dir': mesh_dir,
      'draco_compression_level': draco_compression_level,
    },
    'by': operator_contact(),
    'date': strftime('%Y-%m-%d %H:%M %Z'),
  })
  cv_dest.commit_provenance()

  return [
    partial(MultiResShardedFromUnshardedMeshMergeTask,
      src=src,
      dest=dest,
      shard_no=shard_no,
      num_lod=num_lod,
      mesh_dir=mesh_dir,
      draco_compression_level=draco_compression_level,
    )
    for shard_no in shard_labels.keys()
  ]
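# Usage sketch (hypothetical layer paths): the function returns a list of
# argument-bound partials; in the simplest case they can be executed in-process
# by calling each one, though in practice they would normally be handed to a
# task queue.
tasks = create_sharded_multires_mesh_from_unsharded_tasks(
  src="gs://my-bucket/unsharded-seg",  # hypothetical path
  dest="gs://my-bucket/sharded-seg",   # hypothetical path
)
for task in tasks:
  task()  # each call merges one shard's worth of meshes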
def DeleteSkeletonFilesTask(
  cloudpath: str,
  prefix: str,
  skel_dir: Optional[str] = None
):
  cv = CloudVolume(cloudpath, skel_dir=skel_dir)
  cf = CloudFiles(cv.skeleton.meta.layerpath)
  cf.delete(cf.list(prefix=prefix))
def create_sharded_skeletons_from_unsharded_tasks(
  src: str, dest: str,
  shard_index_bytes=2**13,
  minishard_index_bytes=2**15,
  min_shards: int = 1,
  minishard_index_encoding='gzip',
  data_encoding='gzip',
  skel_dir: Optional[str] = None,
) -> Iterator[ShardedFromUnshardedSkeletonMergeTask]:
  cv_src = CloudVolume(src)
  cv_src.mip = cv_src.skeleton.meta.mip

  cf = CloudFiles(cv_src.skeleton.meta.layerpath)

  all_labels = []
  SEGID_RE = re.compile(r'(\d+)(?:\.gz|\.br|\.zstd)?$')
  for path in cf.list():
    match = SEGID_RE.search(path)
    if match is None:
      continue
    (segid,) = match.groups()
    all_labels.append(int(segid))

  cv_dest = CloudVolume(dest, skel_dir=skel_dir)
  cv_dest.skeleton.meta.info = copy.deepcopy(cv_src.skeleton.meta.info)
  cv_dest.skeleton.meta.info["vertex_attributes"] = [
    attr for attr in cv_dest.skeleton.meta.info["vertex_attributes"]
    if attr["data_type"] in ("float32", "float64")
  ]

  (shard_bits, minishard_bits, preshift_bits) = \
    compute_shard_params_for_hashed(
      num_labels=len(all_labels),
      shard_index_bytes=int(shard_index_bytes),
      minishard_index_bytes=int(minishard_index_bytes),
      min_shards=int(min_shards),
    )

  spec = ShardingSpecification(
    type='neuroglancer_uint64_sharded_v1',
    preshift_bits=preshift_bits,
    hash='murmurhash3_x86_128',
    minishard_bits=minishard_bits,
    shard_bits=shard_bits,
    minishard_index_encoding=minishard_index_encoding,
    data_encoding=data_encoding,
  )

  cv_dest.skeleton.meta.info['sharding'] = spec.to_dict()
  cv_dest.skeleton.meta.commit_info()

  cv_dest = CloudVolume(dest, skel_dir=skel_dir)

  # perf: ~66.5k hashes/sec on M1 ARM64
  shardfn = lambda lbl: cv_dest.skeleton.reader.spec.compute_shard_location(lbl).shard_number

  shard_labels = defaultdict(list)
  for label in tqdm(all_labels, desc="Hashes"):
    shard_labels[shardfn(label)].append(label)
  del all_labels

  cf = CloudFiles(cv_dest.skeleton.meta.layerpath, progress=True)
  files = (
    (str(shardno) + '.labels', labels)
    for shardno, labels in shard_labels.items()
  )
  cf.put_jsons(
    files, compress="gzip",
    cache_control="no-cache",
    total=len(shard_labels)
  )

  cv_dest.provenance.processing.append({
    'method': {
      'task': 'ShardedFromUnshardedSkeletonMergeTask',
      'src': src,
      'dest': dest,
      'preshift_bits': preshift_bits,
      'minishard_bits': minishard_bits,
      'shard_bits': shard_bits,
      'skel_dir': skel_dir,
    },
    'by': operator_contact(),
    'date': strftime('%Y-%m-%d %H:%M %Z'),
  })
  cv_dest.commit_provenance()

  return [
    partial(
      ShardedFromUnshardedSkeletonMergeTask,
      src=src,
      dest=dest,
      shard_no=shard_no,
      skel_dir=skel_dir,
    )
    for shard_no in shard_labels.keys()
  ]
def DeleteMeshFilesTask(
  cloudpath: str,
  prefix: str,
  mesh_dir: Optional[str] = None
):
  cv = CloudVolume(cloudpath, mesh_dir=mesh_dir)
  cf = CloudFiles(cv.mesh.meta.layerpath)
  cf.delete(cf.list(prefix=prefix))
def get_filenames(self):
  prefix = '{}/{}'.format(self.vol.skeleton.path, self.prefix)
  cf = CloudFiles(self.cloudpath, progress=True)
  return [_ for _ in cf.list(prefix=prefix)]