Example #1
def get_mesh_filenames_subset(
  cloudpath:str, mesh_dir:str, prefix:str
):
  prefix = f'{mesh_dir}/{prefix}'
  segids = defaultdict(list)

  cf = CloudFiles(cloudpath)
  meshexpr = re.compile(r'(\d+):(\d+):')
  for filename in cf.list(prefix=prefix):
    filename = os.path.basename(filename)
    # `match` anchors at the beginning (^); `search` scans the whole string for a match anywhere
    matches = re.search(meshexpr, filename)

    if not matches:
      continue

    segid, lod = matches.groups()
    segid, lod = int(segid), int(lod)

    if lod != 0:
      continue

    segids[segid].append(filename)

  return segids
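
A minimal usage sketch for the function above; the cloud path and mesh directory names are hypothetical placeholders:

segids = get_mesh_filenames_subset(
  'gs://my-bucket/my-seg', mesh_dir='mesh_mip_0', prefix='10'
)
for segid, fragment_files in segids.items():
  print(segid, len(fragment_files), 'fragment files')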
Example #2
    def get_all_skeleton_ids(self):
        cf = CloudFiles(self.src_path)
        skeleton_filenames = cf.list(flat=True)
        skeleton_ids = []
        for skeleton_filename in skeleton_filenames:
            if ":" in skeleton_filename:
                # Filenames containing ':' are skeleton fragments; skip them.
                continue
            skeleton_ids.append(int(skeleton_filename))
        return skeleton_ids
Example #3
class MeshManifestOperator(OperatorBase):
    """Create mesh manifest files for Neuroglancer visualization."""
    def __init__(self,
                 volume_path: str,
                 lod: int = 0,
                 name: str = 'mesh-manifest'):
        """
        Parameters
        ----------
        volume_path:
            path in which to store mesh manifest files
        lod:
            level of detail; we always use 0
        name:
            operator name
        """
        super().__init__(name=name)
        self.lod = lod
        vol = CloudVolume(volume_path)
        info = vol.info
        assert 'mesh' in info
        self.mesh_path = os.path.join(volume_path, info['mesh'])
        self.storage = CloudFiles(self.mesh_path)

    def __call__(self, prefix: Union[int, str], digits: int) -> None:
        assert int(prefix) < 10**digits
        prefix = str(prefix).zfill(digits)

        id2filenames = defaultdict(list)
        for filename in tqdm(self.storage.list(prefix=prefix),
                             desc='list mesh files'):

            filename = os.path.basename(filename)
            # `match` anchors at the beginning (^); `search` scans the whole string for a match anywhere
            matches = re.search(r'(\d+):(\d+):', filename)

            if not matches:
                continue

            seg_id, lod = matches.groups()
            seg_id, lod = int(seg_id), int(lod)
            # level of detail is currently unused and always 0;
            # the code will need adjustment if we start using variants
            assert lod == self.lod
            id2filenames[seg_id].append(filename)

        for seg_id, frags in tqdm(id2filenames.items(),
                                  desc='upload aggregated manifest file'):

            logging.info(f'segment id: {seg_id}')
            logging.info(f'fragments: {frags}')
            self.storage.put_json(
                path=f'{seg_id}:{self.lod}',
                content={"fragments": frags},
            )
            # the last few hundred files will not be uploaded without sleeping!
            sleep(0.01)
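
A brief usage sketch; the volume path is a hypothetical placeholder. The operator is built once, then called per prefix to aggregate fragment filenames into manifest files:

op = MeshManifestOperator('gs://my-bucket/my-seg')  # hypothetical path
op(prefix=12, digits=2)  # processes mesh files whose names start with '12'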
Example #4
def TransferMeshFilesTask(src: str,
                          dest: str,
                          prefix: str,
                          mesh_dir: Optional[str] = None):
    cv_src = CloudVolume(src)
    cv_dest = CloudVolume(dest, mesh_dir=mesh_dir)

    cf_src = CloudFiles(cv_src.mesh.meta.layerpath)
    cf_dest = CloudFiles(cv_dest.mesh.meta.layerpath)

    cf_src.transfer_to(cf_dest, paths=cf_src.list(prefix=prefix))
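
A usage sketch with placeholder cloud paths; it copies every mesh file whose name starts with the given prefix from the source layer to the destination layer:

TransferMeshFilesTask(
    src='gs://my-bucket/src-seg',    # hypothetical source volume
    dest='gs://my-bucket/dest-seg',  # hypothetical destination volume
    prefix='10',
    mesh_dir='mesh_mip_0',
)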
Example #5
def test_get_generator(num_threads, green):
  from cloudfiles import CloudFiles, exceptions
  path = '/tmp/cloudfiles/gen'
  rmtree(path)
  url = 'file://' + path

  cf = CloudFiles(url, num_threads=num_threads, green=green)

  gen = ( (str(i), b'hello world') for i in range(100) )
  cf.puts(gen)

  files = cf.get(( str(i) for i in range(100) ), total=100)

  assert all([ f['error'] is None for f in files ])
  assert len(files) == 100
  assert all([ f['content'] == b'hello world' for f in files ])

  fnames = [ str(i) for i in range(100) ]
  assert sorted(list(cf.list())) == sorted(fnames)

  cf.delete(( str(i) for i in range(100) ))
  assert list(cf.list()) == []
Example #6
def test_cli_rm_python(s3, protocol):
  from cloudfiles_cli.cloudfiles_cli import _rm
  from cloudfiles import CloudFiles, exceptions

  test_dir = compute_url(protocol, "cli_rm_python")
  cf = CloudFiles(test_dir)

  N = 100

  def mkfiles():
    cf.delete(cf.list())
    for i in range(N):
      cf[str(i)] = b"hello world"

  def run_rm(path, recursive=False):
    _rm(
      path, recursive=recursive, progress=False, 
      parallel=1, block_size=128
    )


  mkfiles()
  run_rm(test_dir, recursive=True)
  assert list(cf) == []

  mkfiles()
  run_rm(test_dir, recursive=False)
  assert len(list(cf)) == N

  mkfiles()
  run_rm(test_dir + "/*")
  print(list(cf))
  assert list(cf) == []

  mkfiles()
  run_rm(test_dir + "/**")
  assert list(cf) == []

  mkfiles()
  run_rm(test_dir + "/0")
  assert set(list(cf)) == set([ str(_) for _ in range(1, N) ])

  mkfiles()
  run_rm(test_dir + "/1*")
  res = set([ str(_) for _ in range(N) ])
  res.remove("1")
  for x in range(10, 20):
    res.remove(str(x))
  assert set(list(cf)) == res

  cf.delete(cf.list())
Example #7
def ls(shortpath, flat, expr, cloudpath):
    """Recursively lists the contents of a directory."""
    cloudpath = normalize_path(cloudpath)

    _, flt, prefix = get_mfp(cloudpath, True)
    epath = extract(cloudpath)
    if len(epath.path) > 0:
        if prefix == "" and flt == False:
            prefix = os.path.basename(cloudpath)
        cloudpath = os.path.dirname(cloudpath)

    flat = flat or flt

    cf = CloudFiles(cloudpath, green=True)
    iterables = []
    if expr:
        # TODO: make this a reality using a parser
        # Match "[abc]{2}" or "[123]", meaning: generate a 2-character cartesian
        # product of a, b, and c, or a 1-character cartesian product of 1, 2, 3.
        # e.g. aa, ab, ac, ba, bb, bc, ca, cb, cc
        #      1, 2, 3
        matches = re.findall(r'\[([a-zA-Z0-9]+)\]', prefix)

        if len(matches):
            iterables.extend([
                cf.list(prefix=pfx, flat=flat)
                for pfx in exprgen(prefix, matches)
            ])
        else:
            iterables.append(cf.list(flat=flat))
    else:
        iterables = [cf.list(prefix=prefix, flat=flat)]

    iterables = itertools.chain(*iterables)
    for pathset in sip(iterables, 1000):
        if not shortpath:
            pathset = [cloudpathjoin(cloudpath, pth) for pth in pathset]
        print("\n".join(pathset))
Example #8
def load_images(p: str, extension: str = "tif") -> dict:
    """Assume directory contains only the images to be stored"""
    files = CloudFiles(p)
    names = []
    for f in sorted(files.list()):
        if extension in f:
            names.append(f)

    files.get(names, raw=True)
    files_bytes = [files[k] for k in names]

    imgs = []
    for f in files_bytes:
        imgs.append(_load_image(f))
    return {"seg": np.asarray(imgs).transpose(2, 1, 0)}
Example #9
def du(paths, grand_total, summarize, human_readable):
    """Display disk usage statistics."""
    results = []
    for path in paths:
        npath = normalize_path(path)
        if ispathdir(path):
            cf = CloudFiles(npath, green=True)
            results.append(cf.size(cf.list()))
        else:
            cf = CloudFiles(os.path.dirname(npath), green=True)
            sz = cf.size(os.path.basename(npath))
            if sz is None:
                print(f"cloudfiles: du: {path} does not exist")
                return
            results.append({path: sz})

    def SI(val):
        if not human_readable:
            return val

        if val < 1024:
            return f"{val} Bytes"
        elif val < 2**20:
            return f"{(val / 2**10):.2f} KiB"
        elif val < 2**30:
            return f"{(val / 2**20):.2f} MiB"
        elif val < 2**40:
            return f"{(val / 2**30):.2f} GiB"
        elif val < 2**50:
            return f"{(val / 2**40):.2f} TiB"
        elif val < 2**60:
            return f"{(val / 2**50):.2f} PiB"
        else:
            return f"{(val / 2**60):.2f} EiB"

    summary = {}
    for path, res in zip(paths, results):
        summary[path] = sum(res.values())
        if summarize:
            print(f"{SI(summary[path])}\t{path}")

    if not summarize:
        for res in results:
            for pth, size in res.items():
                print(f"{SI(size)}\t{pth}")

    if grand_total:
        print(f"{SI(sum(summary.values()))}\ttotal")
Example #10
    def has_data(self, mip=None):
        """
    Returns whether the specified mip appears to have data 
    by testing whether the "folder" exists.

    Returns: bool
  
    The mip is the index into the returned list. If
    the entry is True, then the data appears to be there.
    If the entry is False, then the data is not there.
    """
        mip = mip if mip is not None else self.config.mip
        mip = self.meta.to_mip(mip)

        cf = CloudFiles(self.meta.cloudpath, secrets=self.config.secrets)
        key = self.meta.key(mip)
        return first(cf.list(prefix=key)) is not None
Example #11
def MeshManifestPrefixTask(layer_path: str,
                           prefix: str,
                           lod: int = 0,
                           mesh_dir: Optional[str] = None):
    """
  Finalize mesh generation by post-processing chunk fragment
  lists into mesh fragment manifests.
  These are necessary for neuroglancer to know which mesh
  fragments to download for a given segid.

  If we parallelize using prefixes single digit prefixes ['0','1',..'9'] all meshes will
  be correctly processed. But if we do ['10','11',..'99'] meshes from [0,9] won't get
  processed and need to be handle specifically by creating tasks that will process
  a single mesh ['0:','1:',..'9:']
  """
    cf = CloudFiles(layer_path)
    info = cf.get_json('info')

    if mesh_dir is None and 'mesh' in info:
        mesh_dir = info['mesh']

    prefix = cf.join(mesh_dir, prefix)
    segids = defaultdict(list)

    regexp = re.compile(r'(\d+):(\d+):')
    for filename in cf.list(prefix=prefix):
        filename = os.path.basename(filename)
        # `match` anchors at the beginning (^); `search` scans the whole string for a match anywhere
        matches = re.search(regexp, filename)

        if not matches:
            continue

        segid, mlod = matches.groups()
        segid, mlod = int(segid), int(mlod)

        if mlod != lod:
            continue

        segids[segid].append(filename)

    items = ((f"{mesh_dir}/{segid}:{lod}", {
        "fragments": frags
    }) for segid, frags in segids.items())

    cf.put_jsons(items)
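
To illustrate the docstring's point about prefixes, here is a sketch of how a complete set of two-digit prefix tasks might be generated, including the ':'-terminated single-digit prefixes that cover segids 0 through 9 (the layer path is a placeholder):

from functools import partial

# Hypothetical layer path. Two-digit prefixes miss segids 0 through 9,
# so add the ':'-terminated single-digit prefixes as well.
prefixes = [str(i) for i in range(10, 100)] + [f'{i}:' for i in range(10)]
tasks = [
    partial(MeshManifestPrefixTask, 'gs://my-bucket/my-seg', prefix=p)
    for p in prefixes
]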
Example #12
def dsbounds(path, mip):
    """
  Detects the volume bounds and chunk size for
  an unsharded image volume. Useful when there
  is a corrupted info file.
  """
    path = cloudfiles.paths.normalize(path)
    cv = CloudVolume(path, mip=mip)
    cf = CloudFiles(path)

    bboxes = []
    for filename in tqdm(cf.list(prefix=cv.key), desc="Computing Bounds"):
        bboxes.append(Bbox.from_filename(filename))

    bounds = Bbox.expand(*bboxes)
    chunk_size = list(reduce(max2, map(lambda bbox: bbox.size3(), bboxes)))

    print(f"Bounds: {bounds}")
    print(f"Volume: {list(bounds.size3())}")
    print(f"Chunks: {chunk_size}")
Example #13
class MergeSkeletonTask(scheduling.Task):
    def __init__(self,
                 dst_path,
                 mip,
                 dust_threshold,
                 tick_threshold,
                 prefix=""):
        super().__init__(self)
        self.dst_path = dst_path
        self.cf = CloudFiles(self.dst_path)
        self.mip = mip
        self.dust_threshold = dust_threshold
        self.tick_threshold = tick_threshold
        self.prefix = prefix

    def execute(self):
        corgie_logger.info(f"Merging skeletons at {self.dst_path}")
        fragment_filenames = self.cf.list(prefix=self.prefix, flat=True)
        skeleton_files = self.cf.get(fragment_filenames)
        skeletons = defaultdict(list)
        for skeleton_file in skeleton_files:
            try:
                colon_index = skeleton_file["path"].index(":")
            except ValueError:
                # File is full skeleton, not fragment
                continue
            seg_id = skeleton_file["path"][0:colon_index]
            skeleton_fragment = pickle.loads(skeleton_file["content"])
            if not skeleton_fragment.empty():
                skeletons[seg_id].append(skeleton_fragment)
        for seg_id, skeleton_fragments in skeletons.items():
            skeleton = PrecomputedSkeleton.simple_merge(
                skeleton_fragments).consolidate()
            skeleton = kimimaro.postprocess(skeleton, self.dust_threshold,
                                            self.tick_threshold)
            skeleton.id = int(seg_id)
            self.cf.put(path=seg_id,
                        content=skeleton.to_precomputed(),
                        compress="gzip")
            corgie_logger.info(f"Finished skeleton {seg_id}")
Example #14
def head(paths):
    results = {}
    for path in paths:
        npath = normalize_path(path)
        npath = re.sub(r'\*+$', '', npath)  # strip trailing wildcards from the normalized path
        many, flat, prefix = get_mfp(path, False)
        if many:
            cf = CloudFiles(npath, green=True)
            res = cf.head(cf.list(prefix=prefix, flat=flat))
            results.update(res)
        else:
            cf = CloudFiles(os.path.dirname(npath), green=True)
            results[path] = cf.head(os.path.basename(npath))

    pp = pprint.PrettyPrinter(indent=2)

    if len(paths) == 1 and len(results) == 1:
        val = first(results.values())
        if val is not None:
            print(val)
        else:
            print("cloudfiles: head: File not found: {}".format(paths[0]))
    elif len(paths) > 0:
        pp.pprint(results)
Example #15
def test_list(s3, protocol):  
  from cloudfiles import CloudFiles, exceptions
  url = compute_url(protocol, "list")

  cf = CloudFiles(url, num_threads=5)
  content = b'some_string'
  cf.put('info1', content, compress=None)
  cf.put('info2', content, compress=None)
  cf.put('build/info3', content, compress=None)
  cf.put('level1/level2/info4', content, compress=None)
  cf.put('info5', content, compress='gzip')
  cf.put('info.txt', content, compress=None)

  # time.sleep(1) # sometimes it takes a moment for google to update the list
  assert set(cf.list(prefix='')) == set(['build/info3','info1', 'info2', 'level1/level2/info4', 'info5', 'info.txt'])
  assert set(list(cf)) == set(cf.list(prefix=''))
  
  assert set(cf.list(prefix='inf')) == set(['info1','info2','info5','info.txt'])
  assert set(cf.list(prefix='info1')) == set(['info1'])
  assert set(cf.list(prefix='build')) == set(['build/info3'])
  assert set(cf.list(prefix='build/')) == set(['build/info3'])
  assert set(cf.list(prefix='level1/')) == set(['level1/level2/info4'])
  assert set(cf.list(prefix='nofolder/')) == set([])

  # Tests (1)
  assert set(cf.list(prefix='', flat=True)) == set(['info1','info2','info5','info.txt'])
  assert set(cf.list(prefix='inf', flat=True)) == set(['info1','info2','info5','info.txt'])
  # Tests (2)
  assert set(cf.list(prefix='build', flat=True)) == set([])
  # Tests (3)
  assert set(cf.list(prefix='level1/', flat=True)) == set([])
  assert set(cf.list(prefix='build/', flat=True)) == set(['build/info3'])
  # Tests (4)
  assert set(cf.list(prefix='build/inf', flat=True)) == set(['build/info3'])

  for file_path in ('info1', 'info2', 'build/info3', 'level1/level2/info4', 'info5', 'info.txt'):
    cf.delete(file_path)
  
  if protocol == 'file':
    rmtree("/tmp/cloudfiles/list")
Example #16
def create_sharded_multires_mesh_from_unsharded_tasks(
  src:str, 
  dest:str,
  shard_index_bytes=2**13, 
  minishard_index_bytes=2**15,
  min_shards:int = 1,
  num_lod:int = 1, 
  draco_compression_level:int = 1,
  vertex_quantization_bits:int = 16,
  minishard_index_encoding="gzip", 
  mesh_dir:Optional[str] = None, 
) -> Iterator[MultiResShardedMeshMergeTask]: 
  
  configure_multires_info(
    dest, 
    vertex_quantization_bits, 
    mesh_dir
  )

  cv_src = CloudVolume(src)
  cf = CloudFiles(cv_src.mesh.meta.layerpath)

  all_labels = []
  SEGID_RE = re.compile(r'(\d+):0(?:\.gz|\.br|\.zstd)?$')
  for path in cf.list():
    match = SEGID_RE.search(path)
    if match is None:
      continue
    (segid,) = match.groups()
    all_labels.append(int(segid))

  (shard_bits, minishard_bits, preshift_bits) = \
    compute_shard_params_for_hashed(
      num_labels=len(all_labels),
      shard_index_bytes=int(shard_index_bytes),
      minishard_index_bytes=int(minishard_index_bytes),
      min_shards=int(min_shards),
    )

  cv_dest = CloudVolume(dest, mesh_dir=mesh_dir)
  cv_dest.mesh.meta.info["mip"] = cv_src.mesh.meta.mip
  cv_dest.commit_info()

  spec = ShardingSpecification(
    type='neuroglancer_uint64_sharded_v1',
    preshift_bits=preshift_bits,
    hash='murmurhash3_x86_128',
    minishard_bits=minishard_bits,
    shard_bits=shard_bits,
    minishard_index_encoding=minishard_index_encoding,
    data_encoding="raw", # draco encoded meshes
  )

  cv_dest.mesh.meta.info['sharding'] = spec.to_dict()
  cv_dest.mesh.meta.commit_info()

  cv_dest = CloudVolume(dest, mesh_dir=mesh_dir)

  # perf: ~66.5k hashes/sec on M1 ARM64
  shardfn = lambda lbl: cv_dest.mesh.reader.spec.compute_shard_location(lbl).shard_number

  shard_labels = defaultdict(list)
  for label in tqdm(all_labels, desc="Hashes"):
    shard_labels[shardfn(label)].append(label)
  del all_labels

  cf = CloudFiles(cv_dest.mesh.meta.layerpath, progress=True)
  files = ( 
    (str(shardno) + '.labels', labels) 
    for shardno, labels in shard_labels.items() 
  )
  cf.put_jsons(
    files, compress="gzip", 
    cache_control="no-cache", total=len(shard_labels)
  )

  cv_dest.provenance.processing.append({
    'method': {
      'task': 'MultiResShardedFromUnshardedMeshMergeTask',
      'src': src,
      'dest': dest,
      'num_lod': num_lod,
      'vertex_quantization_bits': vertex_quantization_bits,
      'preshift_bits': preshift_bits, 
      'minishard_bits': minishard_bits, 
      'shard_bits': shard_bits,
      'mesh_dir': mesh_dir,
      'draco_compression_level': draco_compression_level,
    },
    'by': operator_contact(),
    'date': strftime('%Y-%m-%d %H:%M %Z'),
  }) 
  cv_dest.commit_provenance()

  return [
    partial(MultiResShardedFromUnshardedMeshMergeTask,
      src=src, 
      dest=dest, 
      shard_no=shard_no, 
      num_lod=num_lod,
      mesh_dir=mesh_dir, 
      draco_compression_level=draco_compression_level,
    )
    for shard_no in shard_labels.keys()
  ]
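
A usage sketch with placeholder paths; the function returns a list of functools.partial objects, one per shard, which the surrounding pipeline would hand to its task queue or executor:

tasks = create_sharded_multires_mesh_from_unsharded_tasks(
  src='gs://my-bucket/unsharded-seg',   # hypothetical source volume
  dest='gs://my-bucket/sharded-seg',    # hypothetical destination volume
)
# Each entry wraps MultiResShardedFromUnshardedMeshMergeTask for one shard.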
Example #17
def DeleteSkeletonFilesTask(cloudpath: str,
                            prefix: str,
                            skel_dir: Optional[str] = None):
    cv = CloudVolume(cloudpath, skel_dir=skel_dir)
    cf = CloudFiles(cv.skeleton.meta.layerpath)
    cf.delete(cf.list(prefix=prefix))
Example #18
def create_sharded_skeletons_from_unsharded_tasks(
    src: str,
    dest: str,
    shard_index_bytes=2**13,
    minishard_index_bytes=2**15,
    min_shards: int = 1,
    minishard_index_encoding='gzip',
    data_encoding='gzip',
    skel_dir: Optional[str] = None,
) -> Iterator[ShardedFromUnshardedSkeletonMergeTask]:
    cv_src = CloudVolume(src)
    cv_src.mip = cv_src.skeleton.meta.mip

    cf = CloudFiles(cv_src.skeleton.meta.layerpath)

    all_labels = []
    SEGID_RE = re.compile(r'(\d+)(?:\.gz|\.br|\.zstd)?$')
    for path in cf.list():
        match = SEGID_RE.search(path)
        if match is None:
            continue
        (segid, ) = match.groups()
        all_labels.append(int(segid))

    cv_dest = CloudVolume(dest, skel_dir=skel_dir)
    cv_dest.skeleton.meta.info = copy.deepcopy(cv_src.skeleton.meta.info)
    cv_dest.skeleton.meta.info["vertex_attributes"] = [
        attr for attr in cv_dest.skeleton.meta.info["vertex_attributes"]
        if attr["data_type"] in ("float32", "float64")
    ]

    (shard_bits, minishard_bits, preshift_bits) = \
      compute_shard_params_for_hashed(
        num_labels=len(all_labels),
        shard_index_bytes=int(shard_index_bytes),
        minishard_index_bytes=int(minishard_index_bytes),
        min_shards=int(min_shards),
      )

    spec = ShardingSpecification(
        type='neuroglancer_uint64_sharded_v1',
        preshift_bits=preshift_bits,
        hash='murmurhash3_x86_128',
        minishard_bits=minishard_bits,
        shard_bits=shard_bits,
        minishard_index_encoding=minishard_index_encoding,
        data_encoding=data_encoding,
    )

    cv_dest.skeleton.meta.info['sharding'] = spec.to_dict()
    cv_dest.skeleton.meta.commit_info()

    cv_dest = CloudVolume(dest, skel_dir=skel_dir)

    # perf: ~66.5k hashes/sec on M1 ARM64
    shardfn = lambda lbl: cv_dest.skeleton.reader.spec.compute_shard_location(
        lbl).shard_number

    shard_labels = defaultdict(list)
    for label in tqdm(all_labels, desc="Hashes"):
        shard_labels[shardfn(label)].append(label)
    del all_labels

    cf = CloudFiles(cv_dest.skeleton.meta.layerpath, progress=True)
    files = ((str(shardno) + '.labels', labels)
             for shardno, labels in shard_labels.items())
    cf.put_jsons(files,
                 compress="gzip",
                 cache_control="no-cache",
                 total=len(shard_labels))

    cv_dest.provenance.processing.append({
        'method': {
            'task': 'ShardedFromUnshardedSkeletonMergeTask',
            'src': src,
            'dest': dest,
            'preshift_bits': preshift_bits,
            'minishard_bits': minishard_bits,
            'shard_bits': shard_bits,
            'skel_dir': skel_dir,
        },
        'by': operator_contact(),
        'date': strftime('%Y-%m-%d %H:%M %Z'),
    })
    cv_dest.commit_provenance()

    return [
        partial(
            ShardedFromUnshardedSkeletonMergeTask,
            src=src,
            dest=dest,
            shard_no=shard_no,
            skel_dir=skel_dir,
        ) for shard_no in shard_labels.keys()
    ]
Example #19
def DeleteMeshFilesTask(cloudpath: str,
                        prefix: str,
                        mesh_dir: Optional[str] = None):
    cv = CloudVolume(cloudpath, mesh_dir=mesh_dir)
    cf = CloudFiles(cv.mesh.meta.layerpath)
    cf.delete(cf.list(prefix=prefix))
Example #20
    def get_filenames(self):
        prefix = '{}/{}'.format(self.vol.skeleton.path, self.prefix)

        cf = CloudFiles(self.cloudpath, progress=True)
        return list(cf.list(prefix=prefix))