Example #1
def test_compress_level(compression_method):
    from cloudfiles import CloudFiles, exceptions
    filepath = "/tmp/cloudfiles/compress_level"
    url = "file://" + filepath

    content = b'some_string' * 1000

    compress_levels = range(1, 9, 2)
    for compress_level in compress_levels:
        cf = CloudFiles(url, num_threads=5)
        cf.put('info',
               content,
               compress=compression_method,
               compression_level=compress_level)

        retrieved = cf.get('info')
        assert content == retrieved

        conn = cf._get_connection()
        _, encoding, server_md5, hash_type = conn.get_file("info")
        assert encoding == compression_method
        assert hash_type in ('md5', None)

        assert cf.get('nonexistentfile') is None

        rmtree(filepath)
Example #2
def test_exceptions_raised(green):
  from cloudfiles import CloudFiles, exceptions
  from cloudfiles.lib import mkdir
  path = compute_url("file", "exceptions_raised")
  cf = CloudFiles(path, green=green)

  pth = mkdir(path.replace("file://", ""))
  with open(f"{pth}/wontdecompress.gz", "wb") as f:
    f.write(b"not a valid gzip stream")

  try:
    x = cf.get("wontdecompress")
    assert False
  except exceptions.DecompressionError:
    pass

  try:
    x = cf.get(["wontdecompress"], raise_errors=True)
    assert False
  except exceptions.DecompressionError:
    pass

  try:
    x = cf.get(["wontdecompress"], return_dict=True)
    assert False
  except exceptions.DecompressionError:
    pass

  cf.delete("wontdecompress")
Example #3
def test_delete(s3, green, protocol):
  from cloudfiles import CloudFiles, exceptions
  if protocol == 'file':
    url = "file:///tmp/cloudfiles/delete"
  else:
    url = "{}://cloudfiles/delete".format(protocol)

  cf = CloudFiles(url, green=green, num_threads=1)    
  content = b'some_string'
  cf.put('delete-test', content, compress=None, cache_control='no-cache')
  cf.put('delete-test-compressed', content, compress='gzip', cache_control='no-cache')
  assert cf.get('delete-test') == content
  cf.delete('delete-test')
  assert cf.get('delete-test') is None

  assert cf.get('delete-test-compressed') == content
  cf.delete('delete-test-compressed')
  assert cf.get('delete-test-compressed') is None

  # Reset for batch delete
  cf.put('delete-test', content, compress=None, cache_control='no-cache')
  cf.put('delete-test-compressed', content, compress='gzip', cache_control='no-cache')
  assert cf.get('delete-test') == content
  assert cf.get('delete-test-compressed') == content

  cf.delete(['delete-test', 'delete-nonexistent', 'delete-test-compressed'])
  assert cf.get('delete-test') is None
  assert cf.get('delete-test-compressed') is None
Example #4
def load_images(p: str, extension: str = "tif") -> dict:
    """Assume directory contains only the images to be stored"""
    files = CloudFiles(p)
    names = []
    for f in sorted(files.list()):
        if extension in f:
            names.append(f)

    # batch-fetch all names as a { path: content } dict, preserving the sorted order
    contents = files.get(names, return_dict=True)
    files_bytes = [contents[k] for k in names]

    imgs = []
    for f in files_bytes:
        imgs.append(_load_image(f))
    return {"seg": np.asarray(imgs).transpose(2, 1, 0)}
Example #5
def MultiResUnshardedMeshMergeTask(
  cloudpath:str, 
  prefix:str,
  cache_control:bool = False,
  draco_compression_level:int = 1,
  mesh_dir:Optional[str] = None,
  num_lod:int = 1,
  progress:bool = False,
):
  cv = CloudVolume(cloudpath)
  
  if mesh_dir is None and 'mesh' in cv.info:
    mesh_dir = cv.info['mesh']

  files_per_label = get_mesh_filenames_subset(
    cloudpath, mesh_dir, prefix
  )

  cf = CloudFiles(cv.meta.join(cloudpath, mesh_dir))
  for label, filenames in tqdm(files_per_label.items(), disable=(not progress)):
    files = cf.get(filenames)
    # we should handle draco as well
    files = [ Mesh.from_precomputed(f["content"]) for f in files ]

    (manifest, mesh) = process_mesh(
      cv, label, files, 
      num_lod, draco_compression_level
    )

    cf.put(f"{label}.index", manifest.to_binary(), cache_control="no-cache")
    cf.put(f"{label}", mesh, cache_control="no-cache")
Example #6
    def get_manifest(self, segid, progress=None):
        """Retrieve the manifest for one or more segments."""
        segid, multiple_return = toiter(segid, is_iter=True)
        progress = progress if progress is not None else self.config.progress

        cloudpath = self.meta.join(self.meta.cloudpath, self.path)
        cf = CloudFiles(cloudpath, progress=progress)
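        # the paths argument below is a generator, so pass total= to size the progress bar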
        results = cf.get((f"{sid}.index" for sid in segid), total=len(segid))

        if not multiple_return:
            if not results:
                return None
            binary = results[0]["content"]
            if binary is None:
                return None
            return MultiLevelPrecomputedMeshManifest.from_binary(
                binary, segment_id=first(segid), shard_offset=0)

        regexp = re.compile(r'(\d+)\.index$')
        manifests = []
        for res in results:
            key = res["path"]
            sid = int(re.match(regexp, key).groups()[0])
            binary = res["content"]
            if binary is None:
                manifests.append(None)
                continue
            manifest = MultiLevelPrecomputedMeshManifest.from_binary(
                binary, segment_id=sid, shard_offset=0)
            manifests.append(manifest)

        return manifests
Example #7
  def fetch_z_levels(self, bounds):
    cf = CloudFiles(self.levels_path)

    levelfilenames = [
      cf.join('levels', f"{self.mip}", f"{z}")
      for z in range(bounds.minpt.z, bounds.maxpt.z)
    ]

    levels = cf.get(levelfilenames)

    errors = [
      level['path']
      for level in levels if level['content'] is None
    ]

    if len(errors):
      raise Exception(", ".join(
          errors) + " were not defined. Did you run a LuminanceLevelsTask for these slices?")

    levels = [(
      int(os.path.basename(item['path'])),
      json.loads(item['content'].decode('utf-8'))
    ) for item in levels ]

    levels.sort(key=lambda x: x[0])
    levels = [x[1] for x in levels]
    return [ np.array(x['levels'], dtype=np.uint64) for x in levels ]
Example #8
def test_access_non_cannonical_minimal_path(s3, protocol):
  from cloudfiles import CloudFiles, exceptions
  if protocol == 'file':
    url = "file:///tmp/"
  else:
    url = "{}://cloudfiles/".format(protocol)
  
  cf = CloudFiles(url, num_threads=5)
  content = b'some_string'
  cf.put('info', content, compress=None)
  
  # time.sleep(0.5) # sometimes it takes a moment for google to update the list
  
  assert cf.get('info') == content
  assert cf.get('nonexistentfile') is None
  cf.delete('info')
Example #9
    def get_meshes_on_bypass(self, segids, allow_missing=False):
        """
    Attempt to fetch a mesh directly from storage without going through
    the chunk graph server. This capability should only be used in special
    circumstances.
    """
        segids = toiter(segids)

        dynamic_cloudpath = self.meta.join(self.meta.meta.cloudpath,
                                           self.dynamic_path())
        filenames = [self.compute_filename(segid) for segid in segids]

        cf = CloudFiles(dynamic_cloudpath,
                        progress=self.config.progress,
                        green=self.config.green,
                        secrets=self.config.secrets)
        raw_binaries = cf.get(filenames)

        # extract the label ID from the mesh manifest.
        # e.g. 387463568301300850:0:24576-25088_17920-18432_2048-3072
        label_regexp = re.compile(r'(\d+):\d:[\d_-]+$')

        output = {}
        remaining = []
        for res in raw_binaries:
            if res['error']:
                raise res['error']

            (label, ) = re.search(label_regexp, res['path']).groups()
            label = int(label)

            if res['content'] is None:
                remaining.append(label)
            else:
                output[label] = res['content']

        layers = defaultdict(list)
        for segid in remaining:
            layer_id = self.meta.meta.decode_layer_id(segid)
            layers[layer_id].append(segid)

        for layer_id, labels in layers.items():
            subdirectory = self.meta.join(self.meta.mesh_path, 'initial',
                                          str(layer_id))
            initial_output = self.readers[layer_id].get_data(
                labels, path=subdirectory, progress=self.config.progress)
            for label, raw_binary in initial_output.items():
                if raw_binary is None:
                    if allow_missing:
                        continue
                    else:
                        raise IndexError(
                            'No mesh found for segment {}'.format(label))
                else:
                    output[label] = raw_binary

        return {
            label: Mesh.from_draco(raw_binary, segid=label)
            for label, raw_binary in output.items()
        }
Example #10
    def get(self, cloudpaths, progress=None):
        progress = self.config.progress if progress is None else progress

        cf = CloudFiles('file://' + self.path, progress=progress)
        results = cf.get(list(cloudpaths))

        return {res['path']: res['content'] for res in results}
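Note: the { path: content } mapping built above can also be obtained directly from
CloudFiles by passing return_dict=True to get(). A minimal standalone sketch, using a
hypothetical file:// path and filenames:

from cloudfiles import CloudFiles

# hypothetical local path; any supported protocol behaves the same way
cf = CloudFiles("file:///tmp/cloudfiles/return_dict_example")
cf.puts([("a.txt", b"first"), ("b.txt", b"second")])

# return_dict=True yields { path: content } instead of a list of result dicts
contents = cf.get(["a.txt", "b.txt"], return_dict=True)
assert contents["a.txt"] == b"first"
assert contents["b.txt"] == b"second"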
Example #11
    def download(self, paths, compress=None, progress=None):
        """
    Download the provided paths, but grab them from cache first
    if they are present and the cache is enabled. 

    Returns: { filename: content, ... }
    """
        if len(paths) == 0:
            return {}

        progress = nvl(progress, self.config.progress)
        compress = nvl(compress, self.compress, self.config.compress)

        locs = self.compute_data_locations(paths)
        locs['remote'] = [str(x) for x in locs['remote']]

        fragments = {}
        if self.enabled:
            fragments = self.get(locs['local'], progress=progress)

        # fixes e.g. mesh\info -> mesh/info on Windows
        if self.meta.path.protocol != 'file' and os.path.sep == '\\':
            fragments = {
                "/".join(key.split('\\')): val
                for key, val in fragments.items()
            }

        cf = CloudFiles(
            self.meta.cloudpath,
            progress=progress,
            secrets=self.config.secrets,
            parallel=self.config.parallel,
        )
        remote_fragments = cf.get(locs['remote'], raw=True)

        for frag in remote_fragments:
            if frag['error'] is not None:
                raise frag['error']

        if self.enabled:
            cf_cache = CloudFiles('file://' + self.path,
                                  progress=('to Cache' if progress else None))
            cf_cache.puts(compression.transcode(
                (frag
                 for frag in remote_fragments if frag['content'] is not None),
                encoding=compress,
                progress=progress,
                in_place=False),
                          compress=compress,
                          raw=True)

        remote_fragments_dict = {}
        while remote_fragments:
            res = remote_fragments.pop()
            remote_fragments_dict[res['path']] = compression.decompress(
                res['content'], res['compress'])

        fragments.update(remote_fragments_dict)
        return fragments
Example #12
    def _fetch(self) -> None:
        from json import loads
        from cloudfiles import CloudFiles

        raw_meta = {}
        cf = CloudFiles(self.path)
        fnames = ["params.json", "metadata.json"]
        # batch download both candidate files, then parse the first one that decodes
        contents = cf.get(fnames, return_dict=True)
        for f in fnames:
            try:
                raw_meta = loads(contents[f])
                break
            except Exception:
                pass
        if not raw_meta:
            raise ValueError("Could not load meta, cannot proceed.")
        self._parse(raw_meta)
Example #13
def test_http_read_brotli_image():
  from cloudfiles import CloudFiles, exceptions
  cf = CloudFiles('https://open-neurodata.s3.amazonaws.com/kharris15/apical/em')
  
  imgbytes = cf.get("2_2_50/4096-4608_4096-4608_112-128")
  assert len(imgbytes) == 4194304
  
  expected = b'v\\BAT[]\\TVcsxshj{\x84vjo\x7f}oqyz\x89\x92\x91\x98\x81\x99\xb2\xb2\xb1\xa9\x9d\xa3\xb4\xb8'
  assert imgbytes[:len(expected)] == expected
Example #14
    def get_skeletons(self, folder):
        skeleton_filenames = [str(skeleton_id) for skeleton_id in self.skeleton_ids]
        cf = CloudFiles(folder)
        skeleton_files = cf.get(skeleton_filenames)
        skeletons = {}
        for skeleton_file in skeleton_files:
            skeleton_id_str = skeleton_file["path"]
            skeleton = Skeleton.from_precomputed(skeleton_file["content"])
            skeletons[skeleton_id_str] = skeleton
        return skeletons
Example #15
def test_compression(s3, protocol, method, green):
  from cloudfiles import CloudFiles, exceptions
  url = compute_url(protocol, "compress")

  cf = CloudFiles(url, num_threads=5, green=green)
  content = b'some_string'

  cf.put('info', content, compress=method)
  retrieved = cf.get('info')
  assert content == retrieved

  assert cf.get('nonexistentfile') is None

  try:
    cf.put('info', content, compress='nonexistent')
    assert False
  except ValueError:
    pass

  # iterating a CloudFiles instance yields every path under the url,
  # so this deletes all files written by this test
  cf.delete(iter(cf))
Example #16
    def download(self, bbox, mip, parallel=1, renumber=False):
        if parallel != 1:
            raise ValueError("Only parallel=1 is supported for n5.")
        elif renumber != False:
            raise ValueError("Only renumber=False is supported for n5.")

        bounds = Bbox.clamp(bbox, self.meta.bounds(mip))

        if self.autocrop:
            image, bounds = autocropfn(self.meta, image, bounds, mip)

        if bounds.subvoxel():
            raise exceptions.EmptyRequestException(
                f'Requested less than one pixel of volume. {bounds}')

        cf = CloudFiles(self.meta.cloudpath, progress=self.config.progress)
        realized_bbox = bbox.expand_to_chunk_size(self.meta.chunk_size(mip))
        grid_bbox = realized_bbox // self.meta.chunk_size(mip)

        urls = [
            cf.join(f"s{mip}", str(x), str(y), str(z))
            for x, y, z in xyzrange(grid_bbox.minpt, grid_bbox.maxpt)
        ]

        all_chunks = cf.get(urls, parallel=parallel, return_dict=True)
        shape = list(bbox.size3()) + [self.meta.num_channels]
        renderbuffer = np.zeros(shape=shape, dtype=self.meta.dtype, order='F')

        sep = '/'
        if cf._path.protocol == "file":
            sep = os.path.sep
        if sep == '\\':
            sep = '\\\\'  # compensate for regexp escaping

        regexp = re.compile(
            rf"s(?P<mip>\d+){sep}(?P<x>\d+){sep}(?P<y>\d+){sep}(?P<z>\d+)")
        for fname, binary in all_chunks.items():
            m = re.search(regexp, fname).groupdict()
            assert mip == int(m["mip"])
            gridpoint = Vec(*[int(i) for i in [m["x"], m["y"], m["z"]]])
            chunk_bbox = Bbox(gridpoint,
                              gridpoint + 1) * self.meta.chunk_size(mip)
            chunk_bbox = Bbox.clamp(chunk_bbox, self.meta.bounds(mip))
            default_shape = list(chunk_bbox.size3()) + [self.meta.num_channels]
            chunk, chunk_shape = self.parse_chunk(binary, mip, fname,
                                                  default_shape)
            chunk_bbox = Bbox(chunk_bbox.minpt,
                              chunk_bbox.minpt + Vec(*chunk_shape[:3]))
            chunk_bbox = Bbox.clamp(chunk_bbox, self.meta.bounds(mip))
            shade(renderbuffer, bbox, chunk, chunk_bbox)

        return VolumeCutout.from_volume(self.meta, mip, renderbuffer, bbox)
Example #17
class AggregateSkeletonFragmentsOperator(OperatorBase):
    """Merge skeleton fragments for Neuroglancer visualization."""
    def __init__(self,
                 fragments_path: str,
                 output_path: str,
                 name: str = 'aggregate-skeleton-fragments'):
        """
        Parameters
        ------------
        fragments_path: 
            path to store fragment files
        output_path:
            save the merged skeleton file here.
        """
        super().__init__(name=name)
        self.fragments_storage = CloudFiles(fragments_path)
        self.output_storage = CloudFiles(output_path)

    def __call__(self, prefix: str):
        logging.info(f'aggregate skeletons with prefix of {prefix}')

        id2filenames = defaultdict(list)
        for filename in self.fragments_storage.list_files(prefix=prefix):
            filename = os.path.basename(filename)
            # `re.match` anchors at the beginning of the string (like a leading ^),
            # while `re.search` scans the whole string, which is what we want here
            matches = re.search(r'(\d+):', filename)

            if not matches:
                continue

            # skeleton ID
            skl_id = int(matches.group(1))
            id2filenames[skl_id].append(filename)

        for skl_id, filenames in id2filenames.items():
            logging.info(f'skeleton id: {skl_id}')
            frags = self.fragments_storage.get(filenames)
            frags = [
                PrecomputedSkeleton.from_precomputed(x['content'])
                for x in frags
            ]
            skel = PrecomputedSkeleton.simple_merge(frags).consolidate()
            skel = kimimaro.postprocess(skel,
                                        dust_threshold=1000,
                                        tick_threshold=3500)
            self.output_storage.put(
                file_path=str(skl_id),
                content=skel.to_precomputed(),
            )
            # the last few hundred files will not be uploaded without sleeping!
            sleep(0.01)
Example #18
def test_read_write(s3, protocol, num_threads, green):
    from cloudfiles import CloudFiles, exceptions
    url = compute_url(protocol, "rw")

    cf = CloudFiles(url, num_threads=num_threads, green=green)

    content = b'some_string'
    cf.put('info', content, compress=None, cache_control='no-cache')
    cf['info2'] = content

    assert cf.get('info') == content
    assert cf['info2'] == content
    assert cf['info2', 0:3] == content[0:3]
    assert cf['info2', :] == content[:]
    assert cf.get('nonexistentfile') is None

    del cf['info2']
    assert cf.exists('info2') == False

    num_infos = max(num_threads, 1)
    results = cf.get(['info' for i in range(num_infos)])

    assert len(results) == num_infos
    assert results[0]['path'] == 'info'
    assert results[0]['content'] == content
    assert all(map(lambda x: x['error'] is None, results))
    assert cf.get(['nonexistentfile'])[0]['content'] is None

    cf.delete('info')

    cf.put_json('info', {'omg': 'wow'}, cache_control='no-cache')
    results = cf.get_json('info')
    assert results == {'omg': 'wow'}

    cf.delete('info')

    if protocol == 'file':
        rmtree(url)
Example #19
    def execute(self):

        corgie_logger.info(
            f"Generate new skeleton vertices task for id {self.skeleton_id_str}"
        )
        skeleton = get_skeleton(self.src_path, self.skeleton_id_str)
        if self.vertex_sort:
            vertex_sort = skeleton.vertices[:, 2].argsort()
        else:
            vertex_sort = np.arange(0, len(skeleton.vertices))
        number_vertices = len(skeleton.vertices)
        index_points = list(range(0, number_vertices, self.task_vertex_size))
        cf = CloudFiles(f"{self.dst_path}")
        array_filenames = []
        for i in range(len(index_points)):
            start_index = index_points[i]
            if i + 1 == len(index_points):
                end_index = number_vertices
            else:
                end_index = index_points[i + 1]
            array_filenames.append(
                f"intermediary_arrays/{self.skeleton_id_str}:{start_index}-{end_index}"
            )
        array_files = cf.get(array_filenames)
        # Dict to make sure arrays are concatenated in correct order
        array_dict = {}
        for array_file in array_files:
            array_dict[array_file["path"]] = pickle.loads(
                array_file["content"])
        array_arrays = []
        for array_filename in array_filenames:
            array_arrays.append(array_dict[array_filename])
        array_arrays = np.concatenate(array_arrays)
        # Restore the correct order of the vertices
        restore_sort = vertex_sort.argsort()
        new_vertices = array_arrays[restore_sort]
        new_skeleton = Skeleton(
            vertices=new_vertices,
            edges=skeleton.edges,
            radii=skeleton.radius,
            vertex_types=skeleton.vertex_types,
            space=skeleton.space,
            transform=skeleton.transform,
        )
        cf.put(
            path=self.skeleton_id_str,
            content=new_skeleton.to_precomputed(),
            compress="gzip",
        )
Example #20
    def download(self, paths, compress=None, progress=None):
        """
    Download the provided paths, but grab them from cache first
    if they are present and the cache is enabled. 

    Returns: { filename: content, ... }
    """
        if len(paths) == 0:
            return {}

        progress = nvl(progress, self.config.progress)
        compress = nvl(compress, self.compress, self.config.compress)

        locs = self.compute_data_locations(paths)
        locs['remote'] = [str(x) for x in locs['remote']]

        fragments = {}
        if self.enabled:
            fragments = self.get(locs['local'], progress=progress)

        cf = CloudFiles(self.meta.cloudpath,
                        progress=progress,
                        secrets=self.config.secrets)
        remote_fragments = cf.get(locs['remote'], raw=True)

        for frag in remote_fragments:
            if frag['error'] is not None:
                raise frag['error']

        if self.enabled:
            cf_cache = CloudFiles('file://' + self.path,
                                  progress=('to Cache' if progress else None))
            cf_cache.puts(compression.transcode(
                (frag
                 for frag in remote_fragments if frag['content'] is not None),
                encoding=compress,
                progress=progress,
                in_place=False),
                          compress=compress,
                          raw=True)

        remote_fragments = {
          res['path']: compression.decompress(res['content'], res['compress']) \
          for res in remote_fragments
        }

        fragments.update(remote_fragments)
        return fragments
Example #21
    def get_skeletons_by_segid(self, filenames):
        cf = CloudFiles(self.cloudpath, progress=True)
        skels = cf.get(filenames)

        skeletons = defaultdict(list)
        for skel in skels:
            try:
                segid = filename_to_segid(skel['path'])
            except ValueError:
                # Typically this is due to preexisting fully
                # formed skeletons e.g. skeletons_mip_3/1588494
                continue

            skeletons[segid].append((Bbox.from_filename(skel['path']),
                                     pickle.loads(skel['content'])))

        return skeletons
Example #22
def cache(task, cloudpath):
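    """Fetch `cloudpath` once and cache the bytes on local disk under /tmp/<task class name>/."""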
    layer_path, filename = os.path.split(cloudpath)

    classname = task.__class__.__name__
    lcldir = mkdir(os.path.join('/tmp/', classname))
    lclpath = os.path.join(lcldir, filename)

    if os.path.exists(lclpath):
        with open(lclpath, 'rb') as f:
            filestr = f.read()
    else:
        cf = CloudFiles(layer_path)
        filestr = cf.get(filename)

        with open(lclpath, 'wb') as f:
            f.write(filestr)

    return filestr
Example #23
    def download(self, paths, compress=None, progress=None):
        """
    Download the provided paths, but grab them from cache first
    if they are present and the cache is enabled. 

    Returns: { filename: content, ... }
    """
        if len(paths) == 0:
            return {}

        progress = progress if progress is not None else self.config.progress

        locs = self.compute_data_locations(paths)
        locs['remote'] = [str(x) for x in locs['remote']]

        fragments = {}
        if self.enabled:
            fragments = self.get(locs['local'], progress=progress)

        cf = CloudFiles(self.meta.cloudpath, progress=progress)
        remote_fragments = cf.get(locs['remote'])

        for frag in remote_fragments:
            if frag['error'] is not None:
                raise frag['error']

        remote_fragments = {
          res['path']: res['content'] \
          for res in remote_fragments
        }

        if self.enabled:
            self.put(
              [
                (filename, content) for filename, content in remote_fragments.items() \
                if content is not None
              ],
              compress=compress,
              progress=progress
            )

        fragments.update(remote_fragments)
        return fragments
Example #24
class MergeSkeletonTask(scheduling.Task):
    def __init__(self,
                 dst_path,
                 mip,
                 dust_threshold,
                 tick_threshold,
                 prefix=""):
        super().__init__(self)
        self.dst_path = dst_path
        self.cf = CloudFiles(self.dst_path)
        self.mip = mip
        self.dust_threshold = dust_threshold
        self.tick_threshold = tick_threshold
        self.prefix = prefix

    def execute(self):
        corgie_logger.info(f"Merging skeletons at {self.dst_path}")
        fragment_filenames = self.cf.list(prefix=self.prefix, flat=True)
        skeleton_files = self.cf.get(fragment_filenames)
        skeletons = defaultdict(list)
        for skeleton_file in skeleton_files:
            try:
                colon_index = skeleton_file["path"].index(":")
            except ValueError:
                # File is full skeleton, not fragment
                continue
            seg_id = skeleton_file["path"][0:colon_index]
            skeleton_fragment = pickle.loads(skeleton_file["content"])
            if not skeleton_fragment.empty():
                skeletons[seg_id].append(skeleton_fragment)
        for seg_id, skeleton_fragments in skeletons.items():
            skeleton = PrecomputedSkeleton.simple_merge(
                skeleton_fragments).consolidate()
            skeleton = kimimaro.postprocess(skeleton, self.dust_threshold,
                                            self.tick_threshold)
            skeleton.id = int(seg_id)
            self.cf.put(path=seg_id,
                        content=skeleton.to_precomputed(),
                        compress="gzip")
            corgie_logger.info(f"Finished skeleton {seg_id}")
Example #25
    def fetch_provenance(self):
        """
    Refresh the current provenance file from primary storage (e.g. the cloud)
    without reference to cache. The cache will not be updated.
  
    Raises cloudvolume.exceptions.provenanceUnavailableError when the provenance file
    is unable to be retrieved.

    See also: refresh_provenance

    Returns: dict
    """
        cf = CloudFiles(self.cloudpath)
        provfile = cf.get('provenance')
        if provfile:
            provfile = provfile.decode('utf-8')

            # The json5 decoder is *very* slow
            # so use the stricter but much faster json
            # decoder first, and try it only if it fails.
            try:
                provfile = json.loads(provfile)
            except json.decoder.JSONDecodeError:
                try:
                    provfile = json5.loads(provfile)
                except ValueError:
                    raise ValueError(
                        red("""The provenance file could not be JSON decoded. 
            Please reformat the provenance file before continuing. 
            Contents: {}""".format(provfile)))
        else:
            provfile = {
                "sources": [],
                "owners": [],
                "processing": [],
                "description": "",
            }

        return self._cast_provenance(provfile)
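Note: a minimal standalone sketch of the json-then-json5 fallback used above. The
sample text is hypothetical; json5 accepts comments and trailing commas that the
strict (but much faster) json decoder rejects:

import json
import json5

text = '{"sources": [], "owners": [], /* a json5-style comment */ }'

try:
    provenance = json.loads(text)   # strict, fast decoder first
except json.decoder.JSONDecodeError:
    provenance = json5.loads(text)  # lenient, slower decoder as the fallback

assert provenance["sources"] == []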
Example #26
def test_get_generator(num_threads, green):
  from cloudfiles import CloudFiles, exceptions
  path = '/tmp/cloudfiles/gen'
  rmtree(path)
  url = 'file://' + path

  cf = CloudFiles(url, num_threads=num_threads, green=green)

  gen = ( (str(i), b'hello world') for i in range(100) )
  cf.puts(gen)

  files = cf.get(( str(i) for i in range(100) ), total=100)

  assert all([ f['error'] is None for f in files ])
  assert len(files) == 100
  assert all([ f['content'] == b'hello world' for f in files ])

  fnames = [ str(i) for i in range(100) ]
  assert sorted(list(cf.list())) == sorted(fnames)

  cf.delete(( str(i) for i in range(100) ))
  assert list(cf.list()) == []
Example #27
def get_skeleton(src_path, skeleton_id_str):
    cf = CloudFiles(src_path)
    return Skeleton.from_precomputed(cf.get(skeleton_id_str))
Example #28
def _cp_single(ctx, source, destination, recursive, compression, progress,
               block_size):
    use_stdin = (source == '-')

    nsrc = normalize_path(source)
    ndest = normalize_path(destination)

    ctx.ensure_object(dict)
    parallel = int(ctx.obj.get("parallel", 1))

    issrcdir = ispathdir(source) and use_stdin == False
    isdestdir = ispathdir(destination)

    srcpath = nsrc if issrcdir else os.path.dirname(nsrc)
    many, flat, prefix = get_mfp(nsrc, recursive)

    if issrcdir and not many:
        print(f"cloudfiles: {source} is a directory (not copied).")
        return

    xferpaths = os.path.basename(nsrc)
    if use_stdin:
        xferpaths = sys.stdin.readlines()
        xferpaths = [x.replace("\n", "") for x in xferpaths]
        prefix = os.path.commonprefix(xferpaths)
        xferpaths = [x.replace(prefix, "") for x in xferpaths]
        srcpath = cloudpathjoin(srcpath, prefix)
    elif many:
        xferpaths = CloudFiles(srcpath, green=True).list(prefix=prefix,
                                                         flat=flat)

    destpath = ndest
    if isinstance(xferpaths, str):
        destpath = ndest if isdestdir else os.path.dirname(ndest)
    elif not isdestdir:
        if os.path.exists(ndest.replace("file://", "")):
            print(f"cloudfiles: {ndest} is not a directory (not copied).")
            return

    if compression == "same":
        compression = None
    elif compression == "none":
        compression = False

    if not isinstance(xferpaths, str):
        if parallel == 1:
            _cp(srcpath, destpath, compression, progress, block_size,
                xferpaths)
            return

        total = None
        try:
            total = len(xferpaths)
        except TypeError:
            pass

        fn = partial(_cp, srcpath, destpath, compression, False, block_size)
        with tqdm(desc="Transferring", total=total,
                  disable=(not progress)) as pbar:
            with pathos.pools.ProcessPool(parallel) as executor:
                for _ in executor.imap(fn, sip(xferpaths, block_size)):
                    pbar.update(block_size)
    else:
        cfsrc = CloudFiles(srcpath, green=True, progress=progress)
        if not cfsrc.exists(xferpaths):
            print(
                f"cloudfiles: source path not found: {cfsrc.abspath(xferpaths).replace('file://','')}"
            )
            return

        downloaded = cfsrc.get(xferpaths, raw=True)
        if compression is not None:
            downloaded = transcode(downloaded, compression, in_place=True)

        cfdest = CloudFiles(destpath, green=True, progress=progress)
        if isdestdir:
            cfdest.put(os.path.basename(nsrc), downloaded, raw=True)
        else:
            cfdest.put(os.path.basename(ndest), downloaded, raw=True)
Example #29
def _cp_single(ctx, source, destination, recursive, compression, progress,
               block_size):
    use_stdin = (source == '-')
    use_stdout = (destination == '-')

    if use_stdout:
        progress = False  # can't have the progress bar interfering

    nsrc = normalize_path(source)
    ndest = normalize_path(destination)

    # For more information see:
    # https://cloud.google.com/storage/docs/gsutil/commands/cp#how-names-are-constructed
    # Try to follow cp rules. If the directory exists,
    # copy the base source directory into the dest directory
    # If the directory does not exist, then we copy into
    # the dest directory.
    # Both x* and x** should not copy the base directory
    if recursive and nsrc[-1] != "*":
        if CloudFiles(ndest).isdir():
            if nsrc[-1] == '/':
                nsrc = nsrc[:-1]
            ndest = cloudpathjoin(ndest, os.path.basename(nsrc))

    ctx.ensure_object(dict)
    parallel = int(ctx.obj.get("parallel", 1))

    issrcdir = ispathdir(source) and use_stdin == False
    isdestdir = ispathdir(destination)

    srcpath = nsrc if issrcdir else os.path.dirname(nsrc)
    many, flat, prefix = get_mfp(nsrc, recursive)

    if issrcdir and not many:
        print(f"cloudfiles: {source} is a directory (not copied).")
        return

    xferpaths = os.path.basename(nsrc)
    if use_stdin:
        xferpaths = sys.stdin.readlines()
        xferpaths = [x.replace("\n", "") for x in xferpaths]
        prefix = os.path.commonprefix(xferpaths)
        xferpaths = [x.replace(prefix, "") for x in xferpaths]
        srcpath = cloudpathjoin(srcpath, prefix)
    elif many:
        xferpaths = CloudFiles(srcpath, green=True).list(prefix=prefix,
                                                         flat=flat)

    destpath = ndest
    if isinstance(xferpaths, str):
        destpath = ndest if isdestdir else os.path.dirname(ndest)
    elif not isdestdir:
        if os.path.exists(ndest.replace("file://", "")):
            print(f"cloudfiles: {ndest} is not a directory (not copied).")
            return

    if compression == "same":
        compression = None
    elif compression == "none":
        compression = False

    if not isinstance(xferpaths, str):
        if parallel == 1:
            _cp(srcpath, destpath, compression, progress, block_size,
                xferpaths)
            return

        total = None
        try:
            total = len(xferpaths)
        except TypeError:
            pass

        if use_stdout:
            fn = partial(_cp_stdout, srcpath)
        else:
            fn = partial(_cp, srcpath, destpath, compression, False,
                         block_size)

        with tqdm(desc="Transferring", total=total,
                  disable=(not progress)) as pbar:
            with pathos.pools.ProcessPool(parallel) as executor:
                for _ in executor.imap(fn, sip(xferpaths, block_size)):
                    pbar.update(block_size)
    else:
        cfsrc = CloudFiles(srcpath, green=True, progress=progress)
        if not cfsrc.exists(xferpaths):
            print(
                f"cloudfiles: source path not found: {cfsrc.abspath(xferpaths).replace('file://','')}"
            )
            return

        if use_stdout:
            _cp_stdout(srcpath, xferpaths)
            return

        downloaded = cfsrc.get(xferpaths, raw=True)
        if compression is not None:
            downloaded = transcode(downloaded, compression, in_place=True)

        cfdest = CloudFiles(destpath, green=True, progress=progress)
        if isdestdir:
            cfdest.put(os.path.basename(nsrc), downloaded, raw=True)
        else:
            cfdest.put(os.path.basename(ndest), downloaded, raw=True)
Example #30
def _cp_stdout(src, paths):
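    """Stream the contents of the given paths from `src` to stdout as UTF-8 text."""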
    paths = toiter(paths)
    cf = CloudFiles(src, green=True, progress=False)
    for res in cf.get(paths):
        content = res["content"].decode("utf8")
        sys.stdout.write(content)