Example #1
def download_chunk(meta, cache, cloudpath, mip, filename, fill_missing,
                   enable_cache, compress_cache, secrets, background_color):
    (file, ) = CloudFiles(cloudpath, secrets=secrets).get([filename], raw=True)
    content = file['content']

    if enable_cache:
        cache_content = next(compression.transcode(file,
                                                   compress_cache))['content']
        CloudFiles('file://' + cache.path).put(
            path=filename,
            content=(cache_content or b''),
            content_type=content_type(meta.encoding(mip)),
            compress=compress_cache,
            raw=bool(cache_content),
        )
        del cache_content

    if content is not None:
        content = compression.decompress(content, file['compress'])

    bbox = Bbox.from_filename(
        filename)  # possible off by one error w/ exclusive bounds
    img3d = decode(meta,
                   filename,
                   content,
                   fill_missing,
                   mip,
                   background_color=background_color)
    return img3d, bbox
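The example above relies on CloudFiles.get() with raw=True returning per-file dicts (with 'content', 'compress', and 'error' keys) whose bytes are left in their stored encoding. A minimal, self-contained sketch of that fetch-and-decompress pattern, assuming a local file:// bucket and an illustrative chunk name:

from cloudfiles import CloudFiles, compression

cf = CloudFiles('file:///tmp/example-bucket')  # hypothetical location
cf.put('0-64_0-64_0-64', b'chunk bytes', compress='gzip')

# raw=True keeps the stored (possibly compressed) bytes; 'compress' records the
# encoding so the caller can decompress explicitly, as download_chunk does.
(file,) = cf.get(['0-64_0-64_0-64'], raw=True)
if file['content'] is not None:
    content = compression.decompress(file['content'], file['compress'])
    assert content == b'chunk bytes'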
Example #2
    def upload(self,
               files,
               compress,
               cache_control,
               compress_level=None,
               content_type=None,
               progress=None):
        files = list(files)

        progress = progress if progress is not None else self.config.progress

        cf = CloudFiles(self.meta.cloudpath,
                        progress=progress,
                        secrets=self.config.secrets)
        files = list(
            compression.transcode(files,
                                  encoding=compress,
                                  level=compress_level))
        cf.puts(
            files,
            compress=compress,
            compression_level=compress_level,
            cache_control=cache_control,
            content_type=content_type,
            raw=True,
        )

        if self.enabled:
            self.put(files, compress=compress)
            cf_cache = CloudFiles('file://' + self.path,
                                  progress=('to Cache' if progress else None))
            cf_cache.puts(files, compress=compress, raw=True)
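The upload pattern above pre-compresses the file dicts with compression.transcode() and then calls puts() with raw=True so the already encoded bytes are written as-is. A small sketch under the same assumptions (the bucket path and file contents are illustrative only):

from cloudfiles import CloudFiles, compression

# File dicts in the shape transcode expects: plain content, no prior encoding.
files = [{"path": "layer/info", "content": b"{}", "compress": None, "raw": False}]
files = list(compression.transcode(files, encoding="gzip", level=5))

cf = CloudFiles('file:///tmp/example-bucket')  # hypothetical location
cf.puts(files, compress="gzip", content_type="application/json", raw=True)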
Example #3
def test_transcode(dest_encoding):
    from cloudfiles import CloudFiles, compression
    base_text = b'hello world'
    encodings = [None, "gzip", "br", "zstd"]

    varied_texts = []

    ans = compression.compress(base_text, dest_encoding)

    for i in range(200):
        src_encoding = encodings[i % len(encodings)]
        varied_texts.append({
            "path": str(i),
            "content": compression.compress(base_text, src_encoding),
            "raw": src_encoding is not None,
            "compress": src_encoding,
        })

    transcoded = (x['content']
                  for x in compression.transcode(varied_texts, dest_encoding))
    for content in transcoded:
        assert content == ans
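The test assumes dest_encoding is supplied externally (e.g. by pytest parametrization, which the snippet does not show). A standalone sketch of the same round-trip property, with illustrative values:

from cloudfiles import compression

# Transcode a gzip-encoded record to zstd, then verify the decoded bytes.
record = {
    "path": "0",
    "content": compression.compress(b"hello world", "gzip"),
    "raw": True,
    "compress": "gzip",
}
(out,) = compression.transcode([record], encoding="zstd")
assert compression.decompress(out["content"], "zstd") == b"hello world"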
Example #4
    def download(self, paths, compress=None, progress=None):
        """
    Download the provided paths, but grab them from cache first
    if they are present and the cache is enabled. 

    Returns: { filename: content, ... }
    """
        if len(paths) == 0:
            return {}

        progress = nvl(progress, self.config.progress)
        compress = nvl(compress, self.compress, self.config.compress)

        locs = self.compute_data_locations(paths)
        locs['remote'] = [str(x) for x in locs['remote']]

        fragments = {}
        if self.enabled:
            fragments = self.get(locs['local'], progress=progress)

        # fixes e.g. mesh\info -> mesh/info on Windows
        if self.meta.path.protocol != 'file' and os.path.sep == '\\':
            fragments = {
                "/".join(key.split('\\')): val
                for key, val in fragments.items()
            }

        cf = CloudFiles(
            self.meta.cloudpath,
            progress=progress,
            secrets=self.config.secrets,
            parallel=self.config.parallel,
        )
        remote_fragments = cf.get(locs['remote'], raw=True)

        for frag in remote_fragments:
            if frag['error'] is not None:
                raise frag['error']

        if self.enabled:
            cf_cache = CloudFiles('file://' + self.path,
                                  progress=('to Cache' if progress else None))
            cf_cache.puts(
                compression.transcode(
                    (frag for frag in remote_fragments
                     if frag['content'] is not None),
                    encoding=compress,
                    progress=progress,
                    in_place=False,
                ),
                compress=compress,
                raw=True,
            )

        remote_fragments_dict = {}
        while remote_fragments:
            res = remote_fragments.pop()
            remote_fragments_dict[res['path']] = compression.decompress(
                res['content'], res['compress'])

        fragments.update(remote_fragments_dict)
        return fragments
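The nvl() helper above is not shown in the snippet; it behaves like SQL's COALESCE, returning the first argument that is not None. A minimal stand-in with the assumed behavior, in case the helper is unavailable:

def nvl(*values):
    """Return the first value that is not None (or None if all are)."""
    for value in values:
        if value is not None:
            return value
    return None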
Example #5
    def download(self, paths, compress=None, progress=None):
        """
    Download the provided paths, but grab them from cache first
    if they are present and the cache is enabled. 

    Returns: { filename: content, ... }
    """
        if len(paths) == 0:
            return {}

        progress = nvl(progress, self.config.progress)
        compress = nvl(compress, self.compress, self.config.compress)

        locs = self.compute_data_locations(paths)
        locs['remote'] = [str(x) for x in locs['remote']]

        fragments = {}
        if self.enabled:
            fragments = self.get(locs['local'], progress=progress)

        cf = CloudFiles(self.meta.cloudpath,
                        progress=progress,
                        secrets=self.config.secrets)
        remote_fragments = cf.get(locs['remote'], raw=True)

        for frag in remote_fragments:
            if frag['error'] is not None:
                raise frag['error']

        if self.enabled:
            cf_cache = CloudFiles('file://' + self.path,
                                  progress=('to Cache' if progress else None))
            cf_cache.puts(
                compression.transcode(
                    (frag for frag in remote_fragments
                     if frag['content'] is not None),
                    encoding=compress,
                    progress=progress,
                    in_place=False,
                ),
                compress=compress,
                raw=True,
            )

        remote_fragments = {
            res['path']: compression.decompress(res['content'], res['compress'])
            for res in remote_fragments
        }

        fragments.update(remote_fragments)
        return fragments
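Both download() variants rely on the same per-file result convention: each dict returned by CloudFiles.get() carries an 'error' entry that the caller re-raises explicitly, while a missing file shows up with content of None (which is why the cache write above filters on frag['content'] is not None). A small sketch of that handling with an illustrative bucket:

from cloudfiles import CloudFiles

cf = CloudFiles('file:///tmp/example-bucket')  # hypothetical location
for frag in cf.get(['present', 'missing'], raw=True):
    if frag['error'] is not None:
        raise frag['error']
    if frag['content'] is None:
        print(f"{frag['path']} does not exist")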
Example #6
def _cp_single(ctx, source, destination, recursive, compression, progress,
               block_size):
    use_stdin = (source == '-')
    use_stdout = (destination == '-')

    if use_stdout:
        progress = False  # can't have the progress bar interfering

    nsrc = normalize_path(source)
    ndest = normalize_path(destination)

    # For more information see:
    # https://cloud.google.com/storage/docs/gsutil/commands/cp#how-names-are-constructed
    # Try to follow cp rules. If the directory exists,
    # copy the base source directory into the dest directory
    # If the directory does not exist, then we copy into
    # the dest directory.
    # Both x* and x** should not copy the base directory
    if recursive and nsrc[-1] != "*":
        if CloudFiles(ndest).isdir():
            if nsrc[-1] == '/':
                nsrc = nsrc[:-1]
            ndest = cloudpathjoin(ndest, os.path.basename(nsrc))

    ctx.ensure_object(dict)
    parallel = int(ctx.obj.get("parallel", 1))

    issrcdir = ispathdir(source) and use_stdin == False
    isdestdir = ispathdir(destination)

    srcpath = nsrc if issrcdir else os.path.dirname(nsrc)
    many, flat, prefix = get_mfp(nsrc, recursive)

    if issrcdir and not many:
        print(f"cloudfiles: {source} is a directory (not copied).")
        return

    xferpaths = os.path.basename(nsrc)
    if use_stdin:
        xferpaths = sys.stdin.readlines()
        xferpaths = [x.replace("\n", "") for x in xferpaths]
        prefix = os.path.commonprefix(xferpaths)
        xferpaths = [x.replace(prefix, "") for x in xferpaths]
        srcpath = cloudpathjoin(srcpath, prefix)
    elif many:
        xferpaths = CloudFiles(srcpath, green=True).list(prefix=prefix,
                                                         flat=flat)

    destpath = ndest
    if isinstance(xferpaths, str):
        destpath = ndest if isdestdir else os.path.dirname(ndest)
    elif not isdestdir:
        if os.path.exists(ndest.replace("file://", "")):
            print(f"cloudfiles: {ndest} is not a directory (not copied).")
            return

    if compression == "same":
        compression = None
    elif compression == "none":
        compression = False

    if not isinstance(xferpaths, str):
        if parallel == 1:
            _cp(srcpath, destpath, compression, progress, block_size,
                xferpaths)
            return

        total = None
        try:
            total = len(xferpaths)
        except TypeError:
            pass

        if use_stdout:
            fn = partial(_cp_stdout, srcpath)
        else:
            fn = partial(_cp, srcpath, destpath, compression, False,
                         block_size)

        with tqdm(desc="Transferring", total=total,
                  disable=(not progress)) as pbar:
            with pathos.pools.ProcessPool(parallel) as executor:
                for _ in executor.imap(fn, sip(xferpaths, block_size)):
                    pbar.update(block_size)
    else:
        cfsrc = CloudFiles(srcpath, green=True, progress=progress)
        if not cfsrc.exists(xferpaths):
            print(
                f"cloudfiles: source path not found: {cfsrc.abspath(xferpaths).replace('file://','')}"
            )
            return

        if use_stdout:
            _cp_stdout(srcpath, xferpaths)
            return

        downloaded = cfsrc.get(xferpaths, raw=True)
        if compression is not None:
            downloaded = transcode(downloaded, compression, in_place=True)

        cfdest = CloudFiles(destpath, green=True, progress=progress)
        if isdestdir:
            cfdest.put(os.path.basename(nsrc), downloaded, raw=True)
        else:
            cfdest.put(os.path.basename(ndest), downloaded, raw=True)
Example #7
def _cp_single(ctx, source, destination, recursive, compression, progress,
               block_size):
    use_stdin = (source == '-')

    nsrc = normalize_path(source)
    ndest = normalize_path(destination)

    ctx.ensure_object(dict)
    parallel = int(ctx.obj.get("parallel", 1))

    issrcdir = ispathdir(source) and use_stdin == False
    isdestdir = ispathdir(destination)

    srcpath = nsrc if issrcdir else os.path.dirname(nsrc)
    many, flat, prefix = get_mfp(nsrc, recursive)

    if issrcdir and not many:
        print(f"cloudfiles: {source} is a directory (not copied).")
        return

    xferpaths = os.path.basename(nsrc)
    if use_stdin:
        xferpaths = sys.stdin.readlines()
        xferpaths = [x.replace("\n", "") for x in xferpaths]
        prefix = os.path.commonprefix(xferpaths)
        xferpaths = [x.replace(prefix, "") for x in xferpaths]
        srcpath = cloudpathjoin(srcpath, prefix)
    elif many:
        xferpaths = CloudFiles(srcpath, green=True).list(prefix=prefix,
                                                         flat=flat)

    destpath = ndest
    if isinstance(xferpaths, str):
        destpath = ndest if isdestdir else os.path.dirname(ndest)
    elif not isdestdir:
        if os.path.exists(ndest.replace("file://", "")):
            print(f"cloudfiles: {ndest} is not a directory (not copied).")
            return

    if compression == "same":
        compression = None
    elif compression == "none":
        compression = False

    if not isinstance(xferpaths, str):
        if parallel == 1:
            _cp(srcpath, destpath, compression, progress, block_size,
                xferpaths)
            return

        total = None
        try:
            total = len(xferpaths)
        except TypeError:
            pass

        fn = partial(_cp, srcpath, destpath, compression, False, block_size)
        with tqdm(desc="Transferring", total=total,
                  disable=(not progress)) as pbar:
            with pathos.pools.ProcessPool(parallel) as executor:
                for _ in executor.imap(fn, sip(xferpaths, block_size)):
                    pbar.update(block_size)
    else:
        cfsrc = CloudFiles(srcpath, green=True, progress=progress)
        if not cfsrc.exists(xferpaths):
            print(
                f"cloudfiles: source path not found: {cfsrc.abspath(xferpaths).replace('file://','')}"
            )
            return

        downloaded = cfsrc.get(xferpaths, raw=True)
        if compression is not None:
            downloaded = transcode(downloaded, compression, in_place=True)

        cfdest = CloudFiles(destpath, green=True, progress=progress)
        if isdestdir:
            cfdest.put(os.path.basename(nsrc), downloaded, raw=True)
        else:
            cfdest.put(os.path.basename(ndest), downloaded, raw=True)
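The copy commands above (and the transfer loop in Example #8 below) batch their work with sip(iterable, block_size), which yields successive lists of at most block_size items. The helper is not shown in these snippets; a minimal stand-in with the assumed behavior:

import itertools

def sip(iterable, block_size):
    """Yield lists of up to block_size items until the iterable is exhausted."""
    it = iter(iterable)
    while True:
        block = list(itertools.islice(it, block_size))
        if not block:
            return
        yield block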
Example #8
    def transfer_to(self,
                    cloudpath,
                    bbox,
                    mip,
                    block_size=None,
                    compress=True,
                    compress_level=None):
        """
    Transfer files from one storage location to another, bypassing
    volume painting. This enables using a single CloudVolume instance
    to transfer big volumes. In some cases, gsutil or aws s3 cli tools
    may be more appropriate. This method is provided for convenience. It
    may be optimized for better performance over time as demand requires.

    cloudpath (str): path to storage layer
    bbox (Bbox object): ROI to transfer
    mip (int): resolution level
    block_size (int): number of file chunks to transfer per I/O batch.
    compress (bool): Set to False to upload as uncompressed
    """
        from cloudvolume import CloudVolume

        if mip is None:
            mip = self.config.mip

        if self.is_sharded(mip):
            raise exceptions.UnsupportedFormatError(
                f"Sharded sources are not supported. got: {self.meta.cloudpath}"
            )

        bbox = Bbox.create(bbox, self.meta.bounds(mip))
        realized_bbox = bbox.expand_to_chunk_size(
            self.meta.chunk_size(mip), offset=self.meta.voxel_offset(mip))
        realized_bbox = Bbox.clamp(realized_bbox, self.meta.bounds(mip))

        if bbox != realized_bbox:
            raise exceptions.AlignmentError(
                "Unable to transfer non-chunk aligned bounding boxes. Requested: {}, Realized: {}"
                .format(bbox, realized_bbox))

        default_block_size_MB = 50  # MB
        chunk_MB = self.meta.chunk_size(mip).rectVolume() * np.dtype(
            self.meta.dtype).itemsize * self.meta.num_channels
        if self.meta.layer_type == 'image':
            # kind of an average guess for some EM datasets, have seen up to 1.9x and as low as 1.1
            # affinites are also images, but have very different compression ratios. e.g. 3x for kempressed
            chunk_MB /= 1.3
        else:  # segmentation
            chunk_MB /= 100.0  # compression ratios between 80 and 800....
        chunk_MB /= 1024.0 * 1024.0

        if block_size:
            step = block_size
        else:
            step = int(default_block_size_MB // chunk_MB) + 1

        try:
            destvol = CloudVolume(cloudpath, mip=mip)
        except exceptions.InfoUnavailableError:
            destvol = CloudVolume(cloudpath,
                                  mip=mip,
                                  info=self.meta.info,
                                  provenance=self.meta.provenance.serialize())
            destvol.commit_info()
            destvol.commit_provenance()
        except exceptions.ScaleUnavailableError:
            destvol = CloudVolume(cloudpath)
            for i in range(len(destvol.scales) + 1, len(self.meta.scales)):
                destvol.scales.append(self.meta.scales[i])
            destvol.commit_info()
            destvol.commit_provenance()

        if destvol.image.is_sharded(mip):
            raise exceptions.UnsupportedFormatError(
                f"Sharded destinations are not supported. got: {destvol.cloudpath}"
            )

        num_blocks = np.ceil(
            self.meta.bounds(mip).volume() /
            self.meta.chunk_size(mip).rectVolume()) / step
        num_blocks = int(np.ceil(num_blocks))

        cloudpaths = chunknames(bbox,
                                self.meta.bounds(mip),
                                self.meta.key(mip),
                                self.meta.chunk_size(mip),
                                protocol=self.meta.path.protocol)

        pbar = tqdm(
            desc='Transferring Blocks of {} Chunks'.format(step),
            unit='blocks',
            disable=(not self.config.progress),
            total=num_blocks,
        )

        cfsrc = CloudFiles(self.meta.cloudpath, secrets=self.config.secrets)
        cfdest = CloudFiles(cloudpath)

        def check(files):
            errors = [
                file for file in files
                if file['content'] is None or file['error'] is not None
            ]
            if errors:
                error_paths = [f['path'] for f in errors]
                raise exceptions.EmptyFileException(
                    "{} were empty or had IO errors.".format(
                        ", ".join(error_paths)))
            return files

        with pbar:
            for srcpaths in sip(cloudpaths, step):
                files = check(cfsrc.get(srcpaths, raw=True))
                cfdest.puts(compression.transcode(files,
                                                  encoding=compress,
                                                  level=compress_level,
                                                  in_place=True),
                            compress=compress,
                            content_type=tx.content_type(destvol),
                            raw=True)
                pbar.update()
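A hedged usage sketch for transfer_to(), assuming the CloudVolume frontend exposes it directly and using illustrative cloudpaths; note the region must be chunk-aligned or the method raises an AlignmentError:

from cloudvolume import CloudVolume, Bbox

vol = CloudVolume('gs://example-bucket/segmentation', mip=0)  # hypothetical source
vol.transfer_to(
    'gs://example-destination/segmentation',  # hypothetical destination
    bbox=Bbox((0, 0, 0), (1024, 1024, 64)),   # must align to the chunk grid
    mip=0,
    compress=True,
)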