def validated_gcs_bucket_name(self) -> str:
        if self._validated_gcs_bucket_name is None:
            if self.gcs_bucket_name is not None:
                bucket = self.gcs_bucket_name

            else:
                # Open the key to get the project id
                with open(self.google_credentials_file, "r") as open_resource:
                    creds = json.load(open_resource)
                    project_id = creds["project_id"]

                # Infer the default bucket name from the project id
                bucket = f"{project_id}.appspot.com"

            # Validate
            fs = GCSFileSystem(token=self.google_credentials_file)
            try:
                fs.ls(bucket)
                self._validated_gcs_bucket_name = bucket

            except FileNotFoundError:
                raise ValueError(
                    f"Provided or infered GCS bucket name does not exist. ('{bucket}')"
                )

        return self._validated_gcs_bucket_name
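
The validation pattern above reduces to "list the bucket and treat FileNotFoundError as missing". A minimal standalone sketch, assuming a service-account key file; the names below are placeholders, not from the original class:

from gcsfs import GCSFileSystem

def bucket_exists(bucket: str, credentials_file: str) -> bool:
    # ls() raises FileNotFoundError when the bucket does not exist
    fs = GCSFileSystem(token=credentials_file)
    try:
        fs.ls(bucket)
        return True
    except FileNotFoundError:
        return False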
Example #2
def open_and_combine_lat_lon_data(folder, tiles=None):
    """
    Load lat lon data stored as 10x10 degree tiles in folder.
    If tiles is None, load all available tiles.
    If no file is available, return None.
    """
    fs = GCSFileSystem(cache_timeout=0)
    if not tiles:
        tiles = [
            os.path.splitext(os.path.split(path)[-1])[0]
            for path in fs.ls(folder) if not path.endswith('/')
        ]

    uris = [f'{folder}{tile}.zarr' for tile in tiles]
    ds_list = []
    for uri in uris:
        if fs.exists(uri):
            da = open_zarr_file(uri)
            # make both coordinates ascending so combine_by_coords can align tiles
            if da.lat[0] > da.lat[-1]:
                da = da.reindex(lat=da.lat[::-1])
            if da.lon[0] > da.lon[-1]:
                da = da.reindex(lon=da.lon[::-1])
            ds_list.append(da)

    if len(ds_list) > 0:
        ds = xr.combine_by_coords(
            ds_list, combine_attrs="drop_conflicts"
        ).chunk({'lat': 2000, 'lon': 2000})
        return ds
    # print(f'No data available at {folder} for tiles {tiles}')
    return None
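
A hypothetical call, assuming the folder contains stores named {tile}.zarr (the bucket path and tile names below are illustrative):

# load two named tiles; pass tiles=None to load every tile in the folder
ds = open_and_combine_lat_lon_data(
    'gs://my-bucket/lat-lon-tiles/',  # hypothetical folder; must end with '/'
    tiles=['50N_010E', '50N_020E'],
)
if ds is not None:
    print(ds.sizes)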
Example #3
def find_tiles_for_bounding_box(min_lat, max_lat, min_lon, max_lon):
    """
    return a list of 10x10 degree tile names covering the bounding box
    the tile names are in the format of {lat}_{lon} where lat, lon represent the upper left corner
    ocean tiles are removed
    """
    fs = GCSFileSystem(cache_timeout=0)
    folder = 'gs://carbonplan-climatetrace/intermediates/ecoregions_mask/'
    available_tiles = [
        os.path.splitext(os.path.split(path)[-1])[0] for path in fs.ls(folder)
        if not path.endswith('/')
    ]

    step = 10
    lat_start = math.ceil(min_lat / step) * step
    lat_stop = math.ceil(max_lat / step) * step
    all_lat_tiles = np.arange(start=lat_start, stop=lat_stop + 1, step=step)
    # drop the first tile when the box only touches it at the boundary
    if min_lat == lat_start:
        all_lat_tiles = all_lat_tiles[1:]

    lon_start = math.floor(min_lon / step) * step
    lon_stop = math.floor(max_lon / step) * step
    all_lon_tiles = np.arange(start=lon_start, stop=lon_stop + 1, step=step)
    # drop the last tile when the box only touches it at the boundary
    if max_lon == lon_stop:
        all_lon_tiles = all_lon_tiles[:-1]

    out = []
    for lat in all_lat_tiles:
        for lon in all_lon_tiles:
            lat_tag, lon_tag = get_lat_lon_tags_from_bounding_box(lat, lon)
            fn = f'{lat_tag}_{lon_tag}'
            if fn in available_tiles:
                out.append(fn)

    return out
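
A hedged usage sketch; the output depends on the get_lat_lon_tags_from_bounding_box helper and on which land tiles actually exist in the bucket, so the values below are illustrative:

# tiles covering a hypothetical bounding box over Western Europe
tiles = find_tiles_for_bounding_box(min_lat=42.0, max_lat=51.5,
                                    min_lon=-5.0, max_lon=8.0)
print(tiles)  # e.g. ['50N_000E', ...], depending on available tiles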
Example #4
def open_glah01_data():
    fs = GCSFileSystem(cache_timeout=0)
    uris = [
        f'gs://{f}'
        for f in fs.ls('gs://carbonplan-climatetrace/intermediates/glah01/')
        if not f.endswith('/')
    ]
    ds_list = [open_zarr_file(uri) for uri in uris]
    ds = xr.concat(ds_list, dim='record_index').chunk({'record_index': 2000})
    # drop stored chunk encoding so the dask chunking above is used on write
    for k in ds:
        _ = ds[k].encoding.pop('chunks', None)
    return ds
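
Dropping the stored 'chunks' encoding matters if the concatenated dataset is written back out: xarray can otherwise refuse to write because the Dask chunking chosen above conflicts with the chunk sizes recorded in the source stores. A sketch of the round trip; the output path is a placeholder, and this assumes an fsspec-aware zarr installation:

ds = open_glah01_data()
# without the encoding.pop() above, to_zarr() may raise on mismatched chunks
ds.to_zarr('gs://my-bucket/glah01_combined.zarr', mode='w')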
Example #5
class GCSFS(Operations):
    def __init__(self, path='.', gcs=None, **fsargs):
        if gcs is None:
            self.gcs = GCSFileSystem(**fsargs)
        else:
            self.gcs = gcs
        self.cache = {}
        self.counter = 0
        self.root = path

    def getattr(self, path, fh=None):
        try:
            info = self.gcs.info(''.join([self.root, path]))
        except FileNotFoundError:
            raise FuseOSError(ENOENT)
        data = {'st_uid': 1000, 'st_gid': 1000}
        perm = 0o777

        if info['storageClass'] == 'DIRECTORY' or 'bucket' in info['kind']:
            data['st_atime'] = 0
            data['st_ctime'] = 0
            data['st_mtime'] = 0
            data['st_mode'] = (stat.S_IFDIR | perm)
            data['st_size'] = 0
            data['st_blksize'] = 0
        else:
            data['st_atime'] = str_to_time(info['timeStorageClassUpdated'])
            data['st_ctime'] = str_to_time(info['timeCreated'])
            data['st_mtime'] = str_to_time(info['updated'])
            data['st_mode'] = (stat.S_IFREG | perm)
            data['st_size'] = info['size']
            data['st_blksize'] = 5 * 2**20
            data['st_nlink'] = 1

        return data

    def readdir(self, path, fh):
        path = ''.join([self.root, path])
        files = self.gcs.ls(path)
        files = [f.rstrip('/').rsplit('/', 1)[1] for f in files]
        return ['.', '..'] + files

    def mkdir(self, path, mode):
        bucket, key = core.split_path(path)
        try:
            self.gcs.info(path)
        except FileNotFoundError:
            # fake a directory entry in the filesystem's listing cache
            self.gcs.dirs[bucket].append({
                'bucket': bucket,
                'kind': 'storage#object',
                'size': 0,
                'storageClass': 'DIRECTORY',
                'name': path.rstrip('/') + '/'
            })

    def rmdir(self, path):
        info = self.gcs.info(path)
        if info['storageClass'] == 'DIRECTORY':
            self.gcs.rm(path, False)

    def read(self, path, size, offset, fh):
        print('read', path, size, offset, fh)
        f = self.cache[fh]
        f.seek(offset)
        out = f.read(size)
        return out

    def write(self, path, data, offset, fh):
        print('write', path, offset, fh)
        f = self.cache[fh]
        f.write(data)
        return len(data)

    def create(self, path, flags):
        print('create', path, oct(flags))
        fn = ''.join([self.root, path])
        self.gcs.touch(fn)  # this makes sure directory entry exists - wasteful!
        # write (but ignore creation flags)
        f = self.gcs.open(fn, 'wb')
        self.cache[self.counter] = f
        self.counter += 1
        return self.counter - 1

    def open(self, path, flags):
        print('open', path, oct(flags))
        fn = ''.join([self.root, path])
        if flags % 2 == 0:
            # read
            f = self.gcs.open(fn, 'rb')
        else:
            # write (but ignore creation flags)
            f = self.gcs.open(fn, 'wb')
        self.cache[self.counter] = f
        self.counter += 1
        return self.counter - 1

    def truncate(self, path, length, fh=None):
        print('truncate', path, length, fh)
        fn = ''.join([self.root, path])
        if length != 0:
            raise NotImplementedError
        # maybe should be no-op since open with write sets size to zero anyway
        self.gcs.touch(fn)

    def unlink(self, path):
        print('delete', path)
        fn = ''.join([self.root, path])
        try:
            self.gcs.rm(fn, False)
        except (IOError, FileNotFoundError):
            raise FuseOSError(EIO)

    def release(self, path, fh):
        print('close', path, fh)
        try:
            f = self.cache[fh]
            f.close()
            self.cache.pop(fh, None)  # should release any cache memory
        except Exception as e:
            print(e)
        return 0

    def chmod(self, path, mode):
        raise NotImplementedError
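
Because GCSFS subclasses fusepy's Operations, an instance can be handed straight to the FUSE driver to mount a bucket as a local directory. A minimal sketch; the bucket name and mountpoint are placeholders:

from fuse import FUSE  # fusepy

# mount gs://my-bucket at /mnt/gcs, staying in the foreground
FUSE(GCSFS('my-bucket/'), '/mnt/gcs', foreground=True, nothreads=True)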
Example #6
class GCSFS(Operations):

    def __init__(self, path='.', gcs=None, nfiles=10, **fsargs):
        if gcs is None:
            # minimum block size: still read on 5MB boundaries.
            self.gcs = GCSFileSystem(block_size=30 * 2 ** 20,
                                     cache_timeout=6000, **fsargs)
        else:
            self.gcs = gcs
        self.cache = SmallChunkCacher(self.gcs, nfiles=nfiles)
        self.write_cache = {}
        self.counter = 0
        self.root = path

    @_tracemethod
    def getattr(self, path, fh=None):
        path = ''.join([self.root, path])
        try:
            info = self.gcs.info(path)
        except FileNotFoundError:
            parent = path.rsplit('/', 1)[0]
            if path in self.gcs.ls(parent):
                info = True
            else:
                raise FuseOSError(ENOENT)
        data = {'st_uid': 1000, 'st_gid': 1000}
        perm = 0o777

        if (info is True or info['storageClass'] == 'DIRECTORY'
                or 'bucket' in info['kind']):
            data['st_atime'] = 0
            data['st_ctime'] = 0
            data['st_mtime'] = 0
            data['st_mode'] = (stat.S_IFDIR | perm)
            data['st_size'] = 0
            data['st_blksize'] = 0
        else:
            data['st_atime'] = str_to_time(info['timeStorageClassUpdated'])
            data['st_ctime'] = str_to_time(info['timeCreated'])
            data['st_mtime'] = str_to_time(info['updated'])
            data['st_mode'] = (stat.S_IFREG | perm)
            data['st_size'] = info['size']
            data['st_blksize'] = 5 * 2**20
            data['st_nlink'] = 1
        return data

    @_tracemethod
    def readdir(self, path, fh):
        path = ''.join([self.root, path])
        logger.info("List {}, {}".format(path, fh))
        files = self.gcs.ls(path)
        files = [os.path.basename(f.rstrip('/')) for f in files]
        return ['.', '..'] + files

    @_tracemethod
    def mkdir(self, path, mode):
        path = ''.join([self.root, path])
        logger.info("Mkdir {}".format(path))
        parent, name = path.rsplit('/', 1)
        prefixes = self.gcs._listing_cache[parent + '/'][1]['prefixes']
        if name not in prefixes:
            prefixes.append(name)
        return 0

    @_tracemethod
    def rmdir(self, path):
        info = self.gcs.info(path)
        if info['storageClass'] == 'DIRECTORY':
            self.gcs.rm(path, False)

    @_tracemethod
    def read(self, path, size, offset, fh):
        fn = ''.join([self.root, path])
        logger.info('read #{} ({}) offset: {}, size: {}'.format(
            fh, fn, offset, size))
        out = self.cache.read(fn, offset, size)
        return out

    @_tracemethod
    def write(self, path, data, offset, fh):
        fn = ''.join([self.root, path])
        logger.info('write #{} ({}) offset: {}'.format(fh, fn, offset))
        f = self.write_cache[fh]
        f.write(data)
        return len(data)

    @_tracemethod
    def create(self, path, flags):
        fn = ''.join([self.root, path])
        logger.info('create {} {}'.format(fn, oct(flags)))
        self.gcs.touch(fn)  # this makes sure directory entry exists - wasteful!
        # write (but ignore creation flags)
        f = self.gcs.open(fn, 'wb')
        self.write_cache[self.counter] = f
        logger.info('-> fh #{}'.format(self.counter))
        self.counter += 1
        return self.counter - 1

    @_tracemethod
    def open(self, path, flags):
        fn = ''.join([self.root, path])
        logger.info('open {} {}'.format(fn, oct(flags)))
        if flags % 2 == 0:
            # read
            self.cache.open(fn)
        else:
            # write (but ignore creation flags)
            f = self.gcs.open(fn, 'wb')
            self.write_cache[self.counter] = f
        logger.info('-> fh #{}'.format(self.counter))
        self.counter += 1
        return self.counter - 1

    @_tracemethod
    def truncate(self, path, length, fh=None):
        fn = ''.join([self.root, path])
        logger.info('truncate #{} ({}) to {}'.format(fh, fn, length))
        if length != 0:
            raise NotImplementedError
        # maybe should be no-op since open with write sets size to zero anyway
        self.gcs.touch(fn)

    @_tracemethod
    def unlink(self, path):
        fn = ''.join([self.root, path])
        logger.info('delete {}'.format(fn))
        try:
            self.gcs.rm(fn, False)
        except (IOError, FileNotFoundError):
            raise FuseOSError(EIO)

    @_tracemethod
    def release(self, path, fh):
        fn = ''.join([self.root, path])
        logger.info('close #{} ({})'.format(fh, fn))
        try:
            if fh in self.write_cache:
                # write mode
                f = self.write_cache[fh]
                f.close()
                self.write_cache.pop(fh, None)
        except Exception as e:
            logger.exception("exception on release:" + str(e))
        return 0

    @_tracemethod
    def chmod(self, path, mode):
        raise NotImplementedError
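
Mounting works the same way as in the previous example; the knobs specific to this variant are nfiles (how many files the SmallChunkCacher keeps hot for reads) and the larger block size set in __init__. A hedged sketch with placeholder names:

from fuse import FUSE  # fusepy

# keep read chunks cached for up to 20 files at a time
FUSE(GCSFS('my-bucket/', nfiles=20), '/mnt/gcs', foreground=True, nothreads=True)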
Example #7
# auto-generate some GCS metrics

from gcsfs import GCSFileSystem

fs = GCSFileSystem('pangeo-181919')


# https://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size
def sizeof_fmt(num, suffix='B'):
    for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)


# get disk usage of each folder in gs://pangeo-data
with open('du-pangeo-data.csv', 'w') as f:
    f.write('directory, size, nbytes\n')
    print('directory, size, nbytes')
    for folder in fs.ls('pangeo-data'):
        nbytes = fs.du(folder)
        f.write(f'{folder}, {sizeof_fmt(nbytes)}, {nbytes}\n')
        print(f'{folder}, {sizeof_fmt(nbytes)}, {nbytes}')

# upload CSV to gs://pangeo-data
fs.put('du-pangeo-data.csv', 'pangeo-data/du-pangeo-data.csv')