def validated_gcs_bucket_name(self) -> str:
    if self._validated_gcs_bucket_name is None:
        if self.gcs_bucket_name is not None:
            bucket = self.gcs_bucket_name
        else:
            # Open the credentials file to get the project id
            with open(self.google_credentials_file, "r") as open_resource:
                creds = json.load(open_resource)
                project_id = creds["project_id"]

            # Fall back to the project's default App Engine bucket
            bucket = f"{project_id}.appspot.com"

        # Validate that the bucket exists
        fs = GCSFileSystem(token=self.google_credentials_file)
        try:
            fs.ls(bucket)
            self._validated_gcs_bucket_name = bucket
        except FileNotFoundError:
            raise ValueError(
                f"Provided or inferred GCS bucket name does not exist. ('{bucket}')"
            )

    return self._validated_gcs_bucket_name
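The method above depends on `json` and `GCSFileSystem` imports and on being defined inside a class that caches the result in `_validated_gcs_bucket_name`. A minimal sketch of such a host class (the class name and constructor are assumptions, not shown in the original):

import json

from gcsfs import GCSFileSystem


class UploadConfig:
    # Hypothetical host class holding the attributes the method reads.
    def __init__(self, google_credentials_file, gcs_bucket_name=None):
        self.google_credentials_file = google_credentials_file
        self.gcs_bucket_name = gcs_bucket_name
        self._validated_gcs_bucket_name = None

    # validated_gcs_bucket_name would be attached here, typically as a @property.


config = UploadConfig('service-account.json')
# config.validated_gcs_bucket_name -> e.g. '<project_id>.appspot.com'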
def open_and_combine_lat_lon_data(folder, tiles=None):
    """
    Load lat/lon data stored as 10x10 degree tiles in folder.
    If tiles is None, load all available data.
    If no file is available, return None.
    """
    fs = GCSFileSystem(cache_timeout=0)
    if not tiles:
        tiles = [
            os.path.splitext(os.path.split(path)[-1])[0]
            for path in fs.ls(folder)
            if not path.endswith('/')
        ]

    uris = [f'{folder}{tile}.zarr' for tile in tiles]
    ds_list = []
    for uri in uris:
        if fs.exists(uri):
            da = open_zarr_file(uri)
            # Ensure lat and lon are both monotonically increasing
            if da.lat[0] > da.lat[-1]:
                da = da.reindex(lat=da.lat[::-1])
            if da.lon[0] > da.lon[-1]:
                da = da.reindex(lon=da.lon[::-1])
            ds_list.append(da)

    if len(ds_list) > 0:
        ds = xr.combine_by_coords(ds_list, combine_attrs="drop_conflicts").chunk(
            {'lat': 2000, 'lon': 2000}
        )
        return ds

    # print(f'No data available at {folder} for tiles {tiles}')
    return None
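A usage sketch for the loader above; the folder path and tile names are illustrative, and `open_zarr_file` is assumed to be defined elsewhere in the module:

# Load two specific 10x10 degree tiles and combine them
folder = 'gs://carbonplan-climatetrace/intermediates/example/'  # illustrative path
ds = open_and_combine_lat_lon_data(folder, tiles=['50N_120W', '50N_130W'])
if ds is not None:
    print(ds.dims)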
def find_tiles_for_bounding_box(min_lat, max_lat, min_lon, max_lon):
    """
    Return a list of 10x10 degree tile names covering the bounding box.
    The tile names are in the format of {lat}_{lon}, where lat, lon
    represent the upper left corner. Ocean tiles are removed.
    """
    fs = GCSFileSystem(cache_timeout=0)
    folder = 'gs://carbonplan-climatetrace/intermediates/ecoregions_mask/'
    available_tiles = [
        os.path.splitext(os.path.split(path)[-1])[0]
        for path in fs.ls(folder)
        if not path.endswith('/')
    ]

    step = 10
    lat_start = math.ceil(min_lat / step) * step
    lat_stop = math.ceil(max_lat / step) * step
    all_lat_tiles = np.arange(start=lat_start, stop=lat_stop + 1, step=step)
    if min_lat == lat_start:
        all_lat_tiles = all_lat_tiles[1:]

    lon_start = math.floor(min_lon / step) * step
    lon_stop = math.floor(max_lon / step) * step
    all_lon_tiles = np.arange(start=lon_start, stop=lon_stop + 1, step=step)
    if max_lon == lon_stop:
        all_lon_tiles = all_lon_tiles[:-1]

    out = []
    for lat in all_lat_tiles:
        for lon in all_lon_tiles:
            lat_tag, lon_tag = get_lat_lon_tags_from_bounding_box(lat, lon)
            fn = f'{lat_tag}_{lon_tag}'
            if fn in available_tiles:
                out.append(fn)

    return out
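For example, a bounding box roughly covering the continental US would resolve to the land tiles intersecting it (the output below is illustrative, not actual data):

tiles = find_tiles_for_bounding_box(min_lat=25, max_lat=49,
                                    min_lon=-125, max_lon=-67)
# e.g. ['30N_120W', '40N_120W', ...] -- ocean-only tiles are filtered out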
def open_glah01_data():
    fs = GCSFileSystem(cache_timeout=0)
    uris = [
        f'gs://{f}'
        for f in fs.ls('gs://carbonplan-climatetrace/intermediates/glah01/')
        if not f.endswith('/')
    ]
    ds_list = [open_zarr_file(uri) for uri in uris]
    ds = xr.concat(ds_list, dim='record_index').chunk({'record_index': 2000})

    # Drop the stored chunk encoding so the dask chunking above takes effect
    # instead of the chunk sizes recorded in the source zarr stores
    for k in ds:
        _ = ds[k].encoding.pop('chunks', None)

    return ds
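A quick usage sketch for the loader above:

ds = open_glah01_data()
print(ds.dims)    # includes 'record_index'
print(ds.chunks)  # dask chunking: 2000 records per chunk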
class GCSFS(Operations):

    def __init__(self, path='.', gcs=None, **fsargs):
        if gcs is None:
            self.gcs = GCSFileSystem(**fsargs)
        else:
            self.gcs = gcs
        self.cache = {}
        self.counter = 0
        self.root = path

    def getattr(self, path, fh=None):
        try:
            info = self.gcs.info(''.join([self.root, path]))
        except FileNotFoundError:
            raise FuseOSError(ENOENT)
        data = {'st_uid': 1000, 'st_gid': 1000}
        perm = 0o777

        if info['storageClass'] == 'DIRECTORY' or 'bucket' in info['kind']:
            data['st_atime'] = 0
            data['st_ctime'] = 0
            data['st_mtime'] = 0
            data['st_mode'] = (stat.S_IFDIR | perm)
            data['st_size'] = 0
            data['st_blksize'] = 0
        else:
            data['st_atime'] = str_to_time(info['timeStorageClassUpdated'])
            data['st_ctime'] = str_to_time(info['timeCreated'])
            data['st_mtime'] = str_to_time(info['updated'])
            data['st_mode'] = (stat.S_IFREG | perm)
            data['st_size'] = info['size']
            data['st_blksize'] = 5 * 2**20
        data['st_nlink'] = 1
        return data

    def readdir(self, path, fh):
        path = ''.join([self.root, path])
        files = self.gcs.ls(path)
        files = [f.rstrip('/').rsplit('/', 1)[1] for f in files]
        return ['.', '..'] + files

    def mkdir(self, path, mode):
        bucket, key = core.split_path(path)
        if not self.gcs.info(path):
            self.gcs.dirs[bucket].append({
                'bucket': bucket, 'kind': 'storage#object', 'size': 0,
                'storageClass': 'DIRECTORY',
                'name': path.rstrip('/') + '/'})

    def rmdir(self, path):
        info = self.gcs.info(path)
        if info['storageClass'] == 'DIRECTORY':
            self.gcs.rm(path, False)

    def read(self, path, size, offset, fh):
        print('read', path, size, offset, fh)
        # the cache is keyed by file handle, populated in open/create
        f = self.cache[fh]
        f.seek(offset)
        out = f.read(size)
        return out

    def write(self, path, data, offset, fh):
        print('write', path, offset, fh)
        f = self.cache[fh]
        f.write(data)
        return len(data)

    def create(self, path, flags):
        print('create', path, oct(flags))
        fn = ''.join([self.root, path])
        self.gcs.touch(fn)  # this makes sure directory entry exists - wasteful!
        # write (but ignore creation flags)
        f = self.gcs.open(fn, 'wb')
        self.cache[self.counter] = f
        self.counter += 1
        return self.counter - 1

    def open(self, path, flags):
        print('open', path, oct(flags))
        fn = ''.join([self.root, path])
        if flags % 2 == 0:
            # read
            f = self.gcs.open(fn, 'rb')
        else:
            # write (but ignore creation flags)
            f = self.gcs.open(fn, 'wb')
        self.cache[self.counter] = f
        self.counter += 1
        return self.counter - 1

    def truncate(self, path, length, fh=None):
        print('truncate', path, length, fh)
        fn = ''.join([self.root, path])
        if length != 0:
            raise NotImplementedError
        # maybe should be no-op since open with write sets size to zero anyway
        self.gcs.touch(fn)

    def unlink(self, path):
        print('delete', path)
        fn = ''.join([self.root, path])
        try:
            self.gcs.rm(fn, False)
        except (IOError, FileNotFoundError):
            raise FuseOSError(EIO)

    def release(self, path, fh):
        print('close', path, fh)
        try:
            f = self.cache[fh]
            f.close()
            self.cache.pop(fh, None)  # should release any cache memory
        except Exception as e:
            print(e)
        return 0

    def chmod(self, path, mode):
        raise NotImplementedError
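Since the class subclasses fusepy's `Operations`, mounting it is a one-liner. A minimal sketch, assuming fusepy is installed, the mountpoint directory exists, and 'mybucket' stands in for a real bucket:

from fuse import FUSE

# Mount gs://mybucket at /mnt/gcs; run in the foreground for easy debugging
FUSE(GCSFS('mybucket/'), '/mnt/gcs', nothreads=True, foreground=True)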
class GCSFS(Operations):

    def __init__(self, path='.', gcs=None, nfiles=10, **fsargs):
        if gcs is None:
            # minimum block size: still read on 5MB boundaries.
            self.gcs = GCSFileSystem(block_size=30 * 2 ** 20,
                                     cache_timeout=6000, **fsargs)
        else:
            self.gcs = gcs
        self.cache = SmallChunkCacher(self.gcs, nfiles=nfiles)
        self.write_cache = {}
        self.counter = 0
        self.root = path

    @_tracemethod
    def getattr(self, path, fh=None):
        path = ''.join([self.root, path])
        try:
            info = self.gcs.info(path)
        except FileNotFoundError:
            parent = path.rsplit('/', 1)[0]
            if path in self.gcs.ls(parent):
                info = True
            else:
                raise FuseOSError(ENOENT)
        data = {'st_uid': 1000, 'st_gid': 1000}
        perm = 0o777

        if (info is True or info['storageClass'] == 'DIRECTORY'
                or 'bucket' in info['kind']):
            data['st_atime'] = 0
            data['st_ctime'] = 0
            data['st_mtime'] = 0
            data['st_mode'] = (stat.S_IFDIR | perm)
            data['st_size'] = 0
            data['st_blksize'] = 0
        else:
            data['st_atime'] = str_to_time(info['timeStorageClassUpdated'])
            data['st_ctime'] = str_to_time(info['timeCreated'])
            data['st_mtime'] = str_to_time(info['updated'])
            data['st_mode'] = (stat.S_IFREG | perm)
            data['st_size'] = info['size']
            data['st_blksize'] = 5 * 2**20
        data['st_nlink'] = 1
        return data

    @_tracemethod
    def readdir(self, path, fh):
        path = ''.join([self.root, path])
        logger.info("List {}, {}".format(path, fh))
        files = self.gcs.ls(path)
        files = [os.path.basename(f.rstrip('/')) for f in files]
        return ['.', '..'] + files

    @_tracemethod
    def mkdir(self, path, mode):
        path = ''.join([self.root, path])
        logger.info("Mkdir {}".format(path))
        parent, name = path.rsplit('/', 1)
        prefixes = self.gcs._listing_cache[parent + '/'][1]['prefixes']
        if name not in prefixes:
            prefixes.append(name)
        return 0

    @_tracemethod
    def rmdir(self, path):
        info = self.gcs.info(path)
        if info['storageClass'] == 'DIRECTORY':
            self.gcs.rm(path, False)

    @_tracemethod
    def read(self, path, size, offset, fh):
        fn = ''.join([self.root, path])
        logger.info('read #{} ({}) offset: {}, size: {}'.format(
            fh, fn, offset, size))
        out = self.cache.read(fn, offset, size)
        return out

    @_tracemethod
    def write(self, path, data, offset, fh):
        fn = ''.join([self.root, path])
        logger.info('write #{} ({}) offset: {}'.format(fh, fn, offset))
        f = self.write_cache[fh]
        f.write(data)
        return len(data)

    @_tracemethod
    def create(self, path, flags):
        fn = ''.join([self.root, path])
        logger.info('create {} {}'.format(fn, oct(flags)))
        self.gcs.touch(fn)  # this makes sure directory entry exists - wasteful!
        # write (but ignore creation flags)
        f = self.gcs.open(fn, 'wb')
        self.write_cache[self.counter] = f
        logger.info('-> fh #{}'.format(self.counter))
        self.counter += 1
        return self.counter - 1

    @_tracemethod
    def open(self, path, flags):
        fn = ''.join([self.root, path])
        logger.info('open {} {}'.format(fn, oct(flags)))
        if flags % 2 == 0:
            # read
            self.cache.open(fn)
        else:
            # write (but ignore creation flags)
            f = self.gcs.open(fn, 'wb')
            self.write_cache[self.counter] = f
        logger.info('-> fh #{}'.format(self.counter))
        self.counter += 1
        return self.counter - 1

    @_tracemethod
    def truncate(self, path, length, fh=None):
        fn = ''.join([self.root, path])
        logger.info('truncate #{} ({}) to {}'.format(fh, fn, length))
        if length != 0:
            raise NotImplementedError
        # maybe should be no-op since open with write sets size to zero anyway
        self.gcs.touch(fn)

    @_tracemethod
    def unlink(self, path):
        fn = ''.join([self.root, path])
        logger.info('delete {}'.format(fn))
        try:
            self.gcs.rm(fn, False)
        except (IOError, FileNotFoundError):
            raise FuseOSError(EIO)

    @_tracemethod
    def release(self, path, fh):
        fn = ''.join([self.root, path])
        logger.info('close #{} ({})'.format(fh, fn))
        try:
            if fh in self.write_cache:
                # write mode
                f = self.write_cache[fh]
                f.close()
                self.write_cache.pop(fh, None)
        except Exception as e:
            logger.exception("exception on release:" + str(e))
        return 0

    @_tracemethod
    def chmod(self, path, mode):
        raise NotImplementedError
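The `_tracemethod` decorator used throughout this class isn't defined in the excerpt. A minimal sketch of what such a tracing decorator could look like (an assumption for illustration, not gcsfs's actual implementation):

import functools
import logging

logger = logging.getLogger('gcsfs.fuse')


def _tracemethod(f):
    # Hypothetical sketch: log entry into each FUSE callback at DEBUG level,
    # skipping `self` when reporting positional arguments.
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        logger.debug('%s(args=%s, kwargs=%s)', f.__name__, args[1:], kwargs)
        return f(*args, **kwargs)
    return wrapper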
# auto-generate some GCS metrics
from gcsfs import GCSFileSystem

fs = GCSFileSystem('pangeo-181919')


# https://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size
def sizeof_fmt(num, suffix='B'):
    for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)


# get disk usage of each folder in gs://pangeo-data
with open('du-pangeo-data.csv', 'w') as f:
    f.write('directory, size, nbytes\n')
    print('directory, size, nbytes')
    for folder in fs.ls('pangeo-data'):
        nbytes = fs.du(folder)
        f.write(f'{folder}, {sizeof_fmt(nbytes)}, {nbytes}\n')
        print(f'{folder}, {sizeof_fmt(nbytes)}, {nbytes}')

# upload CSV to gs://pangeo-data
fs.put('du-pangeo-data.csv', 'pangeo-data/du-pangeo-data.csv')
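A quick sanity check of sizeof_fmt on a few hand-picked values:

assert sizeof_fmt(0) == '0.0B'
assert sizeof_fmt(1536) == '1.5KiB'
assert sizeof_fmt(5 * 2 ** 20) == '5.0MiB'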