import json
import logging
from pathlib import Path

from gcsfs import GCSFileSystem

log = logging.getLogger(__name__)


def _clean_cdp_filestore(google_creds_path: Path) -> None:
    # Connect to the filestore
    fs = GCSFileSystem(token=str(google_creds_path))

    # Open the credentials file to get the project id
    with open(google_creds_path, "r") as open_resource:
        creds = json.load(open_resource)
        project_id = creds["project_id"]

    # Remove all files in the bucket
    bucket = f"{project_id}.appspot.com"
    log.info(f"Cleaning bucket: {bucket}")
    try:
        fs.rm(f"{bucket}/*")
    # Handle empty bucket
    except FileNotFoundError:
        pass

    log.info("Filestore cleaning complete")
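
# Usage sketch for _clean_cdp_filestore (the key path below is hypothetical):
# point it at a GCP service-account key file and it wipes the project's
# default App Engine bucket.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    _clean_cdp_filestore(Path("keys/cdp-service-account.json"))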
import stat
from errno import EIO, ENOENT

import pandas as pd
from fuse import FuseOSError, Operations

from gcsfs import GCSFileSystem, core


def str_to_time(s):
    # Assumed helper (not in the original snippet): convert a GCS ISO
    # timestamp string to epoch seconds.
    t = pd.to_datetime(s)
    return t.to_datetime64().view('int64') / 1e9


class GCSFS(Operations):

    def __init__(self, path='.', gcs=None, **fsargs):
        if gcs is None:
            self.gcs = GCSFileSystem(**fsargs)
        else:
            self.gcs = gcs
        self.cache = {}
        self.counter = 0
        self.root = path

    def getattr(self, path, fh=None):
        try:
            info = self.gcs.info(''.join([self.root, path]))
        except FileNotFoundError:
            raise FuseOSError(ENOENT)
        data = {'st_uid': 1000, 'st_gid': 1000}
        perm = 0o777

        if info['storageClass'] == 'DIRECTORY' or 'bucket' in info['kind']:
            data['st_atime'] = 0
            data['st_ctime'] = 0
            data['st_mtime'] = 0
            data['st_mode'] = (stat.S_IFDIR | perm)
            data['st_size'] = 0
            data['st_blksize'] = 0
        else:
            data['st_atime'] = str_to_time(info['timeStorageClassUpdated'])
            data['st_ctime'] = str_to_time(info['timeCreated'])
            data['st_mtime'] = str_to_time(info['updated'])
            data['st_mode'] = (stat.S_IFREG | perm)
            data['st_size'] = info['size']
            data['st_blksize'] = 5 * 2**20
        data['st_nlink'] = 1
        return data

    def readdir(self, path, fh):
        path = ''.join([self.root, path])
        files = self.gcs.ls(path)
        files = [f.rstrip('/').rsplit('/', 1)[1] for f in files]
        return ['.', '..'] + files

    def mkdir(self, path, mode):
        bucket, key = core.split_path(path)
        if not self.gcs.info(path):
            self.gcs.dirs[bucket].append({
                'bucket': bucket,
                'kind': 'storage#object',
                'size': 0,
                'storageClass': 'DIRECTORY',
                'name': path.rstrip('/') + '/'
            })

    def rmdir(self, path):
        info = self.gcs.info(path)
        if info['storageClass'] == 'DIRECTORY':
            self.gcs.rm(path, False)

    def read(self, path, size, offset, fh):
        print('read', path, size, offset, fh)
        # file objects are cached under their handle (counter), not the path
        f = self.cache[fh]
        f.seek(offset)
        out = f.read(size)
        return out

    def write(self, path, data, offset, fh):
        print('write', path, offset, fh)
        f = self.cache[fh]
        f.write(data)
        return len(data)

    def create(self, path, flags):
        print('create', path, oct(flags))
        fn = ''.join([self.root, path])
        self.gcs.touch(fn)  # this makes sure directory entry exists - wasteful!
        # write (but ignore creation flags)
        f = self.gcs.open(fn, 'wb')
        self.cache[self.counter] = f
        self.counter += 1
        return self.counter - 1

    def open(self, path, flags):
        print('open', path, oct(flags))
        fn = ''.join([self.root, path])
        if flags % 2 == 0:
            # read
            f = self.gcs.open(fn, 'rb')
        else:
            # write (but ignore creation flags)
            f = self.gcs.open(fn, 'wb')
        self.cache[self.counter] = f
        self.counter += 1
        return self.counter - 1

    def truncate(self, path, length, fh=None):
        print('truncate', path, length, fh)
        fn = ''.join([self.root, path])
        if length != 0:
            raise NotImplementedError
        # maybe should be no-op since open with write sets size to zero anyway
        self.gcs.touch(fn)

    def unlink(self, path):
        print('delete', path)
        fn = ''.join([self.root, path])
        try:
            self.gcs.rm(fn, False)
        except (IOError, FileNotFoundError):
            raise FuseOSError(EIO)

    def release(self, path, fh):
        print('close', path, fh)
        try:
            f = self.cache[fh]
            f.close()
            self.cache.pop(fh, None)  # should release any cache memory
        except Exception as e:
            print(e)
        return 0

    def chmod(self, path, mode):
        raise NotImplementedError
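
# A minimal mount sketch for the class above, using fusepy's FUSE driver
# (the bucket name and mountpoint are hypothetical). foreground=True keeps
# the process attached so interrupting it unmounts cleanly.
from fuse import FUSE

if __name__ == "__main__":
    FUSE(GCSFS('my-bucket'), '/mnt/gcs', nothreads=True, foreground=True)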
import logging
import os
import stat
from errno import EIO, ENOENT

from fuse import FuseOSError, Operations

from gcsfs import GCSFileSystem

# SmallChunkCacher, _tracemethod and str_to_time are assumed to be defined
# elsewhere in this module.

logger = logging.getLogger(__name__)


class GCSFS(Operations):

    def __init__(self, path='.', gcs=None, nfiles=10, **fsargs):
        if gcs is None:
            # minimum block size: still read on 5MB boundaries.
            self.gcs = GCSFileSystem(block_size=30 * 2 ** 20,
                                     cache_timeout=6000, **fsargs)
        else:
            self.gcs = gcs
        self.cache = SmallChunkCacher(self.gcs, nfiles=nfiles)
        self.write_cache = {}
        self.counter = 0
        self.root = path

    @_tracemethod
    def getattr(self, path, fh=None):
        path = ''.join([self.root, path])
        try:
            info = self.gcs.info(path)
        except FileNotFoundError:
            parent = path.rsplit('/', 1)[0]
            if path in self.gcs.ls(parent):
                info = True
            else:
                raise FuseOSError(ENOENT)
        data = {'st_uid': 1000, 'st_gid': 1000}
        perm = 0o777

        if (info is True or info['storageClass'] == 'DIRECTORY'
                or 'bucket' in info['kind']):
            data['st_atime'] = 0
            data['st_ctime'] = 0
            data['st_mtime'] = 0
            data['st_mode'] = (stat.S_IFDIR | perm)
            data['st_size'] = 0
            data['st_blksize'] = 0
        else:
            data['st_atime'] = str_to_time(info['timeStorageClassUpdated'])
            data['st_ctime'] = str_to_time(info['timeCreated'])
            data['st_mtime'] = str_to_time(info['updated'])
            data['st_mode'] = (stat.S_IFREG | perm)
            data['st_size'] = info['size']
            data['st_blksize'] = 5 * 2**20
        data['st_nlink'] = 1
        return data

    @_tracemethod
    def readdir(self, path, fh):
        path = ''.join([self.root, path])
        logger.info("List {}, {}".format(path, fh))
        files = self.gcs.ls(path)
        files = [os.path.basename(f.rstrip('/')) for f in files]
        return ['.', '..'] + files

    @_tracemethod
    def mkdir(self, path, mode):
        path = ''.join([self.root, path])
        logger.info("Mkdir {}".format(path))
        parent, name = path.rsplit('/', 1)
        prefixes = self.gcs._listing_cache[parent + '/'][1]['prefixes']
        if name not in prefixes:
            prefixes.append(name)
        return 0

    @_tracemethod
    def rmdir(self, path):
        info = self.gcs.info(path)
        if info['storageClass'] == 'DIRECTORY':
            self.gcs.rm(path, False)

    @_tracemethod
    def read(self, path, size, offset, fh):
        fn = ''.join([self.root, path])
        logger.info('read #{} ({}) offset: {}, size: {}'.format(
            fh, fn, offset, size))
        out = self.cache.read(fn, offset, size)
        return out

    @_tracemethod
    def write(self, path, data, offset, fh):
        fn = ''.join([self.root, path])
        logger.info('write #{} ({}) offset: {}'.format(fh, fn, offset))
        f = self.write_cache[fh]
        f.write(data)
        return len(data)

    @_tracemethod
    def create(self, path, flags):
        fn = ''.join([self.root, path])
        logger.info('create {} {}'.format(fn, oct(flags)))
        self.gcs.touch(fn)  # this makes sure directory entry exists - wasteful!
        # write (but ignore creation flags)
        f = self.gcs.open(fn, 'wb')
        self.write_cache[self.counter] = f
        logger.info('-> fh #{}'.format(self.counter))
        self.counter += 1
        return self.counter - 1

    @_tracemethod
    def open(self, path, flags):
        fn = ''.join([self.root, path])
        logger.info('open {} {}'.format(fn, oct(flags)))
        if flags % 2 == 0:
            # read
            self.cache.open(fn)
        else:
            # write (but ignore creation flags)
            f = self.gcs.open(fn, 'wb')
            self.write_cache[self.counter] = f
        logger.info('-> fh #{}'.format(self.counter))
        self.counter += 1
        return self.counter - 1

    @_tracemethod
    def truncate(self, path, length, fh=None):
        fn = ''.join([self.root, path])
        logger.info('truncate #{} ({}) to {}'.format(fh, fn, length))
        if length != 0:
            raise NotImplementedError
        # maybe should be no-op since open with write sets size to zero anyway
        self.gcs.touch(fn)

    @_tracemethod
    def unlink(self, path):
        fn = ''.join([self.root, path])
        logger.info('delete {}'.format(fn))
        try:
            self.gcs.rm(fn, False)
        except (IOError, FileNotFoundError):
            raise FuseOSError(EIO)

    @_tracemethod
    def release(self, path, fh):
        fn = ''.join([self.root, path])
        logger.info('close #{} ({})'.format(fh, fn))
        try:
            if fh in self.write_cache:
                # write mode
                f = self.write_cache[fh]
                f.close()
                self.write_cache.pop(fh, None)
        except Exception as e:
            logger.exception("exception on release:" + str(e))
        return 0

    @_tracemethod
    def chmod(self, path, mode):
        raise NotImplementedError
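
# The class above relies on only two methods of SmallChunkCacher: open(fn)
# and read(fn, offset, size). A minimal stand-in consistent with that usage
# (not the real implementation) could look like this; the real cacher also
# keeps small chunks in memory to avoid repeated range requests.
class SmallChunkCacher:

    def __init__(self, gcs, nfiles=10):
        self.gcs = gcs
        self.nfiles = nfiles
        self.files = {}

    def open(self, fn):
        # keep at most nfiles file objects open, evicting one when full
        if fn not in self.files:
            if len(self.files) >= self.nfiles:
                _, old = self.files.popitem()
                old.close()
            self.files[fn] = self.gcs.open(fn, 'rb')

    def read(self, fn, offset, size):
        self.open(fn)
        f = self.files[fn]
        f.seek(offset)
        return f.read(size)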
import shutil
import uuid
from unittest import TestCase

import pyarrow.dataset as ds
import pyspark
from gcsfs import GCSFileSystem
from pandas.testing import assert_frame_equal
from pyspark.sql.functions import col, rand, when

# DeltaTable here is the delta-lake-reader implementation, which accepts a
# file_system argument.
from deltalake import DeltaTable

# GCP_PROJECT_ID and GCP_BUCKET are assumed to be test configuration
# constants defined elsewhere.


class DeltaReaderAppendTest(TestCase):

    @classmethod
    def setUpClass(cls):
        cls.path = f"tests/{str(uuid.uuid4())}/table1"
        cls.spark = (
            pyspark.sql.SparkSession.builder.appName("deltalake")
            .config("spark.jars.packages", "io.delta:delta-core_2.12:0.7.0")
            .config("spark.sql.extensions",
                    "io.delta.sql.DeltaSparkSessionExtension")
            .config("spark.sql.catalog.spark_catalog",
                    "org.apache.spark.sql.delta.catalog.DeltaCatalog")
            .getOrCreate()
        )
        df = (
            cls.spark.range(0, 1000)
            .withColumn("number", rand())
            .withColumn("number2", when(col("id") < 500, 0).otherwise(1))
        )
        # 12 appends -> versions 0..11, with a checkpoint at version 10
        for i in range(12):
            df.write.partitionBy("number2").format("delta").mode(
                "append").save(cls.path)

        cls.fs = GCSFileSystem(project=GCP_PROJECT_ID)
        cls.fs.upload(cls.path, f"{GCP_BUCKET}/{cls.path}", recursive=True)

        cls.table = DeltaTable(f"{GCP_BUCKET}/{cls.path}", file_system=cls.fs)

    @classmethod
    def tearDownClass(cls):
        # remove folder when we are done with the test
        cls.fs.rm(f"{GCP_BUCKET}/{cls.path}", recursive=True)
        shutil.rmtree(cls.path)

    def test_paths(self):
        assert self.table.path == f"{GCP_BUCKET}/{self.path}"
        assert self.table.log_path == f"{GCP_BUCKET}/{self.path}/_delta_log"

    def test_versions(self):
        assert self.table.checkpoint == 10
        assert self.table.version == 11

    def test_data(self):
        # read the parquet files using pandas
        df_pandas = self.table.to_pandas()
        # read the table using spark
        df_spark = self.spark.read.format("delta").load(self.path).toPandas()

        # compare dataframes. The index may not be in the same order, so we ignore it
        assert_frame_equal(
            df_pandas.sort_values("id").reset_index(drop=True),
            df_spark.sort_values("id").reset_index(drop=True),
        )

    def test_version(self):
        # read the parquet files using pandas
        df_pandas = self.table.as_version(5, inplace=False).to_pandas()
        # read the table using spark
        df_spark = (self.spark.read.format("delta").option(
            "versionAsOf", 5).load(self.path).toPandas())

        # compare dataframes. The index may not be in the same order, so we ignore it
        assert_frame_equal(
            df_pandas.sort_values("id").reset_index(drop=True),
            df_spark.sort_values("id").reset_index(drop=True),
        )

    def test_partitioning(self):
        # partition pruning on number2 should halve the 12000 rows
        assert self.table.to_table(
            filter=ds.field("number2") == 0).num_rows == 6000

    def test_predicate_pushdown(self):
        # number is random in [0, 1), so the filter should return fewer
        # than the full 12000 rows
        assert self.table.to_table(
            filter=ds.field("number") < 0.5).num_rows < 12000

    def test_column_pruning(self):
        t = self.table.to_table(columns=["number", "number2"])
        assert t.column_names == ["number", "number2"]
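
# A minimal usage sketch of the DeltaTable API exercised by the tests above
# (the table URI is hypothetical): read the latest version to pandas, then
# time-travel to an earlier version with column and partition pruning.
fs = GCSFileSystem(project=GCP_PROJECT_ID)
table = DeltaTable(f"{GCP_BUCKET}/tests/example/table1", file_system=fs)

latest_df = table.to_pandas()
old_snapshot = table.as_version(5, inplace=False).to_table(
    columns=["id", "number"], filter=ds.field("number2") == 0)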